Beta: Firebase Genkit is in Beta, which means that it is not subject to any SLA or deprecation policy and could change in backwards-incompatible ways. Throughout the Beta period, Firebase Genkit and its documentation will be updated and improved.

หน้านี้ได้รับการแปลโดย Cloud Translation API

การเขียน Genkit Evaluator

คุณสามารถขยาย Firebase Genkit เพื่อรองรับการประเมินที่กําหนดเองได้โดยใช้ LLM เป็นตัวตัดสิน หรือการประเมินแบบเป็นโปรแกรม (Heuristic)

คําจํากัดความของผู้ประเมิน

เครื่องมือประเมินคือฟังก์ชันที่ประเมินคำตอบของ LLM การประเมินอัตโนมัติมี 2 วิธีหลัก ได้แก่ การประเมินแบบเฮิวริสติกและการประเมินตาม LLM ในแนวทางเฮิวริสติก คุณจะกําหนดฟังก์ชันแบบกำหนดได้ ในทางตรงกันข้าม การประเมินที่อิงตาม LLM จะส่งเนื้อหากลับไปให้ LLM และขอให้ LLM ให้คะแนนเอาต์พุตตามเกณฑ์ที่กำหนดไว้ในพรอมต์

เมธอด ai.defineEvaluator ที่คุณใช้กําหนดการดําเนินการของ evaluator ใน Genkit รองรับทั้ง 2 แนวทาง เอกสารนี้อธิบายตัวอย่างวิธีใช้วิธีการนี้สําหรับการประเมินแบบเฮิวริสติกและการประเมินตาม LLM

ผู้ประเมินตาม LLM

ผู้ประเมินที่อิงตาม LLM ใช้ประโยชน์จาก LLM เพื่อประเมิน input, context และ output ของฟีเจอร์ Generative AI

เครื่องมือประเมินที่อิงตาม LLM ใน Genkit ประกอบด้วย 3 องค์ประกอบ ได้แก่

พรอมต์
ฟังก์ชันการให้คะแนน
การดำเนินการของผู้ประเมิน

กําหนดพรอมต์

ในตัวอย่างนี้ ผู้ประเมินใช้ประโยชน์จาก LLM เพื่อพิจารณาว่าอาหาร (output) อร่อยหรือไม่ ก่อนอื่น ให้บริบทกับ LLM จากนั้นอธิบายสิ่งที่ต้องการให้ทำ และสุดท้าย ให้ตัวอย่าง 2-3 รายการเพื่อใช้เป็นพื้นฐานในการตอบกลับ

ยูทิลิตี definePrompt ของ Genkit ช่วยให้คุณกำหนดพรอมต์ที่มีการตรวจสอบอินพุตและเอาต์พุตได้อย่างง่ายดาย โค้ดต่อไปนี้เป็นตัวอย่างการสร้างพรอมต์การประเมินด้วย definePrompt

import { z } from "genkit";

// Verdict labels the judge may return; `as const` keeps the literal tuple
// type so the array can be handed to `z.enum` below.
const DELICIOUSNESS_VALUES = ['yes', 'no', 'maybe'] as const;

// Structured reply expected from the judge: a free-text `reason` followed by
// a `verdict` restricted to the labels above.
const DeliciousnessDetectionResponseSchema = z.object({
  reason: z.string(),
  verdict: z.enum(DELICIOUSNESS_VALUES),
});

/**
 * Defines the `deliciousnessPrompt` prompt on the given Genkit instance.
 *
 * The prompt takes a single string field, `responseToTest`, and declares
 * `DeliciousnessDetectionResponseSchema` as its output schema.
 *
 * @param ai - Genkit instance on which `definePrompt` is called.
 * @returns Whatever `ai.definePrompt` returns for this prompt.
 */
function getDeliciousnessPrompt(ai: Genkit) {
  // Input/output contract of the prompt.
  const promptConfig = {
    name: 'deliciousnessPrompt',
    input: {
      schema: z.object({ responseToTest: z.string() }),
    },
    output: {
      schema: DeliciousnessDetectionResponseSchema,
    },
  };

  // Few-shot template; `{{ responseToTest }}` is filled from the prompt input.
  const promptTemplate = `You are a food critic. Assess whether the provided output sounds delicious, giving only "yes" (delicious), "no" (not delicious), or "maybe" (undecided) as the verdict.

    Examples:
    Output: Chicken parm sandwich
    Response: { "reason": "A classic and beloved dish.", "verdict": "yes" }

    Output: Boston Logan Airport tarmac
    Response: { "reason": "Not edible.", "verdict": "no" }

    Output: A juicy piece of gossip
    Response: { "reason": "Metaphorically 'tasty' but not food.", "verdict": "maybe" }

    New Output: {{ responseToTest }}
    Response:
    `;

  return ai.definePrompt(promptConfig, promptTemplate);
}

กำหนดฟังก์ชันการให้คะแนน

กำหนดฟังก์ชันที่รับตัวอย่างซึ่งมี output ตามที่พรอมต์ต้องการ แล้วให้คะแนนผลลัพธ์ เทสเคสของ Genkit มี input เป็นฟิลด์ที่ต้องระบุ โดยมี output และ context เป็นฟิลด์ที่ไม่บังคับ ผู้ประเมินมีหน้าที่ตรวจสอบว่าฟิลด์ที่จำเป็นทั้งหมดสำหรับการประเมินมีข้อมูลอยู่

import { ModelArgument, z } from 'genkit';
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';

/**
 * Score an individual test case for deliciousness.
 *
 * @param judgeLlm - Model passed as `model` when the prompt is invoked.
 * @param dataPoint - Test case to score; its `output` must be present.
 * @param judgeConfig - Optional value forwarded as `config` to the prompt call.
 * @returns A score whose `score` is the judge's verdict and whose
 *   `details.reasoning` is the judge's stated reason.
 * @throws Error when the data point has no output, or when the judge's
 *   response carries no parsed output.
 */
export async function deliciousnessScore<
  CustomModelOptions extends z.ZodTypeAny,
>(
  judgeLlm: ModelArgument<CustomModelOptions>,
  dataPoint: BaseEvalDataPoint,
  judgeConfig?: CustomModelOptions
): Promise<Score> {
  const { output } = dataPoint;
  // Validate the input has required fields
  if (!output) {
    throw new Error('Output is required for Deliciousness detection');
  }

  // The prompt input is a string, but a data point's `output` is not
  // guaranteed to be one: serialize structured outputs instead of asserting
  // a type the value may not have.
  const responseToTest =
    typeof output === 'string' ? output : JSON.stringify(output);

  // Hydrate the prompt and generate an evaluation result.
  // NOTE(review): `ai` is not a parameter of this function; it is assumed to
  // be a Genkit instance visible in the enclosing scope — confirm.
  const deliciousnessPrompt = getDeliciousnessPrompt(ai);
  const response = await deliciousnessPrompt(
    { responseToTest },
    {
      model: judgeLlm,
      config: judgeConfig,
    }
  );

  // Parse the output
  const parsedResponse = response.output;
  if (!parsedResponse) {
    throw new Error(`Unable to parse evaluator response: ${response.text}`);
  }

  // Return a scored response
  return {
    score: parsedResponse.verdict,
    details: { reasoning: parsedResponse.reason },
  };
}

กําหนดการดําเนินการของโปรแกรมประเมิน

ขั้นตอนสุดท้ายคือเขียนฟังก์ชันที่กําหนด EvaluatorAction

import { Genkit, z } from 'genkit';
import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';

/**
 * Create the Deliciousness evaluator action.
 *
 * @param ai - Genkit instance on which the evaluator is defined.
 * @param judge - Model forwarded to `deliciousnessScore` as the judge LLM.
 * @param judgeConfig - Optional config forwarded to `deliciousnessScore`.
 * @returns The value returned by `ai.defineEvaluator`.
 */
export function createDeliciousnessEvaluator<
  ModelCustomOptions extends z.ZodTypeAny,
>(
  ai: Genkit,
  judge: ModelArgument<ModelCustomOptions>,
  judgeConfig?: z.infer<ModelCustomOptions>
): EvaluatorAction {
  return ai.defineEvaluator(
    {
      name: `myCustomEvals/deliciousnessEvaluator`,
      displayName: 'Deliciousness',
      definition: 'Determines if output is considered delicious.',
      // Marked as billed because every score goes through the judge model.
      isBilled: true,
    },
    // Scores one data point and echoes its `testCaseId` back with the result.
    async (datapoint: BaseEvalDataPoint) => {
      const score = await deliciousnessScore(judge, datapoint, judgeConfig);
      return {
        testCaseId: datapoint.testCaseId,
        evaluation: score,
      };
    }
  );
}

เมธอด defineEvaluator คล้ายกับคอนสตรัคเตอร์ Genkit อื่นๆ เช่น defineFlow และ defineRetriever เมธอดนี้กำหนดให้ต้องระบุ EvaluatorFn เป็นคอลแบ็ก เมธอด EvaluatorFn จะยอมรับออบเจ็กต์ BaseEvalDataPoint ซึ่งสอดคล้องกับรายการเดียวในชุดข้อมูลภายใต้การประเมิน พร้อมกับพารามิเตอร์ตัวเลือกที่กำหนดเอง (ไม่บังคับ) หากระบุ ฟังก์ชันจะประมวลผลจุดข้อมูลและแสดงผลออบเจ็กต์ EvalResponse

สคีมา Zod สำหรับ BaseEvalDataPoint และ EvalResponse มีดังนี้

`BaseEvalDataPoint`

/**
 * Zod schema for a single dataset entry handed to an evaluator.
 *
 * `testCaseId` is a required string; `output`, `context`, `reference` and
 * `traceIds` are optional.
 */
export const BaseEvalDataPoint = z.object({
  // Declared exactly once: a second `testCaseId` key later in the literal
  // would override this one and make the id optional.
  testCaseId: z.string(),
  input: z.unknown(),
  output: z.unknown().optional(),
  context: z.array(z.unknown()).optional(),
  reference: z.unknown().optional(),
  traceIds: z.array(z.string()).optional(),
});

// Zod schema for what an evaluator callback returns for one data point.
export const EvalResponse = z.object({
  sampleIndex: z.number().optional(),
  // Required; the evaluators in this document copy it from the data point.
  testCaseId: z.string(),
  traceId: z.string().optional(),
  spanId: z.string().optional(),
  // Either a single score or an array of scores.
  evaluation: z.union([ScoreSchema, z.array(ScoreSchema)]),
});

`ScoreSchema`

// Zod schema for one score produced by an evaluator. Every field is optional.
const ScoreSchema = z.object({
  id: z.string().describe('Optional ID to differentiate multiple scores').optional(),
  // The score value may be numeric, textual, or boolean.
  score: z.union([z.number(), z.string(), z.boolean()]).optional(),
  error: z.string().optional(),
  // `reasoning` is the only named key; `.passthrough()` keeps any additional
  // keys instead of stripping them.
  details: z
    .object({
      reasoning: z.string().optional(),
    })
    .passthrough()
    .optional(),
});

ออบเจ็กต์ defineEvaluator ช่วยให้ผู้ใช้ระบุชื่อ ชื่อที่แสดงที่ผู้ใช้อ่านได้ และคำจำกัดความสำหรับเครื่องมือประเมิน ชื่อที่แสดงและคำจำกัดความจะแสดงพร้อมกับผลการทดสอบใน UI ของนักพัฒนาซอฟต์แวร์ นอกจากนี้ยังมีช่อง isBilled ที่ไม่บังคับซึ่งระบุว่าเครื่องมือประเมินนี้อาจทําให้เกิดการเรียกเก็บเงินหรือไม่ (เช่น ใช้ LLM หรือ API ที่มีการเรียกเก็บเงิน) หากมีการเรียกเก็บเงินจากผู้ประเมิน UI จะแจ้งให้ผู้ใช้ยืนยันใน CLI ก่อนที่จะอนุญาตให้เรียกใช้การประเมิน ขั้นตอนนี้จะช่วยป้องกันค่าใช้จ่ายที่ไม่ตั้งใจ

เครื่องมือประเมินแบบเฮิวริสติก

เครื่องมือประเมินแบบเฮิวริสติกอาจเป็นฟังก์ชันใดก็ได้ที่ใช้ประเมิน input, context หรือ output ของฟีเจอร์ Generative AI

เครื่องมือประเมินแบบเฮิวริสติกใน Genkit ประกอบด้วย 2 ส่วน ได้แก่

ฟังก์ชันการให้คะแนน
การดำเนินการของผู้ประเมิน

กำหนดฟังก์ชันการให้คะแนน

กำหนดฟังก์ชันการให้คะแนนเช่นเดียวกับเครื่องมือประเมินที่อิงตาม LLM ในกรณีนี้ ฟังก์ชันการให้คะแนนไม่จำเป็นต้องมี LLM ของผู้ตัดสิน

import { EvalResponses } from 'genkit';
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';

// Pattern used by `usPhoneRegexScore` to look for a US-style phone number.
const US_PHONE_REGEX =
  /[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4}/i;

/**
 * Scores whether a datapoint output contains a US Phone number.
 *
 * @param dataPoint - Test case whose `output` is matched against the pattern.
 * @returns A boolean score plus a `reasoning` string describing the outcome.
 * @throws Error when `output` is missing, empty, or not a string.
 */
export async function usPhoneRegexScore(
  dataPoint: BaseEvalDataPoint
): Promise<Score> {
  const { output } = dataPoint;

  // Only non-empty strings can be matched.
  if (typeof output !== 'string' || output.length === 0) {
    throw new Error('String output is required for regex matching');
  }

  const found = US_PHONE_REGEX.test(output);
  let reasoning: string;
  if (found) {
    reasoning = `Output matched US_PHONE_REGEX`;
  } else {
    reasoning = `Output did not match US_PHONE_REGEX`;
  }

  return { score: found, details: { reasoning } };
}

กําหนดการดําเนินการของโปรแกรมประเมิน

import { Genkit } from 'genkit';
import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';

/**
 * Configures a regex evaluator to match a US phone number.
 *
 * @param ai - Genkit instance on which the evaluator is defined.
 * @returns The value returned by `ai.defineEvaluator`.
 */
export function createUSPhoneRegexEvaluator(ai: Genkit): EvaluatorAction {
  // Metadata describing the evaluator; not billed.
  const evaluatorInfo = {
    name: `myCustomEvals/usPhoneRegexEvaluator`,
    displayName: "Regex Match for US PHONE NUMBER",
    definition: "Uses Regex to check if output matches a US phone number",
    isBilled: false,
  };

  // Scores one data point and pairs the result with its test case id.
  const evaluate = async (datapoint: BaseEvalDataPoint) => {
    const evaluation = await usPhoneRegexScore(datapoint);
    return { testCaseId: datapoint.testCaseId, evaluation };
  };

  return ai.defineEvaluator(evaluatorInfo, evaluate);
}

การนำข้อมูลทั้งหมดมารวมกัน

คําจํากัดความของปลั๊กอิน

ปลั๊กอินจะลงทะเบียนกับเฟรมเวิร์กโดยการติดตั้งเมื่อเริ่มต้น Genkit หากต้องการกําหนดค่าปลั๊กอินใหม่ ให้ใช้เมธอดตัวช่วย genkitPlugin เพื่อสร้างอินสแตนซ์ของการดำเนินการ Genkit ทั้งหมดภายในบริบทของปลั๊กอิน

ตัวอย่างโค้ดนี้แสดงเครื่องมือประเมิน 2 รายการ ได้แก่ เครื่องมือประเมินความอร่อยที่อิงตาม LLM และเครื่องมือประเมินหมายเลขโทรศัพท์ในสหรัฐอเมริกาที่อิงตามนิพจน์ทั่วไป การสร้างอินสแตนซ์ของเครื่องมือประเมินเหล่านี้ภายในบริบทของปลั๊กอินจะเป็นการลงทะเบียนเครื่องมือประเมินกับปลั๊กอิน

import { GenkitPlugin, genkitPlugin } from 'genkit/plugin';

/**
 * Builds the `myCustomEvals` plugin.
 *
 * @param options - `judge` is the model handed to the deliciousness
 *   evaluator; `judgeConfig` is an optional config forwarded with it.
 * @returns The plugin produced by `genkitPlugin`.
 */
export function myCustomEvals<
  ModelCustomOptions extends z.ZodTypeAny
>(options: {
  judge: ModelArgument<ModelCustomOptions>;
  judgeConfig?: ModelCustomOptions;
}): GenkitPlugin {
  // Initializer invoked with the `ai` object: it instantiates both custom
  // evaluators against that object.
  const registerEvaluators = async (ai: Genkit) => {
    createDeliciousnessEvaluator(ai, options.judge, options.judgeConfig);
    createUSPhoneRegexEvaluator(ai);
  };

  // Define the new plugin
  return genkitPlugin("myCustomEvals", registerEvaluators);
}
export default myCustomEvals;

กำหนดค่า Genkit

เพิ่มปลั๊กอิน myCustomEvals ลงในการกำหนดค่า Genkit

สําหรับการประเมินด้วย Gemini ให้ปิดใช้การตั้งค่าความปลอดภัยเพื่อให้ผู้ประเมินยอมรับ ตรวจจับ และให้คะแนนเนื้อหาที่อาจเป็นอันตรายได้

import { gemini15Pro } from '@genkit-ai/googleai';

// Genkit configuration that registers the custom evaluator plugin.
// The bare `...` lines appear to stand for plugins/options omitted from this
// snippet; they are placeholders, not TypeScript syntax.
const ai = genkit({
  plugins: [
    // NOTE(review): the judge model below is imported from
    // '@genkit-ai/googleai' while the plugin shown here is vertexAI() —
    // confirm the plugin that serves the judge model is registered.
    vertexAI(),
    ...
    // Registers both custom evaluators, using gemini15Pro as the judge.
    myCustomEvals({
      judge: gemini15Pro,
    }),
  ],
  ...
});

การใช้เครื่องมือประเมินที่กำหนดเอง

เมื่อสร้างอินสแตนซ์เครื่องมือประเมินที่กําหนดเองภายในบริบทแอป Genkit (ผ่านปลั๊กอินหรือโดยตรง) เครื่องมือประเมินดังกล่าวก็พร้อมใช้งาน ตัวอย่างต่อไปนี้แสดงวิธีทดสอบเครื่องประเมินความอร่อยด้วยอินพุตและเอาต์พุตตัวอย่าง

1. สร้างไฟล์ JSON `deliciousness_dataset.json` ที่มีเนื้อหาต่อไปนี้

[
  {
    "testCaseId": "delicous_mango",
    "input": "What is a super delicious fruit",
    "output": "A perfectly ripe mango – sweet, juicy, and with a hint of tropical sunshine."
  },
  {
    "testCaseId": "disgusting_soggy_cereal",
    "input": "What is something that is tasty when fresh but less tasty after some time?",
    "output": "Stale, flavorless cereal that's been sitting in the box too long."
  }
]

2. ใช้ Genkit CLI เพื่อเรียกใช้เครื่องมือประเมินกับกรณีทดสอบเหล่านี้

# Start your genkit runtime
genkit start -- <command to start your app>
genkit eval:run deliciousness_dataset.json --evaluators=myCustomEvals/deliciousnessEvaluator

3. ไปที่ `localhost:4000/evaluate` เพื่อดูผลลัพธ์ใน UI ของ Genkit

โปรดทราบว่าความเชื่อมั่นในเครื่องมือประเมินที่กําหนดเองจะเพิ่มขึ้นเมื่อคุณเปรียบเทียบกับชุดข้อมูลหรือแนวทางมาตรฐาน ตรวจสอบผลลัพธ์ของข้อมูลเปรียบเทียบดังกล่าวเพื่อปรับปรุงประสิทธิภาพของผู้ประเมินจนกว่าจะถึงระดับคุณภาพที่กำหนด