Checks evaluator plugin returns multiple scores. #1370

Open · wants to merge 5 commits into main
1 change: 1 addition & 0 deletions .gitignore
```diff
@@ -20,6 +20,7 @@ js/testapps/firebase-functions-sample1/.firebase
 js/testapps/firebase-functions-sample1/.firebaserc
 js/testapps/firebase-functions-sample1/public/bundle.js
 js/testapps/firebase-functions-sample1/public/config.js
+.genkit
 js/**/.genkit
 samples/**/.genkit
 go/**/.genkit
```
24 changes: 24 additions & 0 deletions js/plugins/checks/README.md
@@ -81,8 +81,32 @@ Create a JSON file with the data you want to test. Add as many test cases as you
### Run the evaluators

```bash
# Run all configured classifiers.
genkit eval:run test-dataset.json --evaluators=checks/all_metrics

# Run just the DANGEROUS_CONTENT classifier.
genkit eval:run test-dataset.json --evaluators=checks/dangerous_content

# Run just the HARASSMENT classifier.
genkit eval:run test-dataset.json --evaluators=checks/harassment

# Run just the HATE_SPEECH classifier.
genkit eval:run test-dataset.json --evaluators=checks/hate_speech

# Run just the MEDICAL_INFO classifier.
genkit eval:run test-dataset.json --evaluators=checks/medical_info

# Run just the OBSCENITY_AND_PROFANITY classifier.
genkit eval:run test-dataset.json --evaluators=checks/obscenity_and_profanity

# Run just the PII_SOLICITING_RECITING classifier.
genkit eval:run test-dataset.json --evaluators=checks/pii_soliciting_reciting

# Run just the SEXUALLY_EXPLICIT classifier.
genkit eval:run test-dataset.json --evaluators=checks/sexually_explicit

# Run just the VIOLENCE_AND_GORE classifier.
genkit eval:run test-dataset.json --evaluators=checks/violence_and_gore
```
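
For reference, a minimal `test-dataset.json` could look like the sketch below. The `output` and `testCaseId` fields match what the evaluator reads from each `BaseEvalDataPoint` in `evaluation.ts`; the exact top-level schema is an assumption, so check the Genkit eval docs for the authoritative format.

```json
[
  {
    "testCaseId": "harmless_example",
    "input": "Summarize the plot of a cozy mystery novel.",
    "output": "A librarian in a small town solves the case of a missing manuscript."
  },
  {
    "testCaseId": "risky_example",
    "input": "Describe a bar fight scene.",
    "output": "The stranger swung first, and chairs splintered as the brawl spread."
  }
]
```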

50 changes: 34 additions & 16 deletions js/plugins/checks/src/evaluation.ts
```diff
@@ -77,10 +77,22 @@ export function checksEvaluators(
     }
   );
 
+  // Individual evaluators, one per configured metric.
   const evaluators = policy_configs.map((policy_config) => {
-    return createPolicyEvaluator(projectId, auth, ai, policy_config);
+    return createPolicyEvaluator(
+      projectId,
+      auth,
+      ai,
+      [policy_config],
+      policy_config.type as string
+    );
   });
 
+  // Single evaluator instance with all configured policies.
+  evaluators.push(
+    createPolicyEvaluator(projectId, auth, ai, policy_configs, 'all_metrics')
+  );
+
   return evaluators;
 }
```
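
Taken together, `checksEvaluators` now returns one single-policy evaluator per configured metric plus a combined `checks/all_metrics` evaluator. A quick sketch of the resulting registrations, assuming two configured policies (the config values are illustrative, not from this PR):

```ts
// Sketch: two policies configured for the Checks plugin (values illustrative).
const policy_configs = [
  { type: 'DANGEROUS_CONTENT', threshold: 0.5 },
  { type: 'HARASSMENT', threshold: 0.5 },
];

// With the change above, checksEvaluators registers three evaluators:
//   checks/dangerous_content  -> scores DANGEROUS_CONTENT only
//   checks/harassment         -> scores HARASSMENT only
//   checks/all_metrics        -> scores both policies in a single call
```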

```diff
@@ -104,15 +116,14 @@ function createPolicyEvaluator(
   projectId: string,
   auth: GoogleAuth,
   ai: Genkit,
-  policy_config: ChecksEvaluationMetricConfig
+  policy_config: ChecksEvaluationMetricConfig[],
+  name: string
 ): EvaluatorAction {
-  const policyType = policy_config.type as string;
-
   return ai.defineEvaluator(
     {
-      name: `checks/${policyType.toLowerCase()}`,
-      displayName: policyType,
-      definition: `Evaluates text against the Checks ${policyType} policy.`,
+      name: `checks/${name.toLowerCase()}`,
+      displayName: name,
+      definition: `Evaluates text against the Checks ${name} policy.`,
     },
     async (datapoint: BaseEvalDataPoint) => {
       const partialRequest = {
```
```diff
@@ -121,10 +132,12 @@
             content: datapoint.output as string,
           },
         },
-        policies: {
-          policy_type: policy_config.type,
-          threshold: policy_config.threshold,
-        },
+        policies: policy_config.map((config) => {
+          return {
+            policy_type: config.type,
+            threshold: config.threshold,
+          };
+        }),
       };
 
       const response = await checksEvalInstance(
```
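
Because `policies` is now built with `map`, the request carries one `{ policy_type, threshold }` entry per configured policy instead of a single object. For the two-policy example above, the assembled `partialRequest` would look roughly like this (the literal content string is illustrative):

```ts
// Shape of partialRequest for two configured policies (illustrative values).
const partialRequest = {
  input: {
    text_input: {
      content: 'Example model output under evaluation.',
    },
  },
  policies: [
    { policy_type: 'DANGEROUS_CONTENT', threshold: 0.5 },
    { policy_type: 'HARASSMENT', threshold: 0.5 },
  ],
};
```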
```diff
@@ -134,13 +147,18 @@
         ResponseSchema
       );
 
-      return {
-        evaluation: {
-          score: response.policyResults[0].score,
+      const evaluationResults = response.policyResults.map((result) => {
+        return {
+          id: result.policyType,
+          score: result.score,
           details: {
-            reasoning: response.policyResults[0].violationResult,
+            reasoning: `Status ${result.violationResult}`,
           },
-        },
+        };
+      });
+
+      return {
+        evaluation: evaluationResults,
         testCaseId: datapoint.testCaseId,
       };
     }
```
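
The evaluator now returns one score per policy result rather than only `policyResults[0]`. A sketch of the returned shape for a two-policy run (scores and status strings are invented for illustration):

```ts
// Illustrative result for a two-policy evaluation (values made up).
const result = {
  evaluation: [
    {
      id: 'DANGEROUS_CONTENT',
      score: 0.12,
      details: { reasoning: 'Status NON_VIOLATIVE' },
    },
    {
      id: 'HARASSMENT',
      score: 0.87,
      details: { reasoning: 'Status VIOLATIVE' },
    },
  ],
  testCaseId: 'risky_example',
};
```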