Commit 1f1f67e

jsondai authored and copybara-github committed
feat: GenAI Client(evals) - Add pass_rate to AggregatedMetricResult and calculate it for adaptive rubric metrics.
PiperOrigin-RevId: 831546537
1 parent abf2d23 commit 1f1f67e

3 files changed: +23 -2 lines changed

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ def test_evaluation_byor(client):
     assert isinstance(summary, types.AggregatedMetricResult)
     assert summary.metric_name is not None
     assert summary.mean_score is not None
+    assert summary.pass_rate is not None

     assert evaluation_result.eval_case_results is not None
     assert len(evaluation_result.eval_case_results) > 0
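The new assertion only checks that pass_rate is populated; the rule it relies on is defined by the handler change below: a case passes only when its score is exactly 1.0. A minimal sketch of that rule, using hypothetical per-case scores (not taken from the replay data):

# Hypothetical per-case scores; a case passes only when its score is exactly 1.0.
scores = [1.0, 0.5, 1.0, 0.0]
num_valid = len(scores)
num_passing = sum(1 for s in scores if s == 1.0)
pass_rate = num_passing / num_valid  # 2 / 4 = 0.5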

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 15 additions & 2 deletions
@@ -93,17 +93,22 @@ def _extract_text_from_content(
 def _default_aggregate_scores(
     metric_name: str,
     eval_case_metric_results: list[types.EvalCaseMetricResult],
+    calculate_pass_rate: bool = False,
 ) -> types.AggregatedMetricResult:
     """Default aggregation logic using mean and standard deviation."""
     scores = []
     num_error = 0
     num_valid = 0
+    num_passing = 0

     for result in eval_case_metric_results:
         if result.error_message is None and result.score is not None:
             try:
-                scores.append(float(result.score))
+                score = float(result.score)
+                scores.append(score)
                 num_valid += 1
+                if calculate_pass_rate and score == 1.0:
+                    num_passing += 1
             except (ValueError, TypeError):
                 logger.warning(
                     "Could not convert score '%s' to float for metric '%s' during"
@@ -117,11 +122,16 @@ def _default_aggregate_scores(

     mean_score = None
     stdev_score = None
+    pass_rate = None
+
     if num_valid > 0:
         try:
             mean_score = statistics.mean(scores)
         except statistics.StatisticsError as e:
             logger.warning("Could not calculate mean for %s: %s", metric_name, e)
+        if calculate_pass_rate:
+            pass_rate = num_passing / num_valid
+
     if num_valid > 1:
         try:
             stdev_score = statistics.stdev(scores)
@@ -135,6 +145,7 @@ def _default_aggregate_scores(
         num_cases_error=num_error,
         mean_score=mean_score,
         stdev_score=stdev_score,
+        pass_rate=pass_rate if calculate_pass_rate else None,
     )


@@ -1062,7 +1073,9 @@ def aggregate(
     ) -> types.AggregatedMetricResult:
         """Aggregates the metric results for a predefined metric."""
         logger.debug("Aggregating results for predefined metric: %s", self.metric.name)
-        return _default_aggregate_scores(self.metric.name, eval_case_metric_results)
+        return _default_aggregate_scores(
+            self.metric.name, eval_case_metric_results, calculate_pass_rate=True
+        )


 _METRIC_HANDLER_MAPPING = [
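Taken together, these hunks thread a calculate_pass_rate flag through the default aggregator and enable it for predefined (adaptive rubric) metrics. A self-contained sketch of that aggregation path, with a dataclass standing in for types.EvalCaseMetricResult and the num_error bookkeeping simplified (the real error handling sits outside the hunks shown):

import statistics
from dataclasses import dataclass
from typing import Optional

@dataclass
class EvalCaseMetricResult:
    # Stand-in for types.EvalCaseMetricResult; only the fields used here.
    score: Optional[float] = None
    error_message: Optional[str] = None

def default_aggregate_scores(results, calculate_pass_rate=False):
    scores = []
    num_valid = 0
    num_passing = 0
    for result in results:
        if result.error_message is None and result.score is not None:
            score = float(result.score)
            scores.append(score)
            num_valid += 1
            if calculate_pass_rate and score == 1.0:
                num_passing += 1
    mean_score = statistics.mean(scores) if num_valid > 0 else None
    pass_rate = num_passing / num_valid if calculate_pass_rate and num_valid > 0 else None
    stdev_score = statistics.stdev(scores) if num_valid > 1 else None
    return {"mean_score": mean_score, "stdev_score": stdev_score, "pass_rate": pass_rate}

results = [
    EvalCaseMetricResult(score=1.0),
    EvalCaseMetricResult(score=0.5),
    EvalCaseMetricResult(score=0.0),
    EvalCaseMetricResult(error_message="judge failed"),  # skipped: not a valid case
]
print(default_aggregate_scores(results, calculate_pass_rate=True))
# {'mean_score': 0.5, 'stdev_score': 0.5, 'pass_rate': 0.333...}

Note that pass_rate divides by num_valid, not by the total number of cases, so errored cases lower neither the mean nor the pass rate; they are reported separately via num_cases_error.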

vertexai/_genai/types/common.py

Lines changed: 7 additions & 0 deletions
@@ -1201,6 +1201,10 @@ class AggregatedMetricResult(_common.BaseModel):
     stdev_score: Optional[float] = Field(
         default=None, description="""Standard deviation of the metric."""
     )
+    pass_rate: Optional[float] = Field(
+        default=None,
+        description="""Pass rate of the adaptive rubric metric. Calculated as the number of cases where all criteria passed divided by the total number of valid cases. A case is passing if it has a score of 1.0.""",
+    )

     # Allow extra fields to support custom aggregation stats.
     model_config = ConfigDict(extra="allow")
@@ -1227,6 +1231,9 @@ class AggregatedMetricResultDict(TypedDict, total=False):
     stdev_score: Optional[float]
     """Standard deviation of the metric."""

+    pass_rate: Optional[float]
+    """Pass rate of the adaptive rubric metric. Calculated as the number of cases where all criteria passed divided by the total number of valid cases. A case is passing if it has a score of 1.0."""
+

 AggregatedMetricResultOrDict = Union[AggregatedMetricResult, AggregatedMetricResultDict]

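Because model_config sets extra="allow", the new pass_rate field coexists with any custom aggregation stats a custom aggregator attaches. A small usage sketch; the import path is inferred from the file locations in this commit and the other fields are assumed to default to None as stdev_score does:

from vertexai._genai import types  # assumed import path

summary = types.AggregatedMetricResult(
    mean_score=0.5,
    stdev_score=0.5,
    pass_rate=1 / 3,
    my_custom_stat=0.9,  # hypothetical extra field, accepted via extra="allow"
)
print(summary.pass_rate)  # 0.333...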
