Files
life-echo/api/app/features/evaluation/gating_service.py

93 lines
2.7 KiB
Python
Raw Normal View History

"""平台门禁:均分提升 + 受保护 session 无明显退步。"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from app.core.config import settings
from app.features.evaluation.models import EvalCase, EvalRun
@dataclass
class GateResult:
    """Outcome of one gate evaluation comparing baseline and candidate runs.

    Attributes:
        passed: Overall verdict of the gate.
        mean_baseline_composite: Mean composite score of the baseline side
            over the cases that were scored (0.0 when none were scored).
        mean_candidate_composite: Mean composite score of the candidate side
            over the same cases (0.0 when none were scored).
        mean_delta: ``mean_candidate_composite - mean_baseline_composite``.
        protected_regressions: One entry per protected case whose score
            dropped by more than the regression threshold; each entry holds
            ``case_id``, ``title``, ``delta`` and ``threshold``.
        per_case: One entry per scored case holding ``case_id``, ``title``,
            ``baseline_composite``, ``candidate_composite``, ``delta`` and
            ``protected``.
    """

    # Verdict and aggregate scores.
    passed: bool
    mean_baseline_composite: float
    mean_candidate_composite: float
    mean_delta: float

    # Per-case detail backing the verdict.
    protected_regressions: list[dict[str, Any]]
    per_case: list[dict[str, Any]]
def compute_gate(
    *,
    cases: list[EvalCase],
    runs: list[EvalRun],
    regression_threshold: float | None = None,
) -> GateResult:
    """Decide whether a candidate passes the gate against its baseline.

    Only runs whose ``status`` is ``"completed"`` are considered. A case is
    scored when it has both a ``"baseline"`` and a ``"candidate"`` run and
    both carry a non-``None`` ``composite_score``; every other case is
    skipped. If several completed runs share a case and side, the one that
    appears last in ``runs`` is used.

    The gate passes when at least one case was scored, the mean candidate
    composite exceeds the mean baseline composite by more than ``1e-6``, and
    no protected case regressed by more than the threshold.

    Args:
        cases: Case definitions; looked up by ``id`` to obtain ``title`` and
            ``is_protected``. A run whose case is not listed here is still
            scored, as unprotected and with a ``None`` title.
        runs: Evaluation runs for both sides.
        regression_threshold: Largest tolerated score drop on a protected
            case. When ``None``, the value of
            ``settings.eval_gate_protected_regression_threshold`` is used.

    Returns:
        A ``GateResult`` with the verdict, both means, their difference, the
        protected regressions and the per-case rows.
    """
    threshold = regression_threshold
    if threshold is None:
        threshold = settings.eval_gate_protected_regression_threshold

    # Group completed runs as {case_id: {side: run}}; a later run replaces an
    # earlier one recorded for the same case and side.
    runs_by_case: dict[str, dict[str, EvalRun]] = {}
    for run in runs:
        if run.status == "completed":
            case_key = run.case_id
            side_map = runs_by_case.get(case_key)
            if side_map is None:
                side_map = runs_by_case[case_key] = {}
            side_map[run.side] = run

    known_cases = {case.id: case for case in cases}

    def _mean(values: list[float]) -> float:
        # An empty selection averages to 0.0 rather than raising.
        return sum(values) / len(values) if values else 0.0

    rows: list[dict[str, Any]] = []
    regressions: list[dict[str, Any]] = []
    for case_id, side_map in runs_by_case.items():
        baseline_run = side_map.get("baseline")
        candidate_run = side_map.get("candidate")
        if not (baseline_run and candidate_run):
            continue  # need both sides to compare

        baseline_raw = baseline_run.composite_score
        if baseline_raw is None:
            continue
        candidate_raw = candidate_run.composite_score
        if candidate_raw is None:
            continue

        baseline_score = float(baseline_raw)
        candidate_score = float(candidate_raw)
        score_delta = candidate_score - baseline_score

        case = known_cases.get(case_id)
        if case:
            title = case.title
            is_protected = bool(case.is_protected)
        else:
            title = None
            is_protected = False

        rows.append(
            {
                "case_id": case_id,
                "title": title,
                "baseline_composite": baseline_score,
                "candidate_composite": candidate_score,
                "delta": score_delta,
                "protected": is_protected,
            }
        )

        # A protected case may drop by at most `threshold`.
        if is_protected and score_delta < -threshold:
            regressions.append(
                {
                    "case_id": case_id,
                    "title": title,
                    "delta": score_delta,
                    "threshold": threshold,
                }
            )

    baseline_mean = _mean([row["baseline_composite"] for row in rows])
    candidate_mean = _mean([row["candidate_composite"] for row in rows])

    # The 1e-6 margin requires a strict improvement beyond float noise.
    gate_passed = (
        bool(rows)
        and not regressions
        and candidate_mean > baseline_mean + 1e-6
    )

    return GateResult(
        passed=gate_passed,
        mean_baseline_composite=baseline_mean,
        mean_candidate_composite=candidate_mean,
        mean_delta=candidate_mean - baseline_mean,
        protected_regressions=regressions,
        per_case=rows,
    )