93 lines
2.7 KiB
Python
93 lines
2.7 KiB
Python
"""平台门禁:均分提升 + 受保护 session 无明显退步。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from app.core.config import settings
|
|
from app.features.evaluation.models import EvalCase, EvalRun
|
|
|
|
|
|
@dataclass
class GateResult:
    """Outcome of one platform gate evaluation, as built by ``compute_gate``.

    The two list fields hold plain dictionaries:

    * ``protected_regressions`` entries carry the keys ``case_id``,
      ``title``, ``delta`` and ``threshold``.
    * ``per_case`` entries carry the keys ``case_id``, ``title``,
      ``baseline_composite``, ``candidate_composite``, ``delta`` and
      ``protected``.
    """

    # Overall verdict of the gate.
    passed: bool
    # Mean composite score over the baseline side of the scored cases.
    mean_baseline_composite: float
    # Mean composite score over the candidate side of the scored cases.
    mean_candidate_composite: float
    # Candidate mean minus baseline mean.
    mean_delta: float
    # Protected cases whose score dropped by more than the threshold.
    protected_regressions: list[dict[str, Any]]
    # One row per case that had both sides scored.
    per_case: list[dict[str, Any]]
|
|
|
|
|
|
def compute_gate(
    *,
    cases: list[EvalCase],
    runs: list[EvalRun],
    regression_threshold: float | None = None,
    min_mean_improvement: float = 1e-6,
) -> GateResult:
    """Decide whether a candidate passes the platform gate.

    Completed runs are paired per case into a ``"baseline"`` side and a
    ``"candidate"`` side.  A case is scored only when both sides exist and
    both carry a composite score; every other case is left out of all
    statistics.  The gate passes when all of the following hold:

    * at least one case was scored,
    * the candidate mean exceeds the baseline mean by more than
      ``min_mean_improvement``,
    * no protected case dropped by more than the regression threshold.

    Args:
        cases: Case definitions, used to look up ``title`` and
            ``is_protected`` by ``id``.  A run whose ``case_id`` matches no
            case is still scored, as unprotected and with a ``None`` title.
        runs: Evaluation runs.  Only those whose ``status`` equals
            ``"completed"`` are considered; when several completed runs
            share a case and a side, the last one in the list wins.
        regression_threshold: Largest tolerated score drop on a protected
            case.  ``None`` falls back to
            ``settings.eval_gate_protected_regression_threshold``.
        min_mean_improvement: Margin by which the candidate mean must exceed
            the baseline mean.  The default keeps the previously hard-coded
            ``1e-6``.

    Returns:
        A ``GateResult`` holding the verdict, both means and their
        difference, the protected regressions, and one row per scored case.
        Both means are ``0.0`` when no case was scored.
    """
    threshold = (
        settings.eval_gate_protected_regression_threshold
        if regression_threshold is None
        else regression_threshold
    )

    # case_id -> {side: run}; a later completed run for the same case and
    # side replaces an earlier one.
    runs_by_case: dict[str, dict[str, EvalRun]] = {}
    for run in runs:
        if run.status == "completed":
            runs_by_case.setdefault(run.case_id, {})[run.side] = run

    cases_by_id = {case.id: case for case in cases}

    per_case: list[dict[str, Any]] = []
    baseline_scores: list[float] = []
    candidate_scores: list[float] = []
    protected_regressions: list[dict[str, Any]] = []

    for case_id, sides in runs_by_case.items():
        baseline = sides.get("baseline")
        candidate = sides.get("candidate")
        # Explicit None checks: a run object must not be dropped merely
        # because it happens to evaluate as falsy.
        if baseline is None or candidate is None:
            continue
        if baseline.composite_score is None or candidate.composite_score is None:
            continue

        baseline_score = float(baseline.composite_score)
        candidate_score = float(candidate.composite_score)
        delta = candidate_score - baseline_score
        baseline_scores.append(baseline_score)
        candidate_scores.append(candidate_score)

        case = cases_by_id.get(case_id)
        title = case.title if case is not None else None
        protected = case is not None and bool(case.is_protected)

        per_case.append(
            {
                "case_id": case_id,
                "title": title,
                "baseline_composite": baseline_score,
                "candidate_composite": candidate_score,
                "delta": delta,
                "protected": protected,
            }
        )

        # Only a drop strictly larger than the threshold counts as a
        # regression; a drop exactly equal to it is tolerated.
        if protected and delta < -threshold:
            protected_regressions.append(
                {
                    "case_id": case_id,
                    "title": title,
                    "delta": delta,
                    "threshold": threshold,
                }
            )

    scored = len(baseline_scores)
    mean_baseline = sum(baseline_scores) / scored if scored else 0.0
    mean_candidate = sum(candidate_scores) / scored if scored else 0.0

    # With nothing scored both means are 0.0, so the gate must fail rather
    # than pass vacuously.
    passed = (
        scored > 0
        and mean_candidate > mean_baseline + min_mean_improvement
        and not protected_regressions
    )

    return GateResult(
        passed=passed,
        mean_baseline_composite=mean_baseline,
        mean_candidate_composite=mean_candidate,
        mean_delta=mean_candidate - mean_baseline,
        protected_regressions=protected_regressions,
        per_case=per_case,
    )
|