feat/ 导出开发容器内的数据用于评估
This commit is contained in:
92
api/app/features/evaluation/gating_service.py
Normal file
92
api/app/features/evaluation/gating_service.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""平台门禁:均分提升 + 受保护 session 无明显退步。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from app.core.config import settings
|
||||
from app.features.evaluation.models import EvalCase, EvalRun
|
||||
|
||||
|
||||
@dataclass
class GateResult:
    """Outcome of a gate evaluation over paired baseline/candidate runs."""

    # Overall verdict of the gate.
    passed: bool
    # Mean composite score of the baseline side over the compared cases.
    mean_baseline_composite: float
    # Mean composite score of the candidate side over the compared cases.
    mean_candidate_composite: float
    # Candidate mean minus baseline mean.
    mean_delta: float
    # One entry per protected case whose score dropped past the threshold.
    protected_regressions: list[dict[str, Any]]
    # One entry per compared case with both scores and their delta.
    per_case: list[dict[str, Any]]
|
||||
|
||||
|
||||
def compute_gate(
    *,
    cases: list[EvalCase],
    runs: list[EvalRun],
    regression_threshold: float | None = None,
) -> GateResult:
    """Decide whether the candidate passes the gate against the baseline.

    Only runs whose ``status`` equals ``"completed"`` are considered. A case
    is compared only when it has both a ``"baseline"`` and a ``"candidate"``
    run and both carry a non-``None`` ``composite_score``. When several
    completed runs share the same case and side, the last one in ``runs``
    wins.

    The gate passes when at least one case was compared, the candidate mean
    exceeds the baseline mean by more than ``1e-6``, and no protected case
    has a delta below ``-threshold``.

    NOTE(review): a protected case without a comparable pair of runs is
    skipped rather than reported as a regression -- confirm this is intended.

    Args:
        cases: Case records; looked up by ``id`` to obtain ``title`` and
            ``is_protected``.
        runs: Run records providing ``status``, ``case_id``, ``side`` and
            ``composite_score``.
        regression_threshold: Largest tolerated score drop for a protected
            case. ``None`` falls back to
            ``settings.eval_gate_protected_regression_threshold``.

    Returns:
        A ``GateResult`` holding the verdict, both means, their difference,
        the protected regressions and the per-case rows.
    """
    if regression_threshold is None:
        threshold = settings.eval_gate_protected_regression_threshold
    else:
        threshold = regression_threshold

    # Group completed runs as {case_id: {side: run}}.
    runs_by_case: dict[str, dict[str, EvalRun]] = {}
    for run in runs:
        if run.status == "completed":
            side_map = runs_by_case.setdefault(run.case_id, {})
            side_map[run.side] = run

    cases_by_id = {case.id: case for case in cases}

    per_case: list[dict[str, Any]] = []
    for case_id, side_map in runs_by_case.items():
        baseline_run = side_map.get("baseline")
        candidate_run = side_map.get("candidate")
        if not (baseline_run and candidate_run):
            continue
        if (
            baseline_run.composite_score is None
            or c_run_score_missing(candidate_run)
        ):
            continue

        baseline_score = float(baseline_run.composite_score)
        candidate_score = float(candidate_run.composite_score)
        eval_case = cases_by_id.get(case_id)
        per_case.append(
            {
                "case_id": case_id,
                "title": eval_case.title if eval_case else None,
                "baseline_composite": baseline_score,
                "candidate_composite": candidate_score,
                "delta": candidate_score - baseline_score,
                # Cases missing from ``cases`` are treated as unprotected.
                "protected": bool(eval_case and eval_case.is_protected),
            }
        )

    # A protected row always has a matching case, so its title is the same
    # value the row already carries.
    protected_regs: list[dict[str, Any]] = [
        {
            "case_id": row["case_id"],
            "title": row["title"],
            "delta": row["delta"],
            "threshold": threshold,
        }
        for row in per_case
        if row["protected"] and row["delta"] < -threshold
    ]

    compared = len(per_case)
    if compared:
        mean_b = sum(row["baseline_composite"] for row in per_case) / compared
        mean_c = sum(row["candidate_composite"] for row in per_case) / compared
    else:
        mean_b = 0.0
        mean_c = 0.0

    # The small epsilon keeps floating-point noise from counting as a gain.
    passed = compared > 0 and not protected_regs and mean_c > mean_b + 1e-6

    return GateResult(
        passed=passed,
        mean_baseline_composite=mean_b,
        mean_candidate_composite=mean_c,
        mean_delta=mean_c - mean_b,
        protected_regressions=protected_regs,
        per_case=per_case,
    )


def c_run_score_missing(run: EvalRun) -> bool:
    """Return True when *run* has no composite score."""
    return run.composite_score is None
|
||||
Reference in New Issue
Block a user