"""Data access for the evaluation domain."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from typing import Any
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.features.evaluation.models import (
|
|
EvalCase,
|
|
EvalExperiment,
|
|
EvalGateVerdict,
|
|
EvalRegressionSet,
|
|
EvalRun,
|
|
EvalRunTurn,
|
|
EvalVersion,
|
|
)
|
|
|
|
|
|
def _id() -> str:
    """Return a fresh random identifier of 32 lowercase hexadecimal characters.

    ``UUID.hex`` is already the dash-free 32-character form of the UUID, so
    no string replacement or slicing is needed.
    """
    return uuid.uuid4().hex
|
|
|
|
|
|
async def list_regression_sets(db: AsyncSession) -> list[EvalRegressionSet]:
    """Fetch all regression sets, newest first.

    Args:
        db: Async session used to run the query.

    Returns:
        De-duplicated ``EvalRegressionSet`` rows ordered by ``created_at``
        descending.
    """
    newest_first = EvalRegressionSet.created_at.desc()
    stmt = select(EvalRegressionSet).order_by(newest_first)
    result = await db.execute(stmt)
    # unique() removes repeated entities before the rows are materialized.
    regression_sets = result.scalars().unique().all()
    return list(regression_sets)
|
|
|
|
|
|
async def get_regression_set(db: AsyncSession, sid: str) -> EvalRegressionSet | None:
    """Look up a single regression set by primary key.

    Args:
        db: Async session used for the lookup.
        sid: Primary-key value of the regression set.

    Returns:
        The matching ``EvalRegressionSet``, or ``None`` if there is none.
    """
    regression_set = await db.get(EvalRegressionSet, sid)
    return regression_set
|
|
|
|
|
|
async def create_regression_set(
    db: AsyncSession, *, name: str, description: str | None = None
) -> EvalRegressionSet:
    """Create a regression set, add it to the session and flush.

    Args:
        db: Async session the new row is added to.
        name: Name stored on the regression set.
        description: Optional description stored on the regression set.

    Returns:
        The newly created ``EvalRegressionSet`` with a generated ``id``.
    """
    regression_set = EvalRegressionSet(
        id=_id(),
        name=name,
        description=description,
    )
    db.add(regression_set)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return regression_set
|
|
|
|
|
|
async def list_cases(db: AsyncSession, regression_set_id: str) -> list[EvalCase]:
    """Fetch the cases of one regression set, oldest first.

    Args:
        db: Async session used to run the query.
        regression_set_id: Value matched against ``EvalCase.regression_set_id``.

    Returns:
        Matching ``EvalCase`` rows ordered by ``created_at`` ascending.
    """
    in_requested_set = EvalCase.regression_set_id == regression_set_id
    stmt = (
        select(EvalCase)
        .where(in_requested_set)
        .order_by(EvalCase.created_at.asc())
    )
    result = await db.execute(stmt)
    cases = result.scalars().all()
    return list(cases)
|
|
|
|
|
|
async def create_case(
    db: AsyncSession,
    *,
    regression_set_id: str,
    user_utterances: list[str],
    title: str | None = None,
    source_conversation_id: str | None = None,
    source_user_id: str | None = None,
    reference_memoir_markdown: str | None = None,
    is_protected: bool = False,
    meta: dict[str, Any] | None = None,
) -> EvalCase:
    """Create an evaluation case, add it to the session and flush.

    Args:
        db: Async session the new row is added to.
        regression_set_id: Identifier of the regression set stored on the case.
        user_utterances: User utterances for the case; a copy is stored.
        title: Optional title.
        source_conversation_id: Optional source conversation identifier.
        source_user_id: Optional source user identifier.
        reference_memoir_markdown: Optional reference memoir in Markdown.
        is_protected: Value stored in the case's ``is_protected`` field.
        meta: Optional free-form metadata.

    Returns:
        The newly created ``EvalCase`` with a generated ``id``.
    """
    # Copy so the stored list is independent of the caller's list object.
    utterances_copy = list(user_utterances)
    case = EvalCase(
        id=_id(),
        regression_set_id=regression_set_id,
        source_conversation_id=source_conversation_id,
        source_user_id=source_user_id,
        title=title,
        user_utterances=utterances_copy,
        reference_memoir_markdown=reference_memoir_markdown,
        is_protected=is_protected,
        meta=meta,
    )
    db.add(case)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return case
|
|
|
|
|
|
async def get_case(db: AsyncSession, case_id: str) -> EvalCase | None:
    """Look up a single evaluation case by primary key.

    Args:
        db: Async session used for the lookup.
        case_id: Primary-key value of the case.

    Returns:
        The matching ``EvalCase``, or ``None`` if there is none.
    """
    case = await db.get(EvalCase, case_id)
    return case
|
|
|
|
|
|
async def list_versions(db: AsyncSession) -> list[EvalVersion]:
    """Fetch all versions, newest first.

    Args:
        db: Async session used to run the query.

    Returns:
        ``EvalVersion`` rows ordered by ``created_at`` descending.
    """
    stmt = select(EvalVersion).order_by(EvalVersion.created_at.desc())
    result = await db.execute(stmt)
    versions = result.scalars().all()
    return list(versions)
|
|
|
|
|
|
async def create_version(
    db: AsyncSession,
    *,
    name: str,
    runner_kind: str = "llm_chat_v1",
    config_json: dict[str, Any] | None = None,
) -> EvalVersion:
    """Create a version, add it to the session and flush.

    Args:
        db: Async session the new row is added to.
        name: Name stored on the version.
        runner_kind: Runner kind stored on the version.
        config_json: Optional configuration mapping stored on the version.

    Returns:
        The newly created ``EvalVersion`` with a generated ``id``.
    """
    version = EvalVersion(
        id=_id(),
        name=name,
        runner_kind=runner_kind,
        config_json=config_json,
    )
    db.add(version)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return version
|
|
|
|
|
|
async def get_version(db: AsyncSession, vid: str) -> EvalVersion | None:
    """Look up a single version by primary key.

    Args:
        db: Async session used for the lookup.
        vid: Primary-key value of the version.

    Returns:
        The matching ``EvalVersion``, or ``None`` if there is none.
    """
    version = await db.get(EvalVersion, vid)
    return version
|
|
|
|
|
|
async def create_experiment(
    db: AsyncSession,
    *,
    name: str,
    regression_set_id: str,
    baseline_version_id: str,
    candidate_version_id: str,
    rubric_pack: str = "conversation_v1+memoir_v1",
    composite_weights_json: dict[str, Any] | None = None,
) -> EvalExperiment:
    """Create an experiment with status ``"pending"``, add it and flush.

    Args:
        db: Async session the new row is added to.
        name: Name stored on the experiment.
        regression_set_id: Regression set identifier stored on the experiment.
        baseline_version_id: Baseline version identifier.
        candidate_version_id: Candidate version identifier.
        rubric_pack: Rubric pack string stored on the experiment.
        composite_weights_json: Optional composite-weights mapping.

    Returns:
        The newly created ``EvalExperiment`` with a generated ``id``.
    """
    experiment = EvalExperiment(
        id=_id(),
        name=name,
        regression_set_id=regression_set_id,
        baseline_version_id=baseline_version_id,
        candidate_version_id=candidate_version_id,
        rubric_pack=rubric_pack,
        composite_weights_json=composite_weights_json,
        status="pending",
    )
    db.add(experiment)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return experiment
|
|
|
|
|
|
async def get_experiment(db: AsyncSession, eid: str) -> EvalExperiment | None:
    """Look up a single experiment by primary key.

    Args:
        db: Async session used for the lookup.
        eid: Primary-key value of the experiment.

    Returns:
        The matching ``EvalExperiment``, or ``None`` if there is none.
    """
    experiment = await db.get(EvalExperiment, eid)
    return experiment
|
|
|
|
|
|
async def list_experiments(db: AsyncSession, limit: int = 50) -> list[EvalExperiment]:
    """Fetch the most recently created experiments.

    Args:
        db: Async session used to run the query.
        limit: Maximum number of rows requested from the database.

    Returns:
        Up to ``limit`` ``EvalExperiment`` rows ordered by ``created_at``
        descending.
    """
    newest_first = EvalExperiment.created_at.desc()
    stmt = select(EvalExperiment).order_by(newest_first).limit(limit)
    result = await db.execute(stmt)
    experiments = result.scalars().all()
    return list(experiments)
|
|
|
|
|
|
async def update_experiment(
    db: AsyncSession,
    exp: EvalExperiment,
    *,
    status: str | None = None,
    error_message: str | None = None,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``exp`` in memory.

    The session is not used here: nothing is added, flushed or committed.

    Args:
        db: Async session (unused by this function).
        exp: Experiment object whose attributes are assigned.
        status: New status; ``None`` leaves the current value alone.
        error_message: New error message; ``None`` leaves the current value
            alone.
        completed_at: New completion value. The default ``...`` means
            "not supplied", so an explicit ``None`` is assigned as-is.
    """
    # For these fields None means "not supplied".
    skip_when_none = (
        ("status", status),
        ("error_message", error_message),
    )
    for attr_name, new_value in skip_when_none:
        if new_value is not None:
            setattr(exp, attr_name, new_value)

    # Ellipsis is the "not supplied" marker so that None can be written.
    if completed_at is not Ellipsis:
        exp.completed_at = completed_at
|
|
|
|
|
|
async def get_run(
    db: AsyncSession, experiment_id: str, case_id: str, side: str
) -> EvalRun | None:
    """Find the run for a given experiment, case and side.

    Args:
        db: Async session used to run the query.
        experiment_id: Value matched against ``EvalRun.experiment_id``.
        case_id: Value matched against ``EvalRun.case_id``.
        side: Value matched against ``EvalRun.side``.

    Returns:
        The first matching ``EvalRun``, or ``None`` when nothing matches.
    """
    criteria = (
        EvalRun.experiment_id == experiment_id,
        EvalRun.case_id == case_id,
        EvalRun.side == side,
    )
    stmt = select(EvalRun).where(*criteria)
    result = await db.execute(stmt)
    return result.scalars().first()
|
|
|
|
|
|
async def create_run(
    db: AsyncSession,
    *,
    experiment_id: str,
    case_id: str,
    side: str,
) -> EvalRun:
    """Create a run with status ``"pending"``, add it to the session and flush.

    Args:
        db: Async session the new row is added to.
        experiment_id: Experiment identifier stored on the run.
        case_id: Case identifier stored on the run.
        side: Side value stored on the run.

    Returns:
        The newly created ``EvalRun`` with a generated ``id``.
    """
    run = EvalRun(
        id=_id(),
        experiment_id=experiment_id,
        case_id=case_id,
        side=side,
        status="pending",
    )
    db.add(run)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return run
|
|
|
|
|
|
async def list_runs_for_experiment(
    db: AsyncSession, experiment_id: str
) -> list[EvalRun]:
    """Fetch every run that belongs to one experiment.

    The query has no ``ORDER BY``, so the row order is whatever the database
    returns.

    Args:
        db: Async session used to run the query.
        experiment_id: Value matched against ``EvalRun.experiment_id``.

    Returns:
        All matching ``EvalRun`` rows.
    """
    in_experiment = EvalRun.experiment_id == experiment_id
    stmt = select(EvalRun).where(in_experiment)
    result = await db.execute(stmt)
    runs = result.scalars().all()
    return list(runs)
|
|
|
|
|
|
async def update_run(
    db: AsyncSession,
    run: EvalRun,
    *,
    status: str | None = None,
    error_message: str | None = None,
    memoir_markdown: str | None = None,
    conversation_score_total: float | None = None,
    memoir_score_total: float | None = None,
    composite_score: float | None = None,
    judge_bundle_json: dict[str, Any] | None = None,
    started_at: Any | None = ...,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``run`` in memory.

    The session is not used here: nothing is added, flushed or committed.

    Args:
        db: Async session (unused by this function).
        run: Run object whose attributes are assigned.
        status: New status; ``None`` leaves the current value alone.
        error_message: New error message; ``None`` leaves it alone.
        memoir_markdown: New memoir Markdown; ``None`` leaves it alone.
        conversation_score_total: New conversation score; ``None`` leaves it
            alone (``0.0`` is still written).
        memoir_score_total: New memoir score; ``None`` leaves it alone.
        composite_score: New composite score; ``None`` leaves it alone.
        judge_bundle_json: New judge bundle; ``None`` leaves it alone.
        started_at: New start value. The default ``...`` means
            "not supplied", so an explicit ``None`` is assigned as-is.
        completed_at: New completion value, with the same ``...`` convention
            as ``started_at``.
    """
    # For these fields None means "not supplied"; falsy values such as 0.0
    # or {} are still written.
    skip_when_none = (
        ("status", status),
        ("error_message", error_message),
        ("memoir_markdown", memoir_markdown),
        ("conversation_score_total", conversation_score_total),
        ("memoir_score_total", memoir_score_total),
        ("composite_score", composite_score),
        ("judge_bundle_json", judge_bundle_json),
    )
    for attr_name, new_value in skip_when_none:
        if new_value is not None:
            setattr(run, attr_name, new_value)

    # Ellipsis is the "not supplied" marker so that None can be written.
    skip_when_ellipsis = (
        ("started_at", started_at),
        ("completed_at", completed_at),
    )
    for attr_name, new_value in skip_when_ellipsis:
        if new_value is not Ellipsis:
            setattr(run, attr_name, new_value)
|
|
|
|
|
|
async def add_turn(
    db: AsyncSession,
    *,
    run_id: str,
    turn_index: int,
    user_utterance: str,
    assistant_reply: str | None,
    duration_ms: int | None,
    judge_scores_json: dict[str, Any] | None,
    judge_rationale: str | None,
) -> EvalRunTurn:
    """Create one turn record for a run, add it to the session and flush.

    Args:
        db: Async session the new row is added to.
        run_id: Run identifier stored on the turn.
        turn_index: Index of the turn, stored as given.
        user_utterance: User utterance for this turn.
        assistant_reply: Assistant reply, or ``None``.
        duration_ms: Duration value stored on the turn, or ``None``.
        judge_scores_json: Judge scores mapping, or ``None``.
        judge_rationale: Judge rationale text, or ``None``.

    Returns:
        The newly created ``EvalRunTurn`` with a generated ``id``.
    """
    turn = EvalRunTurn(
        id=_id(),
        run_id=run_id,
        turn_index=turn_index,
        user_utterance=user_utterance,
        assistant_reply=assistant_reply,
        duration_ms=duration_ms,
        judge_scores_json=judge_scores_json,
        judge_rationale=judge_rationale,
    )
    db.add(turn)
    # Only a flush happens here; this function does not commit.
    await db.flush()
    return turn
|
|
|
|
|
|
async def list_turns(db: AsyncSession, run_id: str) -> list[EvalRunTurn]:
    """Fetch the turns of one run in turn order.

    Args:
        db: Async session used to run the query.
        run_id: Value matched against ``EvalRunTurn.run_id``.

    Returns:
        Matching ``EvalRunTurn`` rows ordered by ``turn_index`` ascending.
    """
    belongs_to_run = EvalRunTurn.run_id == run_id
    stmt = (
        select(EvalRunTurn)
        .where(belongs_to_run)
        .order_by(EvalRunTurn.turn_index.asc())
    )
    result = await db.execute(stmt)
    turns = result.scalars().all()
    return list(turns)
|
|
|
|
|
|
async def upsert_gate_verdict(
    db: AsyncSession,
    *,
    experiment_id: str,
    passed: bool,
    mean_composite_delta: float | None,
    protected_regressions_json: list[dict[str, Any]] | None,
    details_json: dict[str, Any] | None,
) -> EvalGateVerdict:
    """Create or overwrite the gate verdict of an experiment, then flush.

    The first verdict found for ``experiment_id`` has its result fields
    overwritten; if none exists a new row with a generated ``id`` is added.
    This function flushes but does not commit.

    Args:
        db: Async session used for the lookup and the write.
        experiment_id: Value matched against ``EvalGateVerdict.experiment_id``.
        passed: Pass/fail flag to store.
        mean_composite_delta: Mean composite delta to store, or ``None``.
        protected_regressions_json: Protected-regression entries, or ``None``.
        details_json: Details mapping to store, or ``None``.

    Returns:
        The updated or newly created ``EvalGateVerdict``.
    """
    stmt = select(EvalGateVerdict).where(
        EvalGateVerdict.experiment_id == experiment_id
    )
    result = await db.execute(stmt)
    verdict = result.scalars().first()

    # Compare against None explicitly: whether a row exists must not depend
    # on the truthiness of the mapped object.
    if verdict is None:
        verdict = EvalGateVerdict(
            id=_id(),
            experiment_id=experiment_id,
            passed=passed,
            mean_composite_delta=mean_composite_delta,
            protected_regressions_json=protected_regressions_json,
            details_json=details_json,
        )
        db.add(verdict)
    else:
        verdict.passed = passed
        verdict.mean_composite_delta = mean_composite_delta
        verdict.protected_regressions_json = protected_regressions_json
        verdict.details_json = details_json

    await db.flush()
    return verdict
|
|
|
|
|
|
async def get_gate_verdict(
    db: AsyncSession, experiment_id: str
) -> EvalGateVerdict | None:
    """Find the gate verdict recorded for an experiment.

    Args:
        db: Async session used to run the query.
        experiment_id: Value matched against ``EvalGateVerdict.experiment_id``.

    Returns:
        The first matching ``EvalGateVerdict``, or ``None`` when nothing
        matches.
    """
    for_experiment = EvalGateVerdict.experiment_id == experiment_id
    stmt = select(EvalGateVerdict).where(for_experiment)
    result = await db.execute(stmt)
    return result.scalars().first()
|