Files
life-echo/api/app/features/evaluation/repo.py
Kevin ca8bcc8489 feat(evaluation): session catalog, user export import, and eval web UI
- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
2026-04-06 13:49:28 +08:00

337 lines
9.0 KiB
Python

"""评测域数据访问。"""
from __future__ import annotations
import uuid
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.features.evaluation.models import (
EvalCase,
EvalExperiment,
EvalGateVerdict,
EvalRegressionSet,
EvalRun,
EvalRunTurn,
EvalVersion,
)
def _id() -> str:
    """Return a new random identifier: 32 lowercase hex characters.

    ``UUID.hex`` already is the dash-free 32-character form of a UUID, so no
    string replacement or slicing is needed.
    """
    return uuid.uuid4().hex
async def list_regression_sets(db: AsyncSession) -> list[EvalRegressionSet]:
    """Return all regression sets ordered by ``created_at`` descending.

    The scalar result is de-duplicated with ``unique()`` before being
    materialized into a list.
    """
    newest_first = EvalRegressionSet.created_at.desc()
    stmt = select(EvalRegressionSet).order_by(newest_first)
    result = await db.execute(stmt)
    regression_sets = result.scalars().unique().all()
    return list(regression_sets)
async def get_regression_set(db: AsyncSession, sid: str) -> EvalRegressionSet | None:
    """Fetch the regression set identified by ``sid`` via ``db.get``.

    Returns whatever ``db.get`` yields, i.e. the row or ``None``.
    """
    regression_set = await db.get(EvalRegressionSet, sid)
    return regression_set
async def create_regression_set(
    db: AsyncSession, *, name: str, description: str | None = None
) -> EvalRegressionSet:
    """Create a regression set with a freshly generated id and return it.

    The new row is added to the session and flushed; no commit is issued
    here.
    """
    regression_set = EvalRegressionSet(
        id=_id(),
        name=name,
        description=description,
    )
    db.add(regression_set)
    await db.flush()
    return regression_set
async def list_cases(db: AsyncSession, regression_set_id: str) -> list[EvalCase]:
    """Return the cases of one regression set, oldest ``created_at`` first."""
    in_requested_set = EvalCase.regression_set_id == regression_set_id
    stmt = (
        select(EvalCase)
        .where(in_requested_set)
        .order_by(EvalCase.created_at.asc())
    )
    result = await db.execute(stmt)
    return list(result.scalars().all())
async def create_case(
    db: AsyncSession,
    *,
    regression_set_id: str,
    user_utterances: list[str],
    title: str | None = None,
    source_conversation_id: str | None = None,
    source_user_id: str | None = None,
    reference_memoir_markdown: str | None = None,
    is_protected: bool = False,
    meta: dict[str, Any] | None = None,
) -> EvalCase:
    """Create an evaluation case inside a regression set and return it.

    ``user_utterances`` is copied into a new list before being stored, so
    the row does not share the caller's list object. The row gets a freshly
    generated id, is added to the session and flushed; no commit is issued
    here.
    """
    fields: dict[str, Any] = {
        "id": _id(),
        "regression_set_id": regression_set_id,
        "source_conversation_id": source_conversation_id,
        "source_user_id": source_user_id,
        "title": title,
        "user_utterances": list(user_utterances),
        "reference_memoir_markdown": reference_memoir_markdown,
        "is_protected": is_protected,
        "meta": meta,
    }
    case = EvalCase(**fields)
    db.add(case)
    await db.flush()
    return case
async def get_case(db: AsyncSession, case_id: str) -> EvalCase | None:
    """Fetch the case identified by ``case_id`` via ``db.get``.

    Returns whatever ``db.get`` yields, i.e. the row or ``None``.
    """
    case = await db.get(EvalCase, case_id)
    return case
async def list_versions(db: AsyncSession) -> list[EvalVersion]:
    """Return all evaluation versions ordered by ``created_at`` descending."""
    newest_first = EvalVersion.created_at.desc()
    stmt = select(EvalVersion).order_by(newest_first)
    result = await db.execute(stmt)
    return list(result.scalars().all())
async def create_version(
    db: AsyncSession,
    *,
    name: str,
    runner_kind: str = "llm_chat_v1",
    config_json: dict[str, Any] | None = None,
) -> EvalVersion:
    """Create an evaluation version and return it.

    ``runner_kind`` defaults to ``"llm_chat_v1"``. The row gets a freshly
    generated id, is added to the session and flushed; no commit is issued
    here.
    """
    fields: dict[str, Any] = {
        "id": _id(),
        "name": name,
        "runner_kind": runner_kind,
        "config_json": config_json,
    }
    version = EvalVersion(**fields)
    db.add(version)
    await db.flush()
    return version
async def get_version(db: AsyncSession, vid: str) -> EvalVersion | None:
    """Fetch the evaluation version identified by ``vid`` via ``db.get``.

    Returns whatever ``db.get`` yields, i.e. the row or ``None``.
    """
    version = await db.get(EvalVersion, vid)
    return version
async def create_experiment(
    db: AsyncSession,
    *,
    name: str,
    regression_set_id: str,
    baseline_version_id: str,
    candidate_version_id: str,
    rubric_pack: str = "conversation_v1+memoir_v1",
    composite_weights_json: dict[str, Any] | None = None,
) -> EvalExperiment:
    """Create an experiment in the ``"pending"`` status and return it.

    The experiment references one regression set plus a baseline and a
    candidate version by id. ``rubric_pack`` defaults to
    ``"conversation_v1+memoir_v1"``. The row gets a freshly generated id, is
    added to the session and flushed; no commit is issued here.
    """
    fields: dict[str, Any] = {
        "id": _id(),
        "name": name,
        "regression_set_id": regression_set_id,
        "baseline_version_id": baseline_version_id,
        "candidate_version_id": candidate_version_id,
        "rubric_pack": rubric_pack,
        "composite_weights_json": composite_weights_json,
        "status": "pending",
    }
    experiment = EvalExperiment(**fields)
    db.add(experiment)
    await db.flush()
    return experiment
async def get_experiment(db: AsyncSession, eid: str) -> EvalExperiment | None:
    """Fetch the experiment identified by ``eid`` via ``db.get``.

    Returns whatever ``db.get`` yields, i.e. the row or ``None``.
    """
    experiment = await db.get(EvalExperiment, eid)
    return experiment
async def list_experiments(db: AsyncSession, limit: int = 50) -> list[EvalExperiment]:
    """Return at most ``limit`` experiments, newest ``created_at`` first."""
    newest_first = EvalExperiment.created_at.desc()
    stmt = select(EvalExperiment).order_by(newest_first).limit(limit)
    result = await db.execute(stmt)
    return list(result.scalars().all())
async def update_experiment(
    db: AsyncSession,
    exp: EvalExperiment,
    *,
    status: str | None = None,
    error_message: str | None = None,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``exp`` in memory.

    ``status`` and ``error_message`` are written only when they are not
    ``None``; passing ``None`` leaves the current value untouched and cannot
    clear it. ``completed_at`` uses ``...`` as its "not supplied" marker, so
    an explicit ``None`` is stored on the row. ``db`` is not used, and
    nothing is flushed or committed here.
    """
    optional_fields = {"status": status, "error_message": error_message}
    for attr, value in optional_fields.items():
        if value is not None:
            setattr(exp, attr, value)
    if completed_at is ...:
        return
    exp.completed_at = completed_at
async def get_run(
    db: AsyncSession, experiment_id: str, case_id: str, side: str
) -> EvalRun | None:
    """Return the first run matching experiment, case and side, or ``None``.

    All three values must match; when several rows qualify, the first one
    returned by the query is used.
    """
    conditions = (
        EvalRun.experiment_id == experiment_id,
        EvalRun.case_id == case_id,
        EvalRun.side == side,
    )
    stmt = select(EvalRun).where(*conditions)
    result = await db.execute(stmt)
    return result.scalars().first()
async def create_run(
    db: AsyncSession,
    *,
    experiment_id: str,
    case_id: str,
    side: str,
) -> EvalRun:
    """Create a run in the ``"pending"`` status and return it.

    The run is identified by its experiment, case and side. The row gets a
    freshly generated id, is added to the session and flushed; no commit is
    issued here.
    """
    fields = {
        "id": _id(),
        "experiment_id": experiment_id,
        "case_id": case_id,
        "side": side,
        "status": "pending",
    }
    run = EvalRun(**fields)
    db.add(run)
    await db.flush()
    return run
async def list_runs_for_experiment(
    db: AsyncSession, experiment_id: str
) -> list[EvalRun]:
    """Return every run belonging to ``experiment_id``.

    The query specifies no ordering.
    """
    belongs_to_experiment = EvalRun.experiment_id == experiment_id
    stmt = select(EvalRun).where(belongs_to_experiment)
    result = await db.execute(stmt)
    return list(result.scalars().all())
async def list_runs_for_source_conversation(
    db: AsyncSession,
    *,
    source_conversation_id: str,
    limit: int = 80,
) -> list[tuple[EvalRun, EvalCase, EvalExperiment]]:
    """Return runs whose case originates from the given source conversation.

    Each result row carries the run together with its case and experiment.
    Rows are ordered by ``completed_at`` descending, then ``started_at``
    descending, with NULLs last for both keys, and capped at ``limit``.
    """
    newest_first = (
        EvalRun.completed_at.desc().nulls_last(),
        EvalRun.started_at.desc().nulls_last(),
    )
    query = select(EvalRun, EvalCase, EvalExperiment)
    query = query.join(EvalCase, EvalRun.case_id == EvalCase.id)
    query = query.join(EvalExperiment, EvalRun.experiment_id == EvalExperiment.id)
    query = query.where(EvalCase.source_conversation_id == source_conversation_id)
    query = query.order_by(*newest_first).limit(limit)
    result = await db.execute(query)
    return list(result.all())
async def update_run(
    db: AsyncSession,
    run: EvalRun,
    *,
    status: str | None = None,
    error_message: str | None = None,
    memoir_markdown: str | None = None,
    conversation_score_total: float | None = None,
    memoir_score_total: float | None = None,
    composite_score: float | None = None,
    judge_bundle_json: dict[str, Any] | None = None,
    started_at: Any | None = ...,
    completed_at: Any | None = ...,
) -> None:
    """Apply the supplied field changes to ``run`` in memory.

    Every argument that defaults to ``None`` is written only when it is not
    ``None``; passing ``None`` leaves the current value untouched and cannot
    clear it (falsy values such as ``0.0`` or ``""`` are still written).
    ``started_at`` and ``completed_at`` use ``...`` as their "not supplied"
    marker, so an explicit ``None`` is stored on the row. ``db`` is not
    used, and nothing is flushed or committed here.
    """
    # Fields where None means "leave unchanged".
    skip_when_none = {
        "status": status,
        "error_message": error_message,
        "memoir_markdown": memoir_markdown,
        "conversation_score_total": conversation_score_total,
        "memoir_score_total": memoir_score_total,
        "composite_score": composite_score,
        "judge_bundle_json": judge_bundle_json,
    }
    for attr, value in skip_when_none.items():
        if value is not None:
            setattr(run, attr, value)

    # Timestamps accept None as a real value; only ``...`` means "skip".
    timestamps = {"started_at": started_at, "completed_at": completed_at}
    for attr, value in timestamps.items():
        if value is not ...:
            setattr(run, attr, value)
async def add_turn(
    db: AsyncSession,
    *,
    run_id: str,
    turn_index: int,
    user_utterance: str,
    assistant_reply: str | None,
    duration_ms: int | None,
    judge_scores_json: dict[str, Any] | None,
    judge_rationale: str | None,
) -> EvalRunTurn:
    """Record one turn of a run and return the new row.

    The turn stores the user utterance, the assistant reply, the duration
    and the judge output exactly as given. The row gets a freshly generated
    id, is added to the session and flushed; no commit is issued here.
    """
    fields: dict[str, Any] = {
        "id": _id(),
        "run_id": run_id,
        "turn_index": turn_index,
        "user_utterance": user_utterance,
        "assistant_reply": assistant_reply,
        "duration_ms": duration_ms,
        "judge_scores_json": judge_scores_json,
        "judge_rationale": judge_rationale,
    }
    turn = EvalRunTurn(**fields)
    db.add(turn)
    await db.flush()
    return turn
async def list_turns(db: AsyncSession, run_id: str) -> list[EvalRunTurn]:
    """Return the turns of one run ordered by ascending ``turn_index``."""
    belongs_to_run = EvalRunTurn.run_id == run_id
    stmt = (
        select(EvalRunTurn)
        .where(belongs_to_run)
        .order_by(EvalRunTurn.turn_index.asc())
    )
    result = await db.execute(stmt)
    return list(result.scalars().all())
async def upsert_gate_verdict(
    db: AsyncSession,
    *,
    experiment_id: str,
    passed: bool,
    mean_composite_delta: float | None,
    protected_regressions_json: list[dict[str, Any]] | None,
    details_json: dict[str, Any] | None,
) -> EvalGateVerdict:
    """Create or overwrite the gate verdict of an experiment and return it.

    The first existing verdict for ``experiment_id`` is looked up. If one is
    found, its four result fields are overwritten; otherwise a new row with
    a freshly generated id is added to the session. Either way the session
    is flushed; no commit is issued here.
    """
    lookup = select(EvalGateVerdict).where(
        EvalGateVerdict.experiment_id == experiment_id
    )
    verdict = (await db.execute(lookup)).scalars().first()

    if not verdict:
        # Nothing stored yet for this experiment: insert a fresh row.
        verdict = EvalGateVerdict(
            id=_id(),
            experiment_id=experiment_id,
            passed=passed,
            mean_composite_delta=mean_composite_delta,
            protected_regressions_json=protected_regressions_json,
            details_json=details_json,
        )
        db.add(verdict)
    else:
        # Overwrite the stored outcome in place, including None values.
        verdict.passed = passed
        verdict.mean_composite_delta = mean_composite_delta
        verdict.protected_regressions_json = protected_regressions_json
        verdict.details_json = details_json

    await db.flush()
    return verdict
async def get_gate_verdict(
    db: AsyncSession, experiment_id: str
) -> EvalGateVerdict | None:
    """Return the first gate verdict stored for ``experiment_id``, or ``None``."""
    for_experiment = EvalGateVerdict.experiment_id == experiment_id
    stmt = select(EvalGateVerdict).where(for_experiment)
    result = await db.execute(stmt)
    return result.scalars().first()