Files
life-echo/api/app/features/evaluation/admin_service.py
Kevin ca8bcc8489 feat(evaluation): session catalog, user export import, and eval web UI
- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
2026-04-06 13:49:28 +08:00

285 lines
11 KiB
Python

"""内部评测 REST 编排:事务与业务规则;数据访问经 repo。"""
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.features.evaluation import repo as eval_repo
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.importers.script_json import parse_script_json
from app.features.evaluation.importers.user_export_markdown import (
extract_user_utterances_from_export_md,
)
from app.features.evaluation.models import (
EvalCase,
EvalExperiment,
EvalGateVerdict,
EvalRegressionSet,
EvalRun,
EvalRunTurn,
EvalVersion,
)
from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import (
CaseCreate,
ExperimentCreate,
ImportJsonCaseBody,
ImportMarkdownBody,
RegressionSetCreate,
SessionEvalRunItem,
SessionEvalRunsOut,
SnapshotFromConversationBody,
VersionCreate,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import (
list_user_export_fixture_names as list_user_export_md_filenames,
)
from app.features.evaluation.user_export_fixtures import (
read_user_export_fixture,
)
from app.tasks.evaluation_tasks import run_eval_experiment_task
@dataclass(frozen=True)
class ExperimentDetailBundle:
    """Immutable result of ``EvaluationAdminService.get_experiment_detail``.

    Groups an experiment with its runs (each paired with that run's turns)
    and the experiment's gate verdict, if one was found.
    """

    # The experiment row that was looked up.
    experiment: EvalExperiment
    # One ``(run, turns)`` pair per run of the experiment.
    run_rows: list[tuple[EvalRun, list[EvalRunTurn]]]
    # Gate verdict for the experiment; ``None`` when the repo returned none.
    gate: EvalGateVerdict | None
class EvaluationAdminService:
    """Orchestrate the internal-evaluation admin operations over one session.

    Data access is delegated to ``eval_repo``. Write operations commit the
    session and refresh the created row before returning it. Missing parent
    rows are reported with ``EvaluationNotFoundError``; unusable input is
    reported with ``EvaluationBadRequestError``.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Keep *db* as the session passed to every repo call."""
        self._db = db

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    async def _require_regression_set(self, set_id: str) -> EvalRegressionSet:
        """Return the regression set *set_id* or raise if the lookup is falsy.

        Raises:
            EvaluationNotFoundError: the repo returned no regression set.
        """
        parent = await eval_repo.get_regression_set(self._db, set_id)
        if not parent:
            raise EvaluationNotFoundError("regression set not found")
        return parent

    async def _require_experiment(self, experiment_id: str) -> EvalExperiment:
        """Return the experiment *experiment_id* or raise if the lookup is falsy.

        Raises:
            EvaluationNotFoundError: the repo returned no experiment.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")
        return exp

    async def _commit_and_refresh(self, row: Any) -> Any:
        """Commit the session, refresh *row* from it, and return *row*."""
        await self._db.commit()
        await self._db.refresh(row)
        return row

    # ------------------------------------------------------------------
    # Regression sets and cases
    # ------------------------------------------------------------------

    async def list_regression_sets(self) -> list[EvalRegressionSet]:
        """Return the regression sets reported by the repo."""
        return await eval_repo.list_regression_sets(self._db)

    async def create_regression_set(
        self, body: RegressionSetCreate
    ) -> EvalRegressionSet:
        """Create a regression set from *body*'s name and description."""
        row = await eval_repo.create_regression_set(
            self._db, name=body.name, description=body.description
        )
        return await self._commit_and_refresh(row)

    async def list_cases(self, set_id: str) -> list[EvalCase]:
        """Return the cases of regression set *set_id*.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        return await eval_repo.list_cases(self._db, set_id)

    async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase:
        """Create a case in regression set *set_id* from the fields of *body*.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=body.user_utterances,
            title=body.title,
            source_conversation_id=body.source_conversation_id,
            source_user_id=body.source_user_id,
            reference_memoir_markdown=body.reference_memoir_markdown,
            is_protected=body.is_protected,
            meta=body.meta,
        )
        return await self._commit_and_refresh(row)

    async def snapshot_from_conversation(
        self,
        set_id: str,
        conversation_id: str,
        body: SnapshotFromConversationBody,
    ) -> EvalCase:
        """Create a case from the user utterances of an existing conversation.

        The transcript is fetched through ``SessionCatalogService``;
        ``body.use_messages`` selects whether the utterances come from the
        transcript's messages or from its segments.

        Raises:
            EvaluationNotFoundError: the regression set or the conversation
                transcript was not found.
            EvaluationBadRequestError: the selected utterance list is empty.
        """
        await self._require_regression_set(set_id)
        catalog = SessionCatalogService(self._db)
        tr = await catalog.get_transcript(conversation_id)
        if not tr:
            raise EvaluationNotFoundError("conversation not found")
        utterances = (
            tr.user_utterances_from_messages
            if body.use_messages
            else tr.user_utterances_from_segments
        )
        if not utterances:
            raise EvaluationBadRequestError("no user utterances in session")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            source_conversation_id=conversation_id,
            source_user_id=tr.user_id,
            is_protected=body.is_protected,
            # Record where the case came from and which source was used.
            meta={"source": "conversation_snapshot", "use_messages": body.use_messages},
        )
        return await self._commit_and_refresh(row)

    async def import_markdown_case(
        self, set_id: str, body: ImportMarkdownBody
    ) -> EvalCase:
        """Create a case from the user lines parsed out of ``body.markdown``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: the markdown yielded no utterances.
        """
        await self._require_regression_set(set_id)
        utterances = extract_user_utterances_from_export_md(body.markdown)
        if not utterances:
            raise EvaluationBadRequestError("no user lines parsed from markdown")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "markdown_import"},
        )
        return await self._commit_and_refresh(row)

    async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase:
        """Create a case from explicit utterances or from a raw JSON script.

        A truthy ``body.utterances`` wins: each item is stringified and
        stripped, and blank results are dropped. Otherwise ``body.raw_json``
        (when not ``None``) is serialised to text and handed to
        ``parse_script_json``, which returns the utterances plus extra meta.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: neither input was supplied, or no
                utterances remain after parsing/cleaning.
        """
        await self._require_regression_set(body.regression_set_id)
        meta_extra: dict[str, Any]
        if body.utterances:
            # Strip each entry once, then keep only the non-blank ones.
            stripped = (str(u).strip() for u in body.utterances)
            utt = [text for text in stripped if text]
            meta_extra = {}
        elif body.raw_json is not None:
            # parse_script_json is called with text here, so re-serialise
            # the already-decoded payload first.
            payload_str = json.dumps(body.raw_json, ensure_ascii=False)
            utt, meta_extra = parse_script_json(payload_str)
        else:
            raise EvaluationBadRequestError("utterances or raw_json required")
        if not utt:
            raise EvaluationBadRequestError("empty utterances")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=body.regression_set_id,
            user_utterances=utt,
            title=body.title,
            is_protected=body.is_protected,
            # Parser-provided keys are merged last, so they take precedence
            # over the default "source" marker.
            meta={"source": "json_import", **meta_extra},
        )
        return await self._commit_and_refresh(row)

    # ------------------------------------------------------------------
    # Versions
    # ------------------------------------------------------------------

    async def list_versions(self) -> list[EvalVersion]:
        """Return the versions reported by the repo."""
        return await eval_repo.list_versions(self._db)

    async def create_version(self, body: VersionCreate) -> EvalVersion:
        """Create a version from *body*'s name, runner kind and config."""
        row = await eval_repo.create_version(
            self._db,
            name=body.name,
            runner_kind=body.runner_kind,
            config_json=body.config_json,
        )
        return await self._commit_and_refresh(row)

    # ------------------------------------------------------------------
    # Experiments and runs
    # ------------------------------------------------------------------

    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        """Return experiments from the repo, passing *limit* through."""
        return await eval_repo.list_experiments(self._db, limit=limit)

    async def list_session_evaluation_runs(
        self, conversation_id: str
    ) -> SessionEvalRunsOut:
        """Return the evaluation runs whose source is *conversation_id*.

        Each repo row is a ``(run, case, experiment)`` triple; the case is not
        used. Turns are loaded per run and rendered through ``run_out``.
        """
        rows = await eval_repo.list_runs_for_source_conversation(
            self._db, source_conversation_id=conversation_id
        )
        items: list[SessionEvalRunItem] = []
        for run, _case, exp in rows:
            turns = await eval_repo.list_turns(self._db, run.id)
            items.append(
                SessionEvalRunItem(
                    experiment_name=exp.name,
                    run=run_out(run, turns),
                )
            )
        return SessionEvalRunsOut(conversation_id=conversation_id, items=items)

    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        """Create an experiment after checking its regression set and versions.

        Raises:
            EvaluationNotFoundError: the regression set, the baseline version
                or the candidate version was not found.
        """
        await self._require_regression_set(body.regression_set_id)
        bv = await eval_repo.get_version(self._db, body.baseline_version_id)
        cv = await eval_repo.get_version(self._db, body.candidate_version_id)
        if not bv or not cv:
            raise EvaluationNotFoundError("version not found")
        row = await eval_repo.create_experiment(
            self._db,
            name=body.name,
            regression_set_id=body.regression_set_id,
            baseline_version_id=body.baseline_version_id,
            candidate_version_id=body.candidate_version_id,
            rubric_pack=body.rubric_pack,
            composite_weights_json=body.composite_weights_json,
        )
        return await self._commit_and_refresh(row)

    async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle:
        """Return the experiment with its runs, their turns and gate verdict.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await self._require_experiment(experiment_id)
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_rows: list[tuple[EvalRun, list[EvalRunTurn]]] = []
        for r in runs:
            turns = await eval_repo.list_turns(self._db, r.id)
            run_rows.append((r, turns))
        gv = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return ExperimentDetailBundle(experiment=exp, run_rows=run_rows, gate=gv)

    async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment:
        """Hand *experiment_id* to ``run_eval_experiment_task`` and return it.

        The experiment row is refreshed from the session after the task is
        submitted via ``.delay`` and before it is returned.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await self._require_experiment(experiment_id)
        # NOTE(review): presumably a background task queue call — confirm
        # whether .delay() can block the event loop.
        run_eval_experiment_task.delay(experiment_id)
        await self._db.refresh(exp)
        return exp

    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
        """Return a plain-dict snapshot of an experiment, or ``None``.

        Unlike the other experiment lookups, a missing experiment yields
        ``None`` instead of raising. The dict carries the experiment id, its
        status, the dumped runs and the dumped gate verdict (``None`` when
        there is no verdict).
        """
        # NOTE(review): imported locally although this module already imports
        # other names from the same schemas module at the top — consider
        # hoisting.
        from app.features.evaluation.schemas import GateVerdictOut

        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            return None
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_payload = []
        for r in runs:
            turns = await eval_repo.list_turns(self._db, r.id)
            run_payload.append(run_out(r, turns).model_dump())
        gv = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return {
            "experiment_id": experiment_id,
            "status": exp.status,
            "runs": run_payload,
            "gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None,
        }

    # ------------------------------------------------------------------
    # User-export fixtures
    # ------------------------------------------------------------------

    def list_user_export_fixture_names(self) -> list[str]:
        """Return the user-export fixture names from the fixtures module."""
        return list_user_export_md_filenames()

    def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
        """Return the turns of fixture *filename*.

        ``read_user_export_fixture`` returns a pair; only its first element
        (the turns) is kept.
        """
        turns, _ = read_user_export_fixture(filename)
        return turns