- Extend evaluation API: schemas, router, repo, admin and execution services - Improve user export markdown importer; add fixtures and importer tests - Session catalog repo/service updates; internal app wiring and docs - Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
285 lines
11 KiB
Python
285 lines
11 KiB
Python
"""Internal evaluation REST orchestration: transactions and business rules; data access goes through the repo."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.features.evaluation import repo as eval_repo
|
|
from app.features.evaluation.errors import (
|
|
EvaluationBadRequestError,
|
|
EvaluationNotFoundError,
|
|
)
|
|
from app.features.evaluation.importers.script_json import parse_script_json
|
|
from app.features.evaluation.importers.user_export_markdown import (
|
|
extract_user_utterances_from_export_md,
|
|
)
|
|
from app.features.evaluation.models import (
|
|
EvalCase,
|
|
EvalExperiment,
|
|
EvalGateVerdict,
|
|
EvalRegressionSet,
|
|
EvalRun,
|
|
EvalRunTurn,
|
|
EvalVersion,
|
|
)
|
|
from app.features.evaluation.presenters import run_out
|
|
from app.features.evaluation.schemas import (
|
|
CaseCreate,
|
|
ExperimentCreate,
|
|
ImportJsonCaseBody,
|
|
ImportMarkdownBody,
|
|
RegressionSetCreate,
|
|
SessionEvalRunItem,
|
|
SessionEvalRunsOut,
|
|
SnapshotFromConversationBody,
|
|
VersionCreate,
|
|
)
|
|
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
|
from app.features.evaluation.user_export_fixtures import (
|
|
list_user_export_fixture_names as list_user_export_md_filenames,
|
|
)
|
|
from app.features.evaluation.user_export_fixtures import (
|
|
read_user_export_fixture,
|
|
)
|
|
from app.tasks.evaluation_tasks import run_eval_experiment_task
|
|
|
|
|
|
@dataclass(frozen=True)
class ExperimentDetailBundle:
    """Immutable bundle of everything loaded for one experiment detail view.

    Built by ``EvaluationAdminService.get_experiment_detail``.
    """

    # The experiment row itself.
    experiment: EvalExperiment
    # One ``(run, turns)`` pair per run of the experiment.
    run_rows: list[tuple[EvalRun, list[EvalRunTurn]]]
    # Gate verdict for the experiment; ``None`` when the repo returned none.
    gate: EvalGateVerdict | None
|
|
|
|
|
|
class EvaluationAdminService:
    """Orchestrate internal-evaluation admin operations on one ``AsyncSession``.

    All reads and writes are delegated to ``eval_repo``. This class adds the
    existence checks, input validation and commit/refresh steps around those
    calls, and raises ``EvaluationNotFoundError`` /
    ``EvaluationBadRequestError`` for the failure cases.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Store the session that every repo call of this service is issued on."""
        self._db = db

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _ensure_regression_set(self, set_id: str) -> None:
        """Raise ``EvaluationNotFoundError`` unless the regression set exists."""
        if not await eval_repo.get_regression_set(self._db, set_id):
            raise EvaluationNotFoundError("regression set not found")

    async def _commit_and_refresh(self, row: Any) -> None:
        """Commit the session, then refresh ``row`` from the database."""
        await self._db.commit()
        await self._db.refresh(row)

    async def _runs_with_turns(
        self, runs: list[EvalRun]
    ) -> list[tuple[EvalRun, list[EvalRunTurn]]]:
        """Pair every run with its turns, issuing one ``list_turns`` call per run."""
        pairs: list[tuple[EvalRun, list[EvalRunTurn]]] = []
        for run in runs:
            # Sequential on purpose: all queries share the single session.
            turns = await eval_repo.list_turns(self._db, run.id)
            pairs.append((run, turns))
        return pairs

    # ------------------------------------------------------------------
    # Regression sets and cases
    # ------------------------------------------------------------------

    async def list_regression_sets(self) -> list[EvalRegressionSet]:
        """Return the regression sets reported by the repo."""
        return await eval_repo.list_regression_sets(self._db)

    async def create_regression_set(
        self, body: RegressionSetCreate
    ) -> EvalRegressionSet:
        """Create a regression set from ``body``, commit, and return the refreshed row."""
        row = await eval_repo.create_regression_set(
            self._db, name=body.name, description=body.description
        )
        await self._commit_and_refresh(row)
        return row

    async def list_cases(self, set_id: str) -> list[EvalCase]:
        """Return the cases of a regression set.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._ensure_regression_set(set_id)
        return await eval_repo.list_cases(self._db, set_id)

    async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase:
        """Create a case in regression set ``set_id`` from the fields of ``body``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._ensure_regression_set(set_id)
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=body.user_utterances,
            title=body.title,
            source_conversation_id=body.source_conversation_id,
            source_user_id=body.source_user_id,
            reference_memoir_markdown=body.reference_memoir_markdown,
            is_protected=body.is_protected,
            meta=body.meta,
        )
        await self._commit_and_refresh(row)
        return row

    async def snapshot_from_conversation(
        self,
        set_id: str,
        conversation_id: str,
        body: SnapshotFromConversationBody,
    ) -> EvalCase:
        """Create a case from the user utterances of an existing conversation.

        The transcript is fetched through ``SessionCatalogService``.
        ``body.use_messages`` selects the message-derived utterances; otherwise
        the segment-derived utterances are used.

        Raises:
            EvaluationNotFoundError: the regression set or the conversation
                transcript was not found.
            EvaluationBadRequestError: the selected utterance list is empty.
        """
        await self._ensure_regression_set(set_id)
        transcript = await SessionCatalogService(self._db).get_transcript(
            conversation_id
        )
        if not transcript:
            raise EvaluationNotFoundError("conversation not found")
        if body.use_messages:
            utterances = transcript.user_utterances_from_messages
        else:
            utterances = transcript.user_utterances_from_segments
        if not utterances:
            raise EvaluationBadRequestError("no user utterances in session")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            source_conversation_id=conversation_id,
            source_user_id=transcript.user_id,
            is_protected=body.is_protected,
            meta={"source": "conversation_snapshot", "use_messages": body.use_messages},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_markdown_case(
        self, set_id: str, body: ImportMarkdownBody
    ) -> EvalCase:
        """Create a case from the user utterances parsed out of ``body.markdown``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: the importer returned no utterances.
        """
        await self._ensure_regression_set(set_id)
        utterances = extract_user_utterances_from_export_md(body.markdown)
        if not utterances:
            raise EvaluationBadRequestError("no user lines parsed from markdown")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "markdown_import"},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase:
        """Create a case from explicit utterances or from a raw JSON script.

        A non-empty ``body.utterances`` takes precedence: each entry is
        stringified and stripped, and blank entries are dropped. Otherwise
        ``body.raw_json`` (when not ``None``) is serialized and handed to
        ``parse_script_json``, whose extra metadata is merged into the case
        ``meta``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: neither input was supplied, or no
                utterance remained after parsing/stripping.
        """
        await self._ensure_regression_set(body.regression_set_id)
        meta_extra: dict[str, Any]
        if body.utterances:
            # Strip once per entry, then drop the ones that ended up empty.
            stripped = (str(u).strip() for u in body.utterances)
            utterances = [text for text in stripped if text]
            meta_extra = {}
        elif body.raw_json is not None:
            # parse_script_json is called with a JSON string, so re-serialize.
            payload_str = json.dumps(body.raw_json, ensure_ascii=False)
            utterances, meta_extra = parse_script_json(payload_str)
        else:
            raise EvaluationBadRequestError("utterances or raw_json required")
        if not utterances:
            raise EvaluationBadRequestError("empty utterances")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=body.regression_set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "json_import", **meta_extra},
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Versions
    # ------------------------------------------------------------------

    async def list_versions(self) -> list[EvalVersion]:
        """Return the versions reported by the repo."""
        return await eval_repo.list_versions(self._db)

    async def create_version(self, body: VersionCreate) -> EvalVersion:
        """Create a version from ``body``, commit, and return the refreshed row."""
        row = await eval_repo.create_version(
            self._db,
            name=body.name,
            runner_kind=body.runner_kind,
            config_json=body.config_json,
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Experiments and runs
    # ------------------------------------------------------------------

    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        """Return experiments from the repo, passing ``limit`` through."""
        return await eval_repo.list_experiments(self._db, limit=limit)

    async def list_session_evaluation_runs(
        self, conversation_id: str
    ) -> SessionEvalRunsOut:
        """Return the evaluation runs whose case was sourced from a conversation.

        Each repo row is a ``(run, case, experiment)`` triple; the case is not
        used here. Turns are loaded with one ``list_turns`` call per run.
        """
        rows = await eval_repo.list_runs_for_source_conversation(
            self._db, source_conversation_id=conversation_id
        )
        items: list[SessionEvalRunItem] = []
        for run, _case, exp in rows:
            turns = await eval_repo.list_turns(self._db, run.id)
            items.append(
                SessionEvalRunItem(
                    experiment_name=exp.name,
                    run=run_out(run, turns),
                )
            )
        return SessionEvalRunsOut(conversation_id=conversation_id, items=items)

    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        """Create an experiment comparing a baseline and a candidate version.

        Raises:
            EvaluationNotFoundError: the regression set, the baseline version
                or the candidate version does not exist.
        """
        await self._ensure_regression_set(body.regression_set_id)
        baseline = await eval_repo.get_version(self._db, body.baseline_version_id)
        candidate = await eval_repo.get_version(self._db, body.candidate_version_id)
        if not baseline or not candidate:
            raise EvaluationNotFoundError("version not found")
        row = await eval_repo.create_experiment(
            self._db,
            name=body.name,
            regression_set_id=body.regression_set_id,
            baseline_version_id=body.baseline_version_id,
            candidate_version_id=body.candidate_version_id,
            rubric_pack=body.rubric_pack,
            composite_weights_json=body.composite_weights_json,
        )
        await self._commit_and_refresh(row)
        return row

    async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle:
        """Load an experiment together with its runs, their turns and its gate verdict.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_rows = await self._runs_with_turns(runs)
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return ExperimentDetailBundle(experiment=exp, run_rows=run_rows, gate=gate)

    async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment:
        """Dispatch the experiment run task and return the refreshed experiment.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")
        # NOTE(review): `.delay` looks like a Celery-style background dispatch
        # — confirm; if it blocks on a broker it runs on the event loop here.
        run_eval_experiment_task.delay(experiment_id)
        await self._db.refresh(exp)
        return exp

    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
        """Return a plain-dict snapshot of an experiment's current state.

        The dict has the keys ``experiment_id``, ``status``, ``runs`` (each
        run dumped via ``run_out(...).model_dump()``) and ``gate`` (dumped
        ``GateVerdictOut`` or ``None``). Returns ``None`` instead of raising
        when the experiment does not exist.
        """
        # Function-scope import kept from the original implementation.
        from app.features.evaluation.schemas import GateVerdictOut

        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            return None
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_payload = [
            run_out(run, turns).model_dump()
            for run, turns in await self._runs_with_turns(runs)
        ]
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return {
            "experiment_id": experiment_id,
            "status": exp.status,
            "runs": run_payload,
            "gate": GateVerdictOut.model_validate(gate).model_dump() if gate else None,
        }

    # ------------------------------------------------------------------
    # User-export fixtures
    # ------------------------------------------------------------------

    def list_user_export_fixture_names(self) -> list[str]:
        """Return the available user-export fixture names."""
        return list_user_export_md_filenames()

    def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
        """Return the turns read from one fixture, discarding the second result."""
        turns, _ = read_user_export_fixture(filename)
        return turns
|