"""Internal evaluation REST orchestration: transactions and business rules.

Data access goes through the ``repo`` module; this layer decides when to
commit and which domain errors to raise.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.features.evaluation import repo as eval_repo
from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
)
from app.features.evaluation.importers.script_json import parse_script_json
from app.features.evaluation.importers.user_export_markdown import (
    extract_user_utterances_from_export_md,
)
from app.features.evaluation.models import (
    EvalCase,
    EvalExperiment,
    EvalGateVerdict,
    EvalRegressionSet,
    EvalRun,
    EvalRunTurn,
    EvalVersion,
)
from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import (
    CaseCreate,
    ExperimentCreate,
    ImportJsonCaseBody,
    ImportMarkdownBody,
    RegressionSetCreate,
    SessionEvalRunItem,
    SessionEvalRunsOut,
    SnapshotFromConversationBody,
    VersionCreate,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import (
    list_user_export_fixture_names as list_user_export_md_filenames,
)
from app.features.evaluation.user_export_fixtures import (
    read_user_export_fixture,
)
from app.tasks.evaluation_tasks import run_eval_experiment_task


@dataclass(frozen=True)
class ExperimentDetailBundle:
    """Immutable bundle returned by ``get_experiment_detail``."""

    # The experiment row itself.
    experiment: EvalExperiment
    # One entry per run of the experiment, paired with that run's turns.
    run_rows: list[tuple[EvalRun, list[EvalRunTurn]]]
    # Gate verdict for the experiment, or None when the repo returns none.
    gate: EvalGateVerdict | None


class EvaluationAdminService:
    """Admin-facing operations on regression sets, cases, versions and experiments.

    Every write method commits on the session passed to the constructor and
    refreshes the created row before returning it.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Bind the service to the session used for all repo calls."""
        self._db = db

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _ensure_regression_set_exists(self, set_id: str) -> None:
        """Raise ``EvaluationNotFoundError`` unless the regression set exists."""
        parent = await eval_repo.get_regression_set(self._db, set_id)
        if not parent:
            raise EvaluationNotFoundError("regression set not found")

    async def _commit_and_refresh(self, row: Any) -> None:
        """Commit the current transaction, then reload ``row`` from the database."""
        await self._db.commit()
        await self._db.refresh(row)

    async def _runs_with_turns(
        self, runs: list[EvalRun]
    ) -> list[tuple[EvalRun, list[EvalRunTurn]]]:
        """Pair each run with its turns, loading turns one run at a time.

        NOTE(review): this issues one ``list_turns`` call per run; a batched
        repo query would avoid that if run counts grow. ``runs`` is presumably
        a list, but only iteration is required.
        """
        return [
            (run, await eval_repo.list_turns(self._db, run.id)) for run in runs
        ]

    # ------------------------------------------------------------------
    # Regression sets and cases
    # ------------------------------------------------------------------

    async def list_regression_sets(self) -> list[EvalRegressionSet]:
        """Return the regression sets as provided by the repo."""
        return await eval_repo.list_regression_sets(self._db)

    async def create_regression_set(
        self, body: RegressionSetCreate
    ) -> EvalRegressionSet:
        """Create a regression set from ``body`` and commit it."""
        row = await eval_repo.create_regression_set(
            self._db, name=body.name, description=body.description
        )
        await self._commit_and_refresh(row)
        return row

    async def list_cases(self, set_id: str) -> list[EvalCase]:
        """Return the cases of a regression set.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._ensure_regression_set_exists(set_id)
        return await eval_repo.list_cases(self._db, set_id)

    async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase:
        """Create a case in the regression set from ``body`` and commit it.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._ensure_regression_set_exists(set_id)
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=body.user_utterances,
            title=body.title,
            source_conversation_id=body.source_conversation_id,
            source_user_id=body.source_user_id,
            reference_memoir_markdown=body.reference_memoir_markdown,
            is_protected=body.is_protected,
            meta=body.meta,
        )
        await self._commit_and_refresh(row)
        return row

    async def snapshot_from_conversation(
        self,
        set_id: str,
        conversation_id: str,
        body: SnapshotFromConversationBody,
    ) -> EvalCase:
        """Create a case from the user utterances of an existing conversation.

        ``body.use_messages`` selects whether utterances come from the
        transcript's messages or from its segments.

        Raises:
            EvaluationNotFoundError: the regression set or the conversation
                transcript does not exist.
            EvaluationBadRequestError: the selected source has no user
                utterances.
        """
        await self._ensure_regression_set_exists(set_id)

        transcript = await SessionCatalogService(self._db).get_transcript(
            conversation_id
        )
        if not transcript:
            raise EvaluationNotFoundError("conversation not found")

        if body.use_messages:
            utterances = transcript.user_utterances_from_messages
        else:
            utterances = transcript.user_utterances_from_segments
        if not utterances:
            raise EvaluationBadRequestError("no user utterances in session")

        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            source_conversation_id=conversation_id,
            source_user_id=transcript.user_id,
            is_protected=body.is_protected,
            meta={
                "source": "conversation_snapshot",
                "use_messages": body.use_messages,
            },
        )
        await self._commit_and_refresh(row)
        return row

    async def import_markdown_case(
        self, set_id: str, body: ImportMarkdownBody
    ) -> EvalCase:
        """Create a case from user lines extracted out of exported markdown.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: no user lines could be parsed.
        """
        await self._ensure_regression_set_exists(set_id)

        utterances = extract_user_utterances_from_export_md(body.markdown)
        if not utterances:
            raise EvaluationBadRequestError("no user lines parsed from markdown")

        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "markdown_import"},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase:
        """Create a case from explicit utterances or from a raw JSON script.

        A non-empty ``body.utterances`` takes precedence; otherwise
        ``body.raw_json`` is serialised and handed to ``parse_script_json``,
        whose extra metadata is merged into the case's ``meta``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: neither input was supplied, or no
                non-blank utterance remains.
        """
        await self._ensure_regression_set_exists(body.regression_set_id)

        meta_extra: dict[str, Any]
        if body.utterances:
            # Strip each entry once and drop those that end up blank.
            stripped = (str(u).strip() for u in body.utterances)
            utt = [text for text in stripped if text]
            meta_extra = {}
        elif body.raw_json is not None:
            payload_str = json.dumps(body.raw_json, ensure_ascii=False)
            utt, meta_extra = parse_script_json(payload_str)
        else:
            raise EvaluationBadRequestError("utterances or raw_json required")

        if not utt:
            raise EvaluationBadRequestError("empty utterances")

        row = await eval_repo.create_case(
            self._db,
            regression_set_id=body.regression_set_id,
            user_utterances=utt,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "json_import", **meta_extra},
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Versions
    # ------------------------------------------------------------------

    async def list_versions(self) -> list[EvalVersion]:
        """Return the versions as provided by the repo."""
        return await eval_repo.list_versions(self._db)

    async def create_version(self, body: VersionCreate) -> EvalVersion:
        """Create a version from ``body`` and commit it."""
        row = await eval_repo.create_version(
            self._db,
            name=body.name,
            runner_kind=body.runner_kind,
            config_json=body.config_json,
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Experiments and runs
    # ------------------------------------------------------------------

    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        """Return experiments from the repo, passing ``limit`` through."""
        return await eval_repo.list_experiments(self._db, limit=limit)

    async def list_session_evaluation_runs(
        self, conversation_id: str
    ) -> SessionEvalRunsOut:
        """Return the evaluation runs whose case originated from a conversation.

        Each item carries the owning experiment's name and the presented run
        including its turns.
        """
        rows = await eval_repo.list_runs_for_source_conversation(
            self._db, source_conversation_id=conversation_id
        )
        items: list[SessionEvalRunItem] = []
        # Each row is a (run, case, experiment) triple; the case is unused here.
        for run, _case, exp in rows:
            turns = await eval_repo.list_turns(self._db, run.id)
            items.append(
                SessionEvalRunItem(
                    experiment_name=exp.name,
                    run=run_out(run, turns),
                )
            )
        return SessionEvalRunsOut(conversation_id=conversation_id, items=items)

    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        """Create an experiment comparing a baseline and a candidate version.

        Raises:
            EvaluationNotFoundError: the regression set, or either of the two
                versions, does not exist.
        """
        await self._ensure_regression_set_exists(body.regression_set_id)

        baseline = await eval_repo.get_version(self._db, body.baseline_version_id)
        candidate = await eval_repo.get_version(
            self._db, body.candidate_version_id
        )
        if not baseline or not candidate:
            raise EvaluationNotFoundError("version not found")

        row = await eval_repo.create_experiment(
            self._db,
            name=body.name,
            regression_set_id=body.regression_set_id,
            baseline_version_id=body.baseline_version_id,
            candidate_version_id=body.candidate_version_id,
            rubric_pack=body.rubric_pack,
            composite_weights_json=body.composite_weights_json,
        )
        await self._commit_and_refresh(row)
        return row

    async def get_experiment_detail(
        self, experiment_id: str
    ) -> ExperimentDetailBundle:
        """Return an experiment with its runs, their turns and its gate verdict.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")

        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_rows = await self._runs_with_turns(runs)
        verdict = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return ExperimentDetailBundle(
            experiment=exp, run_rows=run_rows, gate=verdict
        )

    async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment:
        """Dispatch ``run_eval_experiment_task`` for the experiment.

        The experiment row is refreshed after dispatch and returned.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")

        run_eval_experiment_task.delay(experiment_id)
        await self._db.refresh(exp)
        return exp

    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
        """Return a plain-dict snapshot of an experiment's current state.

        Unlike the other lookups in this class, a missing experiment yields
        ``None`` instead of raising.

        Returns:
            A dict with keys ``experiment_id``, ``status``, ``runs`` and
            ``gate``, or ``None`` when the experiment does not exist.
        """
        # NOTE(review): imported locally although other names from this
        # module are imported at file level - confirm whether it can move up.
        from app.features.evaluation.schemas import GateVerdictOut

        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            return None

        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_payload = [
            run_out(run, turns).model_dump()
            for run, turns in await self._runs_with_turns(runs)
        ]
        verdict = await eval_repo.get_gate_verdict(self._db, experiment_id)
        gate_payload = (
            GateVerdictOut.model_validate(verdict).model_dump()
            if verdict
            else None
        )
        return {
            "experiment_id": experiment_id,
            "status": exp.status,
            "runs": run_payload,
            "gate": gate_payload,
        }

    # ------------------------------------------------------------------
    # User-export fixtures
    # ------------------------------------------------------------------

    def list_user_export_fixture_names(self) -> list[str]:
        """Return the available user-export fixture names."""
        return list_user_export_md_filenames()

    def load_user_export_fixture_turns(
        self, filename: str
    ) -> list[tuple[str, str]]:
        """Return the turns of a fixture, discarding the reader's second value."""
        turns, _ = read_user_export_fixture(filename)
        return turns