feat/ 导出开发容器内的数据用于评估
This commit is contained in:
252
api/app/features/evaluation/admin_service.py
Normal file
252
api/app/features/evaluation/admin_service.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""内部评测 REST 编排:事务与业务规则;数据访问经 repo。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.features.evaluation import repo as eval_repo
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.importers.script_json import parse_script_json
|
||||
from app.features.evaluation.importers.user_export_markdown import (
|
||||
extract_user_utterances_from_export_md,
|
||||
)
|
||||
from app.features.evaluation.models import (
|
||||
EvalCase,
|
||||
EvalExperiment,
|
||||
EvalGateVerdict,
|
||||
EvalRegressionSet,
|
||||
EvalRun,
|
||||
EvalRunTurn,
|
||||
EvalVersion,
|
||||
)
|
||||
from app.features.evaluation.schemas import (
|
||||
CaseCreate,
|
||||
ExperimentCreate,
|
||||
ImportJsonCaseBody,
|
||||
ImportMarkdownBody,
|
||||
RegressionSetCreate,
|
||||
SnapshotFromConversationBody,
|
||||
VersionCreate,
|
||||
)
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.tasks.evaluation_tasks import run_eval_experiment_task
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExperimentDetailBundle:
    """Immutable bundle of the data that makes up one experiment's detail view.

    Instances are produced by ``EvaluationAdminService.get_experiment_detail``.
    """

    # The experiment record itself.
    experiment: EvalExperiment
    # One entry per run of the experiment: the run paired with its turns.
    run_rows: list[tuple[EvalRun, list[EvalRunTurn]]]
    # Gate verdict looked up for the experiment; ``None`` when the repo
    # returned nothing.
    gate: EvalGateVerdict | None
|
||||
|
||||
|
||||
class EvaluationAdminService:
    """Orchestrate the internal evaluation admin operations.

    This service owns existence checks and transaction boundaries
    (``commit`` / ``refresh`` on the injected session); all reads and writes
    are delegated to ``eval_repo``.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Bind the service to the session used for every repo call."""
        self._db = db

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _require_regression_set(self, set_id: str) -> EvalRegressionSet:
        """Return the regression set with ``set_id``.

        Raises:
            EvaluationNotFoundError: if the repo returns a falsy result.
        """
        parent = await eval_repo.get_regression_set(self._db, set_id)
        if not parent:
            raise EvaluationNotFoundError("regression set not found")
        return parent

    async def _require_experiment(self, experiment_id: str) -> EvalExperiment:
        """Return the experiment with ``experiment_id``.

        Raises:
            EvaluationNotFoundError: if the repo returns a falsy result.
        """
        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            raise EvaluationNotFoundError("experiment not found")
        return exp

    async def _commit_and_refresh(self, row: Any) -> None:
        """Commit the current transaction, then reload ``row`` from the DB."""
        await self._db.commit()
        await self._db.refresh(row)

    async def _load_run_rows(
        self, experiment_id: str
    ) -> list[tuple[EvalRun, list[EvalRunTurn]]]:
        """Load every run of the experiment paired with its turns."""
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        run_rows: list[tuple[EvalRun, list[EvalRunTurn]]] = []
        # NOTE(review): this issues one ``list_turns`` query per run; a
        # batched repo call would avoid that if experiments grow many runs.
        for run in runs:
            turns = await eval_repo.list_turns(self._db, run.id)
            run_rows.append((run, turns))
        return run_rows

    @staticmethod
    def _clean_utterances(utterances: list[Any]) -> list[str]:
        """Stringify and strip each utterance, dropping the empty ones."""
        stripped = (str(item).strip() for item in utterances)
        return [text for text in stripped if text]

    # ------------------------------------------------------------------
    # Regression sets and cases
    # ------------------------------------------------------------------

    async def list_regression_sets(self) -> list[EvalRegressionSet]:
        """Return the regression sets as listed by the repo."""
        return await eval_repo.list_regression_sets(self._db)

    async def create_regression_set(
        self, body: RegressionSetCreate
    ) -> EvalRegressionSet:
        """Create a regression set from ``body`` and commit it."""
        row = await eval_repo.create_regression_set(
            self._db, name=body.name, description=body.description
        )
        await self._commit_and_refresh(row)
        return row

    async def list_cases(self, set_id: str) -> list[EvalCase]:
        """Return the cases of a regression set.

        Raises:
            EvaluationNotFoundError: if the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        return await eval_repo.list_cases(self._db, set_id)

    async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase:
        """Create a case in the regression set from an explicit payload.

        Raises:
            EvaluationNotFoundError: if the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=body.user_utterances,
            title=body.title,
            source_conversation_id=body.source_conversation_id,
            source_user_id=body.source_user_id,
            reference_memoir_markdown=body.reference_memoir_markdown,
            is_protected=body.is_protected,
            meta=body.meta,
        )
        await self._commit_and_refresh(row)
        return row

    async def snapshot_from_conversation(
        self,
        set_id: str,
        conversation_id: str,
        body: SnapshotFromConversationBody,
    ) -> EvalCase:
        """Create a case from the user utterances of an existing conversation.

        ``body.use_messages`` selects whether the utterances come from the
        transcript's messages or from its segments.

        Raises:
            EvaluationNotFoundError: if the regression set or the
                conversation transcript does not exist.
            EvaluationBadRequestError: if the selected source has no user
                utterances.
        """
        await self._require_regression_set(set_id)
        catalog = SessionCatalogService(self._db)
        transcript = await catalog.get_transcript(conversation_id)
        if not transcript:
            raise EvaluationNotFoundError("conversation not found")

        if body.use_messages:
            utterances = transcript.user_utterances_from_messages
        else:
            utterances = transcript.user_utterances_from_segments
        if not utterances:
            raise EvaluationBadRequestError("no user utterances in session")

        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            source_conversation_id=conversation_id,
            source_user_id=transcript.user_id,
            is_protected=body.is_protected,
            meta={"source": "conversation_snapshot", "use_messages": body.use_messages},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_markdown_case(
        self, set_id: str, body: ImportMarkdownBody
    ) -> EvalCase:
        """Create a case from user lines parsed out of an exported markdown.

        Raises:
            EvaluationNotFoundError: if the regression set does not exist.
            EvaluationBadRequestError: if no user lines could be parsed.
        """
        await self._require_regression_set(set_id)
        utterances = extract_user_utterances_from_export_md(body.markdown)
        if not utterances:
            raise EvaluationBadRequestError("no user lines parsed from markdown")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "markdown_import"},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase:
        """Create a case from explicit utterances or from a raw JSON script.

        ``body.utterances`` takes precedence when it is non-empty; otherwise
        ``body.raw_json`` is serialized and handed to ``parse_script_json``.

        Raises:
            EvaluationNotFoundError: if the regression set does not exist.
            EvaluationBadRequestError: if neither input is supplied, or the
                resulting utterance list is empty.
        """
        await self._require_regression_set(body.regression_set_id)

        meta_extra: dict[str, Any]
        if body.utterances:
            utt = self._clean_utterances(body.utterances)
            meta_extra = {}
        elif body.raw_json is not None:
            # The parser takes a JSON string, so the already-decoded payload
            # is serialized again before being passed on.
            payload_str = json.dumps(body.raw_json, ensure_ascii=False)
            utt, meta_extra = parse_script_json(payload_str)
        else:
            raise EvaluationBadRequestError("utterances or raw_json required")
        if not utt:
            raise EvaluationBadRequestError("empty utterances")

        row = await eval_repo.create_case(
            self._db,
            regression_set_id=body.regression_set_id,
            user_utterances=utt,
            title=body.title,
            is_protected=body.is_protected,
            # NOTE(review): keys in meta_extra override "source" because they
            # are unpacked last -- confirm this is intended.
            meta={"source": "json_import", **meta_extra},
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Versions
    # ------------------------------------------------------------------

    async def list_versions(self) -> list[EvalVersion]:
        """Return the versions as listed by the repo."""
        return await eval_repo.list_versions(self._db)

    async def create_version(self, body: VersionCreate) -> EvalVersion:
        """Create a version from ``body`` and commit it."""
        row = await eval_repo.create_version(
            self._db,
            name=body.name,
            runner_kind=body.runner_kind,
            config_json=body.config_json,
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Experiments
    # ------------------------------------------------------------------

    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        """Return experiments from the repo, passing ``limit`` through."""
        return await eval_repo.list_experiments(self._db, limit=limit)

    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        """Create an experiment comparing a baseline and a candidate version.

        Raises:
            EvaluationNotFoundError: if the regression set, the baseline
                version or the candidate version does not exist.
        """
        await self._require_regression_set(body.regression_set_id)
        baseline = await eval_repo.get_version(self._db, body.baseline_version_id)
        candidate = await eval_repo.get_version(self._db, body.candidate_version_id)
        if not baseline or not candidate:
            raise EvaluationNotFoundError("version not found")
        row = await eval_repo.create_experiment(
            self._db,
            name=body.name,
            regression_set_id=body.regression_set_id,
            baseline_version_id=body.baseline_version_id,
            candidate_version_id=body.candidate_version_id,
            rubric_pack=body.rubric_pack,
            composite_weights_json=body.composite_weights_json,
        )
        await self._commit_and_refresh(row)
        return row

    async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle:
        """Return the experiment with its runs, their turns and gate verdict.

        Raises:
            EvaluationNotFoundError: if the experiment does not exist.
        """
        exp = await self._require_experiment(experiment_id)
        run_rows = await self._load_run_rows(experiment_id)
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return ExperimentDetailBundle(experiment=exp, run_rows=run_rows, gate=gate)

    async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment:
        """Dispatch the background run task for an experiment.

        The experiment is refreshed after dispatch and returned.

        Raises:
            EvaluationNotFoundError: if the experiment does not exist.
        """
        exp = await self._require_experiment(experiment_id)
        # NOTE(review): ``.delay`` is called synchronously; presumably a
        # Celery-style task enqueue -- confirm it does not block for long.
        run_eval_experiment_task.delay(experiment_id)
        await self._db.refresh(exp)
        return exp

    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
        """Return a plain-dict snapshot of an experiment's current state.

        Unlike ``get_experiment_detail`` this returns ``None`` (instead of
        raising) when the experiment does not exist.

        Returns:
            A dict with keys ``experiment_id``, ``status``, ``runs`` (one
            dumped ``run_out`` model per run) and ``gate`` (dumped
            ``GateVerdictOut`` or ``None``), or ``None`` if not found.
        """
        # Imported at call time; presumably to avoid an import cycle -- TODO
        # confirm.
        from app.features.evaluation.presenters import run_out
        from app.features.evaluation.schemas import GateVerdictOut

        exp = await eval_repo.get_experiment(self._db, experiment_id)
        if not exp:
            return None

        run_rows = await self._load_run_rows(experiment_id)
        run_payload = [run_out(run, turns).model_dump() for run, turns in run_rows]
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return {
            "experiment_id": experiment_id,
            "status": exp.status,
            "runs": run_payload,
            "gate": GateVerdictOut.model_validate(gate).model_dump() if gate else None,
        }
|
||||
Reference in New Issue
Block a user