# life-echo/api/app/features/evaluation/admin_service.py

"""内部评测 REST 编排：事务与业务规则；数据访问经 repo。"""

from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any

from sqlalchemy.ext.asyncio import AsyncSession

from app.features.evaluation import repo as eval_repo
from app.features.evaluation.errors import (
    EvaluationBadRequestError,
    EvaluationNotFoundError,
)
from app.features.evaluation.importers.script_json import parse_script_json
from app.features.evaluation.importers.user_export_markdown import (
    extract_user_utterances_from_export_md,
)
from app.features.evaluation.models import (
    EvalCase,
    EvalExperiment,
    EvalGateVerdict,
    EvalRegressionSet,
    EvalRun,
    EvalRunTurn,
    EvalVersion,
)
from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import (
    CaseCreate,
    ExperimentCreate,
    ImportJsonCaseBody,
    ImportMarkdownBody,
    RegressionSetCreate,
    SessionEvalRunItem,
    SessionEvalRunsOut,
    SnapshotFromConversationBody,
    VersionCreate,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import (
    list_user_export_fixture_names as list_user_export_md_filenames,
)
from app.features.evaluation.user_export_fixtures import (
    read_user_export_fixture,
)
from app.tasks.evaluation_tasks import run_eval_experiment_task


@dataclass(frozen=True)
class ExperimentDetailBundle:
    """Immutable result of ``EvaluationAdminService.get_experiment_detail``.

    Groups an experiment with its runs (each paired with that run's turns)
    and the gate verdict looked up for the same experiment.
    """

    # The experiment row that was looked up.
    experiment: EvalExperiment
    # One ``(run, turns)`` pair per run, in the order the runs were listed.
    run_rows: list[tuple[EvalRun, list[EvalRunTurn]]]
    # Gate verdict for the experiment; ``None`` when no verdict is available.
    gate: EvalGateVerdict | None


class EvaluationAdminService:
    """Orchestrate the internal evaluation admin operations on one session.

    Write paths share one shape: validate the request, delegate the insert
    to ``eval_repo``, then commit and refresh the new row. Read paths only
    call ``eval_repo`` and assemble the results.
    """

    def __init__(self, db: AsyncSession) -> None:
        """Store the session that every repo call of this service uses."""
        self._db = db

    # ------------------------------------------------------------------
    # Private helpers shared by the public operations below.
    # ------------------------------------------------------------------

    async def _require_regression_set(self, set_id: str) -> EvalRegressionSet:
        """Return the regression set ``set_id`` or fail if it is missing.

        Raises:
            EvaluationNotFoundError: the repo returned nothing for ``set_id``.
        """
        regression_set = await eval_repo.get_regression_set(self._db, set_id)
        if not regression_set:
            raise EvaluationNotFoundError("regression set not found")
        return regression_set

    async def _commit_and_refresh(self, row: object) -> None:
        """Commit the session, then reload ``row`` from the database."""
        await self._db.commit()
        await self._db.refresh(row)

    async def _runs_with_turns(
        self, experiment_id: str
    ) -> list[tuple[EvalRun, list[EvalRunTurn]]]:
        """Pair every run of ``experiment_id`` with its turns."""
        runs = await eval_repo.list_runs_for_experiment(self._db, experiment_id)
        paired: list[tuple[EvalRun, list[EvalRunTurn]]] = []
        # Turns are fetched one run at a time on the shared session; the
        # queries are deliberately sequential, never gathered concurrently.
        for run in runs:
            turns = await eval_repo.list_turns(self._db, run.id)
            paired.append((run, turns))
        return paired

    # ------------------------------------------------------------------
    # Regression sets and cases.
    # ------------------------------------------------------------------

    async def list_regression_sets(self) -> list[EvalRegressionSet]:
        """Return the regression sets as listed by the repo."""
        return await eval_repo.list_regression_sets(self._db)

    async def create_regression_set(
        self, body: RegressionSetCreate
    ) -> EvalRegressionSet:
        """Create a regression set from ``body`` and return the stored row."""
        row = await eval_repo.create_regression_set(
            self._db, name=body.name, description=body.description
        )
        await self._commit_and_refresh(row)
        return row

    async def list_cases(self, set_id: str) -> list[EvalCase]:
        """Return the cases of regression set ``set_id``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        return await eval_repo.list_cases(self._db, set_id)

    async def create_case(self, set_id: str, body: CaseCreate) -> EvalCase:
        """Create a case in regression set ``set_id`` from an explicit body.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
        """
        await self._require_regression_set(set_id)
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=body.user_utterances,
            title=body.title,
            source_conversation_id=body.source_conversation_id,
            source_user_id=body.source_user_id,
            reference_memoir_markdown=body.reference_memoir_markdown,
            is_protected=body.is_protected,
            meta=body.meta,
        )
        await self._commit_and_refresh(row)
        return row

    async def snapshot_from_conversation(
        self,
        set_id: str,
        conversation_id: str,
        body: SnapshotFromConversationBody,
    ) -> EvalCase:
        """Create a case from the user utterances of an existing conversation.

        ``body.use_messages`` selects which utterance list of the transcript
        is copied: the message-derived one when true, the segment-derived
        one otherwise.

        Raises:
            EvaluationNotFoundError: the regression set or the conversation
                transcript does not exist.
            EvaluationBadRequestError: the selected utterance list is empty.
        """
        await self._require_regression_set(set_id)
        transcript = await SessionCatalogService(self._db).get_transcript(
            conversation_id
        )
        if not transcript:
            raise EvaluationNotFoundError("conversation not found")
        if body.use_messages:
            utterances = transcript.user_utterances_from_messages
        else:
            utterances = transcript.user_utterances_from_segments
        if not utterances:
            raise EvaluationBadRequestError("no user utterances in session")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            source_conversation_id=conversation_id,
            source_user_id=transcript.user_id,
            is_protected=body.is_protected,
            meta={"source": "conversation_snapshot", "use_messages": body.use_messages},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_markdown_case(
        self, set_id: str, body: ImportMarkdownBody
    ) -> EvalCase:
        """Create a case from the user lines parsed out of exported markdown.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: no user utterance could be parsed.
        """
        await self._require_regression_set(set_id)
        utterances = extract_user_utterances_from_export_md(body.markdown)
        if not utterances:
            raise EvaluationBadRequestError("no user lines parsed from markdown")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            meta={"source": "markdown_import"},
        )
        await self._commit_and_refresh(row)
        return row

    async def import_json_case(self, body: ImportJsonCaseBody) -> EvalCase:
        """Create a case from an utterance list or from a raw script payload.

        A non-empty ``body.utterances`` takes precedence; each entry is
        converted to ``str``, stripped, and dropped when blank. Otherwise
        ``body.raw_json`` is serialized and handed to ``parse_script_json``,
        whose extra metadata is merged into the stored ``meta``.

        Raises:
            EvaluationNotFoundError: the regression set does not exist.
            EvaluationBadRequestError: neither input was provided, or no
                utterance remains after parsing/stripping.
        """
        await self._require_regression_set(body.regression_set_id)
        meta_extra: dict[str, Any] = {}
        if body.utterances:
            # Strip each entry once, then keep only the non-blank results.
            stripped = (str(entry).strip() for entry in body.utterances)
            utterances = [text for text in stripped if text]
        elif body.raw_json is not None:
            payload = json.dumps(body.raw_json, ensure_ascii=False)
            utterances, meta_extra = parse_script_json(payload)
        else:
            raise EvaluationBadRequestError("utterances or raw_json required")
        if not utterances:
            raise EvaluationBadRequestError("empty utterances")
        row = await eval_repo.create_case(
            self._db,
            regression_set_id=body.regression_set_id,
            user_utterances=utterances,
            title=body.title,
            is_protected=body.is_protected,
            # Keys coming from the parser are applied after "source".
            meta={"source": "json_import", **meta_extra},
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Versions.
    # ------------------------------------------------------------------

    async def list_versions(self) -> list[EvalVersion]:
        """Return the versions as listed by the repo."""
        return await eval_repo.list_versions(self._db)

    async def create_version(self, body: VersionCreate) -> EvalVersion:
        """Create a version from ``body`` and return the stored row."""
        row = await eval_repo.create_version(
            self._db,
            name=body.name,
            runner_kind=body.runner_kind,
            config_json=body.config_json,
        )
        await self._commit_and_refresh(row)
        return row

    # ------------------------------------------------------------------
    # Experiments and runs.
    # ------------------------------------------------------------------

    async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
        """Return experiments from the repo, passing ``limit`` through."""
        return await eval_repo.list_experiments(self._db, limit=limit)

    async def list_session_evaluation_runs(
        self, conversation_id: str
    ) -> SessionEvalRunsOut:
        """Return the runs whose case was sourced from ``conversation_id``.

        Each item carries the owning experiment's name and the presented run
        (run plus its turns).
        """
        rows = await eval_repo.list_runs_for_source_conversation(
            self._db, source_conversation_id=conversation_id
        )
        items: list[SessionEvalRunItem] = []
        for run, _case, experiment in rows:
            turns = await eval_repo.list_turns(self._db, run.id)
            items.append(
                SessionEvalRunItem(
                    experiment_name=experiment.name,
                    run=run_out(run, turns),
                )
            )
        return SessionEvalRunsOut(conversation_id=conversation_id, items=items)

    async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
        """Create an experiment after checking everything it references.

        Raises:
            EvaluationNotFoundError: the regression set, the baseline
                version, or the candidate version does not exist.
        """
        await self._require_regression_set(body.regression_set_id)
        baseline = await eval_repo.get_version(self._db, body.baseline_version_id)
        candidate = await eval_repo.get_version(self._db, body.candidate_version_id)
        if not baseline or not candidate:
            raise EvaluationNotFoundError("version not found")
        row = await eval_repo.create_experiment(
            self._db,
            name=body.name,
            regression_set_id=body.regression_set_id,
            baseline_version_id=body.baseline_version_id,
            candidate_version_id=body.candidate_version_id,
            rubric_pack=body.rubric_pack,
            composite_weights_json=body.composite_weights_json,
        )
        await self._commit_and_refresh(row)
        return row

    async def get_experiment_detail(self, experiment_id: str) -> ExperimentDetailBundle:
        """Return an experiment with its runs, their turns, and gate verdict.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        experiment = await eval_repo.get_experiment(self._db, experiment_id)
        if not experiment:
            raise EvaluationNotFoundError("experiment not found")
        run_rows = await self._runs_with_turns(experiment_id)
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        return ExperimentDetailBundle(
            experiment=experiment, run_rows=run_rows, gate=gate
        )

    async def enqueue_experiment_run(self, experiment_id: str) -> EvalExperiment:
        """Queue the experiment for execution and return its reloaded row.

        Raises:
            EvaluationNotFoundError: the experiment does not exist.
        """
        experiment = await eval_repo.get_experiment(self._db, experiment_id)
        if not experiment:
            raise EvaluationNotFoundError("experiment not found")
        # NOTE(review): ``.delay`` is called synchronously inside a coroutine.
        # Presumably this is a Celery task publish; confirm that blocking the
        # event loop for the duration of the publish is acceptable.
        run_eval_experiment_task.delay(experiment_id)
        # Reload after enqueueing so the returned row reflects the DB state
        # at that moment rather than the values read before the enqueue.
        await self._db.refresh(experiment)
        return experiment

    async def experiment_stream_snapshot(
        self, experiment_id: str
    ) -> dict[str, Any] | None:
        """Build a plain-dict snapshot of an experiment for streaming.

        Returns:
            ``None`` when the experiment does not exist; otherwise a dict
            with ``experiment_id``, ``status``, ``runs`` (dumped presented
            runs) and ``gate`` (dumped verdict, or ``None`` without one).
        """
        # Function-scope import kept from the original implementation.
        from app.features.evaluation.schemas import GateVerdictOut

        experiment = await eval_repo.get_experiment(self._db, experiment_id)
        if not experiment:
            return None
        run_rows = await self._runs_with_turns(experiment_id)
        run_payload = [run_out(run, turns).model_dump() for run, turns in run_rows]
        gate = await eval_repo.get_gate_verdict(self._db, experiment_id)
        gate_payload = GateVerdictOut.model_validate(gate).model_dump() if gate else None
        return {
            "experiment_id": experiment_id,
            "status": experiment.status,
            "runs": run_payload,
            "gate": gate_payload,
        }

    # ------------------------------------------------------------------
    # User-export fixtures.
    # ------------------------------------------------------------------

    def list_user_export_fixture_names(self) -> list[str]:
        """Return the available user-export fixture file names."""
        return list_user_export_md_filenames()

    def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
        """Return the turns of fixture ``filename``.

        Only the first element of the pair returned by
        ``read_user_export_fixture`` is used; the second is discarded.
        """
        turns, _ = read_user_export_fixture(filename)
        return turns