feat(evaluation): session catalog, user export import, and eval web UI

- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
This commit is contained in:
Kevin
2026-04-06 13:45:04 +08:00
parent b75edacb5f
commit ca8bcc8489
17 changed files with 2062 additions and 296 deletions

View File

@@ -26,16 +26,25 @@ from app.features.evaluation.models import (
EvalRunTurn,
EvalVersion,
)
from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import (
CaseCreate,
ExperimentCreate,
ImportJsonCaseBody,
ImportMarkdownBody,
RegressionSetCreate,
SessionEvalRunItem,
SessionEvalRunsOut,
SnapshotFromConversationBody,
VersionCreate,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import (
list_user_export_fixture_names as list_user_export_md_filenames,
)
from app.features.evaluation.user_export_fixtures import (
read_user_export_fixture,
)
from app.tasks.evaluation_tasks import run_eval_experiment_task
@@ -188,6 +197,23 @@ class EvaluationAdminService:
async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
    """Fetch experiments through the evaluation repository.

    Args:
        limit: Passed through unchanged as the ``limit`` keyword of
            ``eval_repo.list_experiments``.

    Returns:
        Whatever ``eval_repo.list_experiments`` yields for this service's
        database handle.
    """
    experiments = await eval_repo.list_experiments(self._db, limit=limit)
    return experiments
async def list_session_evaluation_runs(
    self, conversation_id: str
) -> SessionEvalRunsOut:
    """Gather the evaluation runs linked to a source conversation.

    Rows come from ``eval_repo.list_runs_for_source_conversation`` and are
    unpacked as ``(run, case, experiment)``; the middle element is not used.
    Turns are loaded with one awaited repository call per run, in row order.

    Args:
        conversation_id: Forwarded as ``source_conversation_id`` to the
            repository and echoed back in the result.

    Returns:
        A ``SessionEvalRunsOut`` holding one ``SessionEvalRunItem`` per row.
    """
    matches = await eval_repo.list_runs_for_source_conversation(
        self._db, source_conversation_id=conversation_id
    )

    entries: list[SessionEvalRunItem] = []
    for eval_run, _case, experiment in matches:
        run_turns = await eval_repo.list_turns(self._db, eval_run.id)
        entry = SessionEvalRunItem(
            experiment_name=experiment.name,
            run=run_out(eval_run, run_turns),
        )
        entries.append(entry)

    return SessionEvalRunsOut(conversation_id=conversation_id, items=entries)
async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
rs = await eval_repo.get_regression_set(self._db, body.regression_set_id)
if not rs:
@@ -232,7 +258,6 @@ class EvaluationAdminService:
async def experiment_stream_snapshot(
self, experiment_id: str
) -> dict[str, Any] | None:
from app.features.evaluation.presenters import run_out
from app.features.evaluation.schemas import GateVerdictOut
exp = await eval_repo.get_experiment(self._db, experiment_id)
@@ -250,3 +275,10 @@ class EvaluationAdminService:
"runs": run_payload,
"gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None,
}
def list_user_export_fixture_names(self) -> list[str]:
    """Return the fixture names reported by the user-export fixtures module.

    Delegates to ``list_user_export_md_filenames`` (the module-level import
    alias) without reading any instance state.
    """
    fixture_names = list_user_export_md_filenames()
    return fixture_names
def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
    """Load one user-export fixture and return only its turns.

    Args:
        filename: Passed unchanged to ``read_user_export_fixture``.

    Returns:
        The first element of the two-element result from
        ``read_user_export_fixture``; the second element is discarded.
    """
    fixture_turns, _unused = read_user_export_fixture(filename)
    return fixture_turns