feat(evaluation): session catalog, user export import, and eval web UI
- Extend evaluation API: schemas, router, repo, admin and execution services
- Improve user export markdown importer; add fixtures and importer tests
- Session catalog repo/service updates; internal app wiring and docs
- Add internal-eval.sh helper; refresh app-eval-web (App, styles, Vite)
This commit is contained in:
@@ -26,16 +26,25 @@ from app.features.evaluation.models import (
|
||||
EvalRunTurn,
|
||||
EvalVersion,
|
||||
)
|
||||
from app.features.evaluation.presenters import run_out
|
||||
from app.features.evaluation.schemas import (
|
||||
CaseCreate,
|
||||
ExperimentCreate,
|
||||
ImportJsonCaseBody,
|
||||
ImportMarkdownBody,
|
||||
RegressionSetCreate,
|
||||
SessionEvalRunItem,
|
||||
SessionEvalRunsOut,
|
||||
SnapshotFromConversationBody,
|
||||
VersionCreate,
|
||||
)
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.features.evaluation.user_export_fixtures import (
|
||||
list_user_export_fixture_names as list_user_export_md_filenames,
|
||||
)
|
||||
from app.features.evaluation.user_export_fixtures import (
|
||||
read_user_export_fixture,
|
||||
)
|
||||
from app.tasks.evaluation_tasks import run_eval_experiment_task
|
||||
|
||||
|
||||
@@ -188,6 +197,23 @@ class EvaluationAdminService:
|
||||
async def list_experiments(self, *, limit: int) -> list[EvalExperiment]:
    """Return experiments fetched from the evaluation repository.

    Delegates to ``eval_repo.list_experiments`` using this service's
    database handle; ``limit`` is forwarded unchanged as a keyword argument.
    """
    experiments = await eval_repo.list_experiments(self._db, limit=limit)
    return experiments
|
||||
|
||||
async def list_session_evaluation_runs(
    self, conversation_id: str
) -> SessionEvalRunsOut:
    """Collect the evaluation runs recorded for one source conversation.

    Looks up the rows whose source conversation matches ``conversation_id``,
    loads the turns of each run, and wraps every run (rendered through
    ``run_out``) together with its experiment name.

    Returns:
        A ``SessionEvalRunsOut`` echoing ``conversation_id`` and listing one
        ``SessionEvalRunItem`` per row, in the order the repository returned
        them.
    """
    matches = await eval_repo.list_runs_for_source_conversation(
        self._db, source_conversation_id=conversation_id
    )

    entries: list[SessionEvalRunItem] = []
    # Each row is a (run, case, experiment) triple; the case is not needed here.
    for eval_run, _case, experiment in matches:
        # Turns are fetched one run at a time, in row order.
        run_turns = await eval_repo.list_turns(self._db, eval_run.id)
        entry = SessionEvalRunItem(
            experiment_name=experiment.name,
            run=run_out(eval_run, run_turns),
        )
        entries.append(entry)

    return SessionEvalRunsOut(conversation_id=conversation_id, items=entries)
|
||||
|
||||
async def create_experiment(self, body: ExperimentCreate) -> EvalExperiment:
|
||||
rs = await eval_repo.get_regression_set(self._db, body.regression_set_id)
|
||||
if not rs:
|
||||
@@ -232,7 +258,6 @@ class EvaluationAdminService:
|
||||
async def experiment_stream_snapshot(
|
||||
self, experiment_id: str
|
||||
) -> dict[str, Any] | None:
|
||||
from app.features.evaluation.presenters import run_out
|
||||
from app.features.evaluation.schemas import GateVerdictOut
|
||||
|
||||
exp = await eval_repo.get_experiment(self._db, experiment_id)
|
||||
@@ -250,3 +275,10 @@ class EvaluationAdminService:
|
||||
"runs": run_payload,
|
||||
"gate": GateVerdictOut.model_validate(gv).model_dump() if gv else None,
|
||||
}
|
||||
|
||||
def list_user_export_fixture_names(self) -> list[str]:
    """Return the names of the available user-export fixtures.

    Thin pass-through to the module-level helper imported as
    ``list_user_export_md_filenames``; no instance state is read.
    """
    fixture_names = list_user_export_md_filenames()
    return fixture_names
|
||||
|
||||
def load_user_export_fixture_turns(self, filename: str) -> list[tuple[str, str]]:
    """Load the turns of the user-export fixture called ``filename``.

    ``read_user_export_fixture`` yields a pair; only its first element (the
    turns) is returned and the second is discarded.
    NOTE(review): each turn is presumably a (role, text) pair — confirm
    against the fixture reader.
    """
    fixture_turns, _unused = read_user_export_fixture(filename)
    return fixture_turns
|
||||
|
||||
Reference in New Issue
Block a user