feat(evaluation): memoir readiness, judge/replay updates, eval web playground

Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
This commit is contained in:
Kevin
2026-04-08 09:38:07 +08:00
parent 99543d04c6
commit 6772e1269c
26 changed files with 1255 additions and 124 deletions

View File

@@ -14,6 +14,7 @@ from app.features.evaluation.admin_service import EvaluationAdminService
from app.features.evaluation.deps import (
get_eval_judge_manual_service,
get_evaluation_admin_service,
get_memoir_readiness_service,
get_replay_conversation_service,
)
from app.features.evaluation.errors import (
@@ -27,6 +28,7 @@ from app.features.evaluation.importers.user_export_markdown import (
from app.features.evaluation.internal_auth import InternalEvalAuth
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
from app.features.evaluation.presenters import case_out, run_out
from app.features.evaluation.memoir_readiness_service import MemoirReadinessService
from app.features.evaluation.replay_service import ReplayConversationService
from app.features.evaluation.schemas import (
CaseCreate,
@@ -49,6 +51,7 @@ from app.features.evaluation.schemas import (
RegressionSetOut,
ReplayBootstrapBody,
ReplayBootstrapOut,
MemoirPhase1ReadyOut,
ReplayConversationBody,
ReplayConversationOut,
SessionDialogueOut,
@@ -222,6 +225,35 @@ async def get_session_transcript(
)
# Internal-eval route: GET /sessions/{conversation_id}/memoir-phase1-ready
#
# Delegates to MemoirReadinessService.memoir_phase1_ready_for_segments for the
# given conversation and segment ids; the result is serialized through the
# MemoirPhase1ReadyOut response model.
#
# Parameters:
#   conversation_id: path parameter identifying the session.
#   _auth: InternalEvalAuth dependency; unused in the body (presumably it
#       gates access to the route -- confirm in internal_auth).
#   svc: MemoirReadinessService resolved via get_memoir_readiness_service.
#   segment_ids: repeated query parameter; at least one value is required
#       (min_length=1).
#
# Raises: whatever _eval_http_exc builds from an EvaluationNotFoundError or
# EvaluationBadRequestError raised by the service (presumably an HTTP error
# -- confirm against _eval_http_exc).
#
# NOTE: documented with comments rather than a docstring on purpose, since a
# handler docstring would be published as the OpenAPI operation description.
@router.get(
    "/sessions/{conversation_id}/memoir-phase1-ready",
    response_model=MemoirPhase1ReadyOut,
)
async def memoir_phase1_ready(
    conversation_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[MemoirReadinessService, Depends(get_memoir_readiness_service)],
    segment_ids: Annotated[
        list[str],
        Query(
            min_length=1,
            description="本批待检查的 segment id可重复 query 参数 segment_ids=id1&segment_ids=id2",
        ),
    ],
):
    try:
        readiness = await svc.memoir_phase1_ready_for_segments(
            conversation_id=conversation_id,
            segment_ids=segment_ids,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as exc:
        # Both domain errors are translated by the same helper; keep the
        # original exception chained as the cause.
        raise _eval_http_exc(exc) from exc
    return readiness
@router.get(
"/sessions/{conversation_id}/evaluation-runs",
response_model=SessionEvalRunsOut,
@@ -282,9 +314,10 @@ async def replay_conversation(
detail="provide only one of fixture_filename or user_utterances",
)
try:
segment_ids: list[str] = []
if body.fixture_filename:
fn = body.fixture_filename.strip()
n, echo = await replay.replay_fixture(
n, echo, segment_ids = await replay.replay_fixture(
conversation_id=body.conversation_id,
fixture_filename=fn,
flush_memoir_after=body.flush_memoir_after,
@@ -294,7 +327,7 @@ async def replay_conversation(
utt = [str(u) for u in body.user_utterances if str(u).strip()]
if not utt:
raise EvaluationBadRequestError("user_utterances is empty")
n = await replay.replay_utterances(
n, segment_ids = await replay.replay_utterances(
conversation_id=body.conversation_id,
utterances=utt,
flush_memoir_after=body.flush_memoir_after,
@@ -313,6 +346,7 @@ async def replay_conversation(
conversation_id=body.conversation_id,
turns_replayed=n,
utterances_echo=echo,
segment_ids=segment_ids,
)