feat(evaluation): memoir readiness, judge/replay updates, eval web playground
Add memoir_readiness_service and router tests; extend judge schemas/services, replay_service, and conversation rubric; align story route agent, payload, prompts, and story_pipeline_sync; update agent logging, config, and DI. Document internal-eval; add replayDraft util and PlaygroundPage changes in app-eval-web.
This commit is contained in:
@@ -14,6 +14,7 @@ from app.features.evaluation.admin_service import EvaluationAdminService
|
||||
from app.features.evaluation.deps import (
|
||||
get_eval_judge_manual_service,
|
||||
get_evaluation_admin_service,
|
||||
get_memoir_readiness_service,
|
||||
get_replay_conversation_service,
|
||||
)
|
||||
from app.features.evaluation.errors import (
|
||||
@@ -27,6 +28,7 @@ from app.features.evaluation.importers.user_export_markdown import (
|
||||
from app.features.evaluation.internal_auth import InternalEvalAuth
|
||||
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
|
||||
from app.features.evaluation.presenters import case_out, run_out
|
||||
from app.features.evaluation.memoir_readiness_service import MemoirReadinessService
|
||||
from app.features.evaluation.replay_service import ReplayConversationService
|
||||
from app.features.evaluation.schemas import (
|
||||
CaseCreate,
|
||||
@@ -49,6 +51,7 @@ from app.features.evaluation.schemas import (
|
||||
RegressionSetOut,
|
||||
ReplayBootstrapBody,
|
||||
ReplayBootstrapOut,
|
||||
MemoirPhase1ReadyOut,
|
||||
ReplayConversationBody,
|
||||
ReplayConversationOut,
|
||||
SessionDialogueOut,
|
||||
@@ -222,6 +225,35 @@ async def get_session_transcript(
|
||||
)
|
||||
|
||||
|
||||
@router.get(
    "/sessions/{conversation_id}/memoir-phase1-ready",
    response_model=MemoirPhase1ReadyOut,
)
async def memoir_phase1_ready(
    conversation_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[
        MemoirReadinessService,
        Depends(get_memoir_readiness_service),
    ],
    segment_ids: Annotated[
        list[str],
        Query(
            min_length=1,
            description="本批待检查的 segment id,可重复 query 参数 segment_ids=id1&segment_ids=id2",
        ),
    ],
):
    # GET handler that forwards the conversation id and the batch of segment
    # ids to MemoirReadinessService.memoir_phase1_ready_for_segments and
    # returns its result, serialized through MemoirPhase1ReadyOut.
    #
    # Parameters:
    #   conversation_id: path parameter naming the session to inspect.
    #   _auth: unused in the body; presumably enforces internal-eval access
    #       as a dependency -- TODO confirm against InternalEvalAuth.
    #   svc: service instance injected via get_memoir_readiness_service.
    #   segment_ids: segment ids to check in this batch; at least one is
    #       required, passed as repeated query parameters
    #       (segment_ids=id1&segment_ids=id2).
    #
    # NOTE(review): kept as comments rather than a docstring, since a
    # docstring would likely surface as the route description in the
    # generated API schema -- confirm before converting.
    try:
        readiness = await svc.memoir_phase1_ready_for_segments(
            conversation_id=conversation_id,
            segment_ids=segment_ids,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as exc:
        # Both domain errors are translated by the same helper into the
        # exception raised to the client, keeping the original as the cause.
        raise _eval_http_exc(exc) from exc
    return readiness
|
||||
|
||||
|
||||
@router.get(
|
||||
"/sessions/{conversation_id}/evaluation-runs",
|
||||
response_model=SessionEvalRunsOut,
|
||||
@@ -282,9 +314,10 @@ async def replay_conversation(
|
||||
detail="provide only one of fixture_filename or user_utterances",
|
||||
)
|
||||
try:
|
||||
segment_ids: list[str] = []
|
||||
if body.fixture_filename:
|
||||
fn = body.fixture_filename.strip()
|
||||
n, echo = await replay.replay_fixture(
|
||||
n, echo, segment_ids = await replay.replay_fixture(
|
||||
conversation_id=body.conversation_id,
|
||||
fixture_filename=fn,
|
||||
flush_memoir_after=body.flush_memoir_after,
|
||||
@@ -294,7 +327,7 @@ async def replay_conversation(
|
||||
utt = [str(u) for u in body.user_utterances if str(u).strip()]
|
||||
if not utt:
|
||||
raise EvaluationBadRequestError("user_utterances is empty")
|
||||
n = await replay.replay_utterances(
|
||||
n, segment_ids = await replay.replay_utterances(
|
||||
conversation_id=body.conversation_id,
|
||||
utterances=utt,
|
||||
flush_memoir_after=body.flush_memoir_after,
|
||||
@@ -313,6 +346,7 @@ async def replay_conversation(
|
||||
conversation_id=body.conversation_id,
|
||||
turns_replayed=n,
|
||||
utterances_echo=echo,
|
||||
segment_ids=segment_ids,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user