refactor(eval+memoir):精简内部评测路由与服务,composite/对话摘要与 judge 能力补强

- 访谈:新增 interview_state_hints,联动 orchestrator 与提示词
- 回忆录:story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建:开发用 celery broker、compose/development 脚本、依赖注入
- eval-web:移除数据集/实验/版本等页面与流式轮询,突出 Playground
- 文档与单测同步
This commit is contained in:
Kevin
2026-04-08 21:36:12 +08:00
parent 2a0c80987d
commit 064ad2161d
64 changed files with 3412 additions and 3068 deletions

View File

@@ -28,45 +28,32 @@ from app.features.evaluation.importers.user_export_markdown import (
from app.features.evaluation.internal_auth import InternalEvalAuth
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
from app.features.evaluation.memoir_readiness_service import MemoirReadinessService
from app.features.evaluation.presenters import case_out, run_out
from app.features.evaluation.replay_service import ReplayConversationService
from app.features.evaluation.schemas import (
CaseCreate,
CaseOut,
EvalRunOut,
EvalSandboxOut,
ExperimentCreate,
ExperimentDetailOut,
ExperimentOut,
GateVerdictOut,
ImportJsonCaseBody,
ImportMarkdownBody,
ManualJudgeConversationBody,
ManualJudgeConversationOut,
ManualJudgeConversationStreamBody,
ManualJudgeMemoirBody,
ManualJudgeMemoirOut,
PlaygroundConversationJudgeOut,
MemoirPhase1ReadyOut,
MemoirSectionBaselineOut,
RegressionSetCreate,
RegressionSetOut,
MemoirSubmitOut,
PlaygroundConversationJudgeOut,
ReplayBootstrapBody,
ReplayBootstrapOut,
ReplayConversationBody,
ReplayConversationOut,
RetryBaselineJudgeBody,
RetryBaselineJudgeOut,
SessionDialogueOut,
SessionEvalRunsOut,
SessionListItem,
SessionListResponse,
SessionTranscriptOut,
SnapshotFromConversationBody,
UserExportFixtureDetailOut,
UserExportFixtureListOut,
UserExportFixtureTurnOut,
UserMemoirSnapshotOut,
VersionCreate,
VersionOut,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
@@ -88,73 +75,6 @@ def _eval_http_exc(
return HTTPException(status_code=400, detail=e.detail)
# GET /regression-sets
# Fetches the rows returned by svc.list_regression_sets() and validates each
# one into the RegressionSetOut response schema.
# `_auth` is not used in the body; presumably it is declared only so the
# InternalEvalAuth dependency runs for this route -- TODO confirm.
@router.get("/regression-sets", response_model=list[RegressionSetOut])
async def list_regression_sets(
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    regression_sets = await svc.list_regression_sets()
    return [RegressionSetOut.model_validate(item) for item in regression_sets]
# POST /regression-sets
# Passes the request body to svc.create_regression_set() and returns the
# created row validated into RegressionSetOut.
# EvaluationNotFoundError / EvaluationBadRequestError raised by the service
# are converted with _eval_http_exc and re-raised, chained to the original.
@router.post("/regression-sets", response_model=RegressionSetOut)
async def create_regression_set(
    body: RegressionSetCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        created = await svc.create_regression_set(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        # Validation happens outside the guarded call, so its own errors are
        # never routed through _eval_http_exc.
        return RegressionSetOut.model_validate(created)
# GET /regression-sets/{set_id}/cases
# Asks the service for the cases of `set_id` and converts each row with
# case_out(). Only EvaluationNotFoundError is translated via _eval_http_exc.
@router.get("/regression-sets/{set_id}/cases", response_model=list[CaseOut])
async def list_cases(
    set_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        case_rows = await svc.list_cases(set_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err
    else:
        return [case_out(case_row) for case_row in case_rows]
# POST /regression-sets/{set_id}/cases
# Creates a case under `set_id` from the request body through
# svc.create_case() and returns it converted by case_out().
# Not-found and bad-request service errors are translated by _eval_http_exc.
@router.post("/regression-sets/{set_id}/cases", response_model=CaseOut)
async def create_case(
    set_id: str,
    body: CaseCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        new_case = await svc.create_case(set_id, body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(new_case)
# POST /regression-sets/{set_id}/snapshot-from-conversation/{conversation_id}
# Forwards both path parameters and the request body to
# svc.snapshot_from_conversation() and returns the resulting row converted by
# case_out(). Not-found and bad-request service errors are translated by
# _eval_http_exc.
@router.post(
    "/regression-sets/{set_id}/snapshot-from-conversation/{conversation_id}",
    response_model=CaseOut,
)
async def snapshot_from_conversation(
    set_id: str,
    conversation_id: str,
    body: SnapshotFromConversationBody,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        snapshot_case = await svc.snapshot_from_conversation(
            set_id,
            conversation_id,
            body,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(snapshot_case)
@router.get("/sessions", response_model=SessionListResponse)
async def list_sessions(
_auth: InternalEvalAuth,
@@ -275,16 +195,25 @@ async def memoir_phase1_ready(
raise _eval_http_exc(e) from e
@router.get(
"/sessions/{conversation_id}/evaluation-runs",
response_model=SessionEvalRunsOut,
@router.post(
"/sessions/{conversation_id}/memoir-submit",
response_model=MemoirSubmitOut,
)
async def list_session_evaluation_runs(
async def memoir_submit_phase1(
conversation_id: str,
_auth: InternalEvalAuth,
svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
svc: Annotated[
MemoirReadinessService, Depends(get_memoir_readiness_service)
],
):
return await svc.list_session_evaluation_runs(conversation_id)
try:
return await svc.submit_memoir_phase1_for_conversation(
conversation_id=conversation_id,
)
except EvaluationNotFoundError as e:
raise _eval_http_exc(e) from e
except EvaluationBadRequestError as e:
raise _eval_http_exc(e) from e
@router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut)
@@ -342,6 +271,7 @@ async def replay_conversation(
conversation_id=body.conversation_id,
fixture_filename=fn,
flush_memoir_after=body.flush_memoir_after,
skip_memoir=body.skip_memoir,
skip_tts=body.skip_tts,
)
elif body.user_utterances is not None:
@@ -352,6 +282,7 @@ async def replay_conversation(
conversation_id=body.conversation_id,
utterances=utt,
flush_memoir_after=body.flush_memoir_after,
skip_memoir=body.skip_memoir,
skip_tts=body.skip_tts,
)
echo = utt
@@ -383,6 +314,8 @@ async def judge_conversation_manual(
payload = await judge_svc.judge_conversation(
body.conversation_id,
body.fixture_filename,
judge_provider=body.judge_provider,
judge_model=body.judge_model,
)
except EvaluationNotFoundError as e:
raise _eval_http_exc(e) from e
@@ -406,6 +339,8 @@ async def judge_conversation_manual_stream(
body.fixture_filename,
include_turn_judges=body.include_turn_judges,
include_baseline_turn_judges=body.include_baseline_turn_judges,
judge_provider=body.judge_provider,
judge_model=body.judge_model,
):
yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n"
except Exception as e:
@@ -427,6 +362,32 @@ async def judge_conversation_manual_stream(
)
# POST /judge/conversation-retry-baseline
# Calls judge_svc.retry_baseline_conversation_judge() with the conversation
# id, fixture filename, baseline-turn flag and judge provider/model taken
# from the request body, then validates the payload into
# RetryBaselineJudgeOut.
@router.post(
    "/judge/conversation-retry-baseline",
    response_model=RetryBaselineJudgeOut,
)
async def retry_baseline_conversation_judge(
    body: RetryBaselineJudgeBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    try:
        result = await judge_svc.retry_baseline_conversation_judge(
            body.conversation_id,
            body.fixture_filename,
            include_baseline_turn_judges=body.include_baseline_turn_judges,
            judge_provider=body.judge_provider,
            judge_model=body.judge_model,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        # Both error kinds receive identical treatment, so one clause is
        # enough.
        raise _eval_http_exc(err) from err
    else:
        return RetryBaselineJudgeOut.model_validate(result)
@router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut)
async def judge_memoir_chapters_manual(
body: ManualJudgeMemoirBody,
@@ -439,6 +400,8 @@ async def judge_memoir_chapters_manual(
payload = await judge_svc.judge_memoir_for_user(
body.user_id,
body.baseline_sections,
judge_provider=body.judge_provider,
judge_model=body.judge_model,
)
except EvaluationBadRequestError as e:
raise _eval_http_exc(e) from e
@@ -496,107 +459,3 @@ async def get_user_export_fixture(
MemoirSectionBaselineOut(title=t, body=b) for t, b in memoir_tuples
],
)
# POST /regression-sets/{set_id}/import-markdown
# Hands `set_id` and the request body to svc.import_markdown_case() and
# returns the resulting row converted by case_out(). Not-found and
# bad-request service errors are translated by _eval_http_exc.
@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut)
async def import_markdown_case(
    set_id: str,
    body: ImportMarkdownBody,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        imported = await svc.import_markdown_case(set_id, body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(imported)
# POST /import/json-case
# Hands the request body to svc.import_json_case() and returns the resulting
# row converted by case_out(). Not-found and bad-request service errors are
# translated by _eval_http_exc.
@router.post("/import/json-case", response_model=CaseOut)
async def import_json_case(
    body: ImportJsonCaseBody,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        imported = await svc.import_json_case(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(imported)
# GET /versions
# Fetches the rows returned by svc.list_versions() and validates each one
# into the VersionOut response schema.
@router.get("/versions", response_model=list[VersionOut])
async def list_versions(
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    version_rows = await svc.list_versions()
    return [VersionOut.model_validate(version) for version in version_rows]
# POST /versions
# Passes the request body to svc.create_version() and returns the created
# row validated into VersionOut. Not-found and bad-request service errors
# are translated by _eval_http_exc.
@router.post("/versions", response_model=VersionOut)
async def create_version(
    body: VersionCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        created = await svc.create_version(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return VersionOut.model_validate(created)
# GET /experiments
# Returns the rows from svc.list_experiments(limit=...), each validated into
# ExperimentOut. `limit` is a query parameter defaulting to 50 and
# constrained to the range 1..200 by the Query() declaration.
@router.get("/experiments", response_model=list[ExperimentOut])
async def list_experiments(
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
    limit: int = Query(50, ge=1, le=200),
):
    experiment_rows = await svc.list_experiments(limit=limit)
    return [ExperimentOut.model_validate(exp) for exp in experiment_rows]
# POST /experiments
# Passes the request body to svc.create_experiment() and returns the created
# row validated into ExperimentOut. Not-found and bad-request service errors
# are translated by _eval_http_exc.
@router.post("/experiments", response_model=ExperimentOut)
async def create_experiment(
    body: ExperimentCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        created = await svc.create_experiment(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return ExperimentOut.model_validate(created)
# GET /experiments/{experiment_id}
# Loads the detail bundle for one experiment and assembles an
# ExperimentDetailOut from its three parts: the experiment row, the
# (run, turns) pairs in bundle.run_rows, and the optional gate verdict.
# Only EvaluationNotFoundError is translated via _eval_http_exc.
@router.get("/experiments/{experiment_id}", response_model=ExperimentDetailOut)
async def get_experiment_detail(
    experiment_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        bundle = await svc.get_experiment_detail(experiment_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err

    # Each entry of run_rows unpacks to a run row plus its turns.
    serialized_runs: list[EvalRunOut] = [
        run_out(run_row, turn_rows) for run_row, turn_rows in bundle.run_rows
    ]

    # A falsy gate is reported as None rather than being validated.
    if bundle.gate:
        gate_verdict = GateVerdictOut.model_validate(bundle.gate)
    else:
        gate_verdict = None

    return ExperimentDetailOut(
        experiment=ExperimentOut.model_validate(bundle.experiment),
        runs=serialized_runs,
        gate=gate_verdict,
    )
# POST /experiments/{experiment_id}/run
# Calls svc.enqueue_experiment_run() for the given experiment and returns
# the row it hands back, validated into ExperimentOut.
# Only EvaluationNotFoundError is translated via _eval_http_exc.
@router.post("/experiments/{experiment_id}/run", response_model=ExperimentOut)
async def enqueue_experiment_run(
    experiment_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[
        EvaluationAdminService, Depends(get_evaluation_admin_service)
    ],
):
    try:
        experiment = await svc.enqueue_experiment_run(experiment_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err
    else:
        return ExperimentOut.model_validate(experiment)