feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge the baseline excerpt and the library chapter separately; add build_memoir_compare_summary to summarize gate, nine-dimension, and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
Kevin
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions

View File

@@ -10,6 +10,7 @@ from fastapi.responses import StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.db import get_async_db
from app.core.memoir_pipeline_progress import get_pipeline_run_for_eval
from app.features.evaluation.admin_service import EvaluationAdminService
from app.features.evaluation.deps import (
get_eval_judge_manual_service,
@@ -37,6 +38,7 @@ from app.features.evaluation.schemas import (
ManualJudgeMemoirBody,
ManualJudgeMemoirOut,
MemoirPhase1ReadyOut,
MemoirPipelineRunOut,
MemoirSectionBaselineOut,
MemoirSubmitOut,
PlaygroundConversationJudgeOut,
@@ -166,6 +168,42 @@ async def get_playground_conversation_judge(
)
@router.get(
    "/users/{user_id}/memoir-pipeline-run",
    response_model=MemoirPipelineRunOut,
)
async def get_memoir_pipeline_run(
    user_id: str,
    _auth: InternalEvalAuth,
    phase1_task_id: Annotated[
        str | None,
        Query(description="Phase1 Celery task id与 memoir-submit 返回一致)"),
    ] = None,
    memoir_correlation_id: Annotated[
        str | None,
        Query(description="流水线聚合根 ID与日志 memoir_correlation_id 一致)"),
    ] = None,
):
    # Look up a memoir pipeline snapshot for one user and return it validated
    # as MemoirPipelineRunOut.
    #
    # The caller must identify the run with exactly one of `phase1_task_id`
    # or `memoir_correlation_id`; a missing or empty value counts as "not
    # given". Responds 400 when neither or both are given, and 404 when the
    # lookup yields nothing.
    has_task_id = bool(phase1_task_id)
    has_correlation_id = bool(memoir_correlation_id)

    # Equal flags mean the selectors are either both absent or both present.
    if has_task_id == has_correlation_id:
        if has_task_id:
            problem = "provide only one of phase1_task_id or memoir_correlation_id"
        else:
            problem = "provide phase1_task_id or memoir_correlation_id"
        raise HTTPException(status_code=400, detail=problem)

    # Surrounding whitespace in the path parameter is dropped before lookup.
    snapshot = get_pipeline_run_for_eval(
        user_id.strip(),
        memoir_correlation_id=memoir_correlation_id,
        phase1_task_id=phase1_task_id,
    )
    if snapshot:
        return MemoirPipelineRunOut.model_validate(snapshot)
    raise HTTPException(status_code=404, detail="pipeline snapshot not found")
@router.get(
"/sessions/{conversation_id}/memoir-phase1-ready",
response_model=MemoirPhase1ReadyOut,
@@ -412,6 +450,42 @@ async def judge_memoir_chapters_manual(
return ManualJudgeMemoirOut.model_validate(payload)
# Streams memoir chapter judge events to the client as Server-Sent Events
# (`text/event-stream`), one `data: <json>` frame per event.
@router.post("/judge/memoir-chapters-stream")
async def judge_memoir_chapters_stream(
body: ManualJudgeMemoirBody,
_auth: InternalEvalAuth,
judge_svc: Annotated[
EvalJudgeManualService, Depends(get_eval_judge_manual_service)
],
):
# Async generator that produces the SSE frames consumed by StreamingResponse.
async def event_iter():
try:
# Forward each event yielded by the judge service for this user and
# the submitted baseline sections, using the requested provider/model.
async for evt in judge_svc.iter_memoir_chapter_judge_sse(
body.user_id,
body.baseline_sections,
judge_provider=body.judge_provider,
judge_model=body.judge_model,
):
# ensure_ascii=False keeps non-ASCII text unescaped in the JSON payload.
yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n"
except Exception as e:
# Failures are reported in-band as an `error` event (phase "server")
# carrying the exception text.
err = json.dumps(
{"event": "error", "phase": "server", "message": str(e)},
ensure_ascii=False,
)
yield f"data: {err}\n\n"
# NOTE(review): indentation is not visible in this view, so it cannot be
# told whether this `done` frame is sent only after an error or at the end
# of every stream — confirm against the original file.
yield f"data: {json.dumps({'event': 'done'}, ensure_ascii=False)}\n\n"
return StreamingResponse(
event_iter(),
media_type="text/event-stream",
headers={
# Response headers asking intermediaries not to cache and to keep the
# connection open; `X-Accel-Buffering: no` is presumably there to stop
# reverse-proxy (nginx) buffering — TODO confirm deployment setup.
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
},
)
@router.get("/users/{user_id}/memoir-snapshot", response_model=UserMemoirSnapshotOut)
async def get_user_memoir_snapshot(
user_id: str,