Files
life-echo/api/app/features/evaluation/router.py
Kevin 78b61c076e feat(eval): Playground GLM 评分落库并可恢复
在 conversations 表增加 playground_conversation_judge_json,流式/非流式对话评审结束后写入最近一次快照(整体分、逐轮分、对比文案、错误与基线文件名等)。新增只读 GET 供前端按会话拉取;评测台 Playground 切换会话时自动恢复,并提示基线是否和当时一致。
2026-04-08 16:51:08 +08:00

603 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""内部评测 REST API。"""
from __future__ import annotations
import json
from typing import Annotated
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.db import get_async_db
from app.features.evaluation.admin_service import EvaluationAdminService
from app.features.evaluation.deps import (
get_eval_judge_manual_service,
get_evaluation_admin_service,
get_memoir_readiness_service,
get_replay_conversation_service,
)
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.importers.user_export_markdown import (
extract_memoir_chapter_sections_from_export_md,
extract_source_user_id_from_export_md,
)
from app.features.evaluation.internal_auth import InternalEvalAuth
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
from app.features.evaluation.memoir_readiness_service import MemoirReadinessService
from app.features.evaluation.presenters import case_out, run_out
from app.features.evaluation.replay_service import ReplayConversationService
from app.features.evaluation.schemas import (
CaseCreate,
CaseOut,
EvalRunOut,
EvalSandboxOut,
ExperimentCreate,
ExperimentDetailOut,
ExperimentOut,
GateVerdictOut,
ImportJsonCaseBody,
ImportMarkdownBody,
ManualJudgeConversationBody,
ManualJudgeConversationOut,
ManualJudgeConversationStreamBody,
ManualJudgeMemoirBody,
ManualJudgeMemoirOut,
PlaygroundConversationJudgeOut,
MemoirPhase1ReadyOut,
MemoirSectionBaselineOut,
RegressionSetCreate,
RegressionSetOut,
ReplayBootstrapBody,
ReplayBootstrapOut,
ReplayConversationBody,
ReplayConversationOut,
SessionDialogueOut,
SessionEvalRunsOut,
SessionListItem,
SessionListResponse,
SessionTranscriptOut,
SnapshotFromConversationBody,
UserExportFixtureDetailOut,
UserExportFixtureListOut,
UserExportFixtureTurnOut,
UserMemoirSnapshotOut,
VersionCreate,
VersionOut,
)
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
# Router that collects every endpoint defined in this module; all of them are
# tagged "internal-evaluation".
router = APIRouter(tags=["internal-evaluation"])
@router.get("/ping", include_in_schema=False)
async def eval_api_ping() -> dict[str, str | bool]:
    """Unauthenticated probe for the internal evaluation API.

    Lets a caller confirm that the running process is internal_main and that
    this router is mounted. The route is excluded from the OpenAPI schema.
    """
    return dict(ok=True, service="life-echo-internal-eval")
def _eval_http_exc(
    e: EvaluationNotFoundError | EvaluationBadRequestError,
) -> HTTPException:
    """Translate an evaluation-domain error into an HTTPException.

    An EvaluationNotFoundError maps to status 404; any other error passed in
    maps to status 400. The error's ``detail`` is forwarded unchanged.

    The exception is returned rather than raised so that call sites can write
    ``raise _eval_http_exc(e) from e`` and keep the original cause chained.
    """
    status_code = 404 if isinstance(e, EvaluationNotFoundError) else 400
    return HTTPException(status_code=status_code, detail=e.detail)
@router.get("/regression-sets", response_model=list[RegressionSetOut])
async def list_regression_sets(
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return every regression set known to the admin service.

    Each row returned by the service is validated into a RegressionSetOut.
    """
    regression_sets = await svc.list_regression_sets()
    return list(map(RegressionSetOut.model_validate, regression_sets))
@router.post("/regression-sets", response_model=RegressionSetOut)
async def create_regression_set(
    body: RegressionSetCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Create a regression set from the request body.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        created = await svc.create_regression_set(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return RegressionSetOut.model_validate(created)
@router.get("/regression-sets/{set_id}/cases", response_model=list[CaseOut])
async def list_cases(
    set_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """List the cases of one regression set, presented via ``case_out``.

    Raises:
        HTTPException: 404 when the service raises EvaluationNotFoundError.
    """
    try:
        case_rows = await svc.list_cases(set_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err
    return list(map(case_out, case_rows))
@router.post("/regression-sets/{set_id}/cases", response_model=CaseOut)
async def create_case(
    set_id: str,
    body: CaseCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Create a case inside the regression set identified by ``set_id``.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        new_case = await svc.create_case(set_id, body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(new_case)
@router.post(
    "/regression-sets/{set_id}/snapshot-from-conversation/{conversation_id}",
    response_model=CaseOut,
)
async def snapshot_from_conversation(
    set_id: str,
    conversation_id: str,
    body: SnapshotFromConversationBody,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Snapshot a conversation into a regression set via the admin service.

    The row returned by the service is presented through ``case_out``.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        snapshot_row = await svc.snapshot_from_conversation(
            set_id, conversation_id, body
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(snapshot_row)
@router.get("/sessions", response_model=SessionListResponse)
async def list_sessions(
    _auth: InternalEvalAuth,
    db: Annotated[AsyncSession, Depends(get_async_db)],
    offset: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
    user_id: str | None = Query(None),
    q: str | None = Query(None),
    status: str | None = Query(
        None,
        description="按会话 status 过滤,如 active",
    ),
):
    """Return one page of sessions plus the total count.

    Paging (``offset``/``limit``) and the optional ``user_id``, ``q`` and
    ``status`` filters are forwarded unchanged to
    ``SessionCatalogService.list_sessions``.
    """
    page_rows, total_count = await SessionCatalogService(db).list_sessions(
        offset=offset, limit=limit, user_id=user_id, q=q, status=status
    )
    # Attributes copied one-to-one from each catalog row onto the response item.
    copied_fields = (
        "id",
        "user_id",
        "user_phone",
        "started_at",
        "last_message_at",
        "conversation_stage",
        "current_topic",
        "status",
    )
    items = [
        SessionListItem(**{name: getattr(row, name) for name in copied_fields})
        for row in page_rows
    ]
    return SessionListResponse(items=items, total=total_count)
@router.get(
    "/sessions/{conversation_id}/dialogue",
    response_model=SessionDialogueOut,
)
async def get_session_dialogue(
    conversation_id: str,
    _auth: InternalEvalAuth,
    db: Annotated[AsyncSession, Depends(get_async_db)],
):
    """Return the dialogue of one conversation from the session catalog.

    Raises:
        HTTPException: 404 when the catalog returns a falsy result.
    """
    dialogue = await SessionCatalogService(db).get_session_dialogue(conversation_id)
    if dialogue:
        return dialogue
    raise HTTPException(status_code=404, detail="conversation not found")
@router.get(
    "/sessions/{conversation_id}/transcript", response_model=SessionTranscriptOut
)
async def get_session_transcript(
    conversation_id: str,
    _auth: InternalEvalAuth,
    db: Annotated[AsyncSession, Depends(get_async_db)],
):
    """Return the user utterances recorded for one conversation.

    The response carries both utterance lists exposed by the catalog
    transcript: the one built from segments and the one built from messages.

    Raises:
        HTTPException: 404 when the catalog returns a falsy transcript.
    """
    transcript = await SessionCatalogService(db).get_transcript(conversation_id)
    if transcript:
        return SessionTranscriptOut(
            conversation_id=transcript.conversation_id,
            user_id=transcript.user_id,
            user_utterances_from_segments=transcript.user_utterances_from_segments,
            user_utterances_from_messages=transcript.user_utterances_from_messages,
        )
    raise HTTPException(status_code=404, detail="conversation not found")
@router.get(
    "/sessions/{conversation_id}/playground-conversation-judge",
    response_model=PlaygroundConversationJudgeOut,
)
async def get_playground_conversation_judge(
    conversation_id: str,
    _auth: InternalEvalAuth,
    db: Annotated[AsyncSession, Depends(get_async_db)],
):
    """Return the stored playground judge JSON for one conversation.

    Raises:
        HTTPException: 404 when the catalog has no transcript for the
            conversation.
    """
    catalog = SessionCatalogService(db)
    # The transcript itself is not used; the lookup only serves as an
    # existence check before reading the judge JSON.
    transcript = await catalog.get_transcript(conversation_id)
    if not transcript:
        raise HTTPException(status_code=404, detail="conversation not found")

    stored_judge = await catalog.get_playground_conversation_judge_json(
        conversation_id
    )
    return PlaygroundConversationJudgeOut(
        conversation_id=conversation_id, judge=stored_judge
    )
@router.get(
    "/sessions/{conversation_id}/memoir-phase1-ready",
    response_model=MemoirPhase1ReadyOut,
)
async def memoir_phase1_ready(
    conversation_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[
        MemoirReadinessService, Depends(get_memoir_readiness_service)
    ],
    segment_ids: Annotated[
        list[str],
        Query(
            min_length=1,
            description="本批待检查的 segment id可重复 query 参数 segment_ids=id1&segment_ids=id2",
        ),
    ],
):
    """Return the memoir phase-1 readiness result for a batch of segments.

    ``segment_ids`` is a repeatable query parameter and must contain at least
    one id. The service result is returned as-is.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        readiness = await svc.memoir_phase1_ready_for_segments(
            conversation_id=conversation_id,
            segment_ids=segment_ids,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    return readiness
@router.get(
    "/sessions/{conversation_id}/evaluation-runs",
    response_model=SessionEvalRunsOut,
)
async def list_session_evaluation_runs(
    conversation_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return the admin service's evaluation-run listing for one conversation.

    The service result is passed through unchanged; no service errors are
    translated here.
    """
    session_runs = await svc.list_session_evaluation_runs(conversation_id)
    return session_runs
@router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut)
async def replay_bootstrap(
    body: ReplayBootstrapBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Bootstrap a replay conversation for ``body.user_id`` and return its id.

    Raises:
        HTTPException: 400 when the service raises EvaluationBadRequestError.
    """
    try:
        new_conversation_id = await replay.bootstrap_conversation(body.user_id)
    except EvaluationBadRequestError as err:
        raise _eval_http_exc(err) from err
    else:
        return ReplayBootstrapOut(conversation_id=new_conversation_id)
@router.post("/sessions/eval-sandbox", response_model=EvalSandboxOut)
async def create_eval_sandbox(
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Create an evaluation sandbox through the replay service.

    The service returns a 4-tuple that is mapped, in order, onto the
    ``user_id``, ``conversation_id``, ``phone`` and ``nickname`` response
    fields.

    Raises:
        HTTPException: 400 when the service raises EvaluationBadRequestError.
    """
    try:
        sandbox = await replay.create_eval_sandbox()
        user_id, conversation_id, phone, nickname = sandbox
    except EvaluationBadRequestError as err:
        raise _eval_http_exc(err) from err
    return EvalSandboxOut(
        user_id=user_id,
        conversation_id=conversation_id,
        phone=phone,
        nickname=nickname,
    )
@router.post("/replay/conversation", response_model=ReplayConversationOut)
async def replay_conversation(
    body: ReplayConversationBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Replay user utterances into a conversation from exactly one source.

    The source is either a fixture file (``body.fixture_filename``, stripped
    of surrounding whitespace) or an explicit ``body.user_utterances`` list
    (blank entries dropped). Supplying both, supplying neither, or supplying
    only blank utterances is rejected with a 400.

    Raises:
        HTTPException: 400 for the input problems above, and 404 / 400 when
            the replay service raises EvaluationNotFoundError /
            EvaluationBadRequestError.
    """
    use_fixture = bool(body.fixture_filename)
    if use_fixture and body.user_utterances:
        raise HTTPException(
            status_code=400,
            detail="provide only one of fixture_filename or user_utterances",
        )

    try:
        if use_fixture:
            turn_count, echoed, replayed_segment_ids = await replay.replay_fixture(
                conversation_id=body.conversation_id,
                fixture_filename=body.fixture_filename.strip(),
                flush_memoir_after=body.flush_memoir_after,
                skip_tts=body.skip_tts,
            )
        else:
            if body.user_utterances is None:
                raise EvaluationBadRequestError(
                    "fixture_filename or user_utterances required"
                )
            # Stringify every entry and drop the ones that are only whitespace.
            echoed = [
                text for text in map(str, body.user_utterances) if text.strip()
            ]
            if not echoed:
                raise EvaluationBadRequestError("user_utterances is empty")
            turn_count, replayed_segment_ids = await replay.replay_utterances(
                conversation_id=body.conversation_id,
                utterances=echoed,
                flush_memoir_after=body.flush_memoir_after,
                skip_tts=body.skip_tts,
            )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err

    return ReplayConversationOut(
        conversation_id=body.conversation_id,
        turns_replayed=turn_count,
        utterances_echo=echoed,
        segment_ids=replayed_segment_ids,
    )
@router.post("/judge/conversation", response_model=ManualJudgeConversationOut)
async def judge_conversation_manual(
    body: ManualJudgeConversationBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Run the manual conversation judge and return its result in one response.

    ``body.conversation_id`` and ``body.fixture_filename`` are passed to the
    judge service; its payload is validated into ManualJudgeConversationOut.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        verdict = await judge_svc.judge_conversation(
            body.conversation_id,
            body.fixture_filename,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return ManualJudgeConversationOut.model_validate(verdict)
@router.post("/judge/conversation-stream")
async def judge_conversation_manual_stream(
    body: ManualJudgeConversationStreamBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Stream the manual conversation judge as server-sent events.

    Every event produced by ``judge_svc.iter_conversation_judge_sse`` is sent
    as one ``data: <json>`` frame. If the iterator raises, the failure is
    logged with its traceback and reported in-band as an
    ``{"event": "error", "phase": "server", ...}`` frame. A final
    ``{"event": "done"}`` frame is always emitted so clients know the stream
    has ended.

    Returns:
        A StreamingResponse with media type ``text/event-stream`` and headers
        that disable caching and proxy buffering.
    """
    # Function-scope import so this handler does not depend on a module-level
    # logging import being present.
    import logging

    def sse_frame(payload) -> str:
        # One SSE message: a single "data:" line terminated by a blank line.
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    async def event_iter():
        try:
            async for evt in judge_svc.iter_conversation_judge_sse(
                body.conversation_id,
                body.fixture_filename,
                include_turn_judges=body.include_turn_judges,
                include_baseline_turn_judges=body.include_baseline_turn_judges,
            ):
                yield sse_frame(evt)
        except Exception as e:
            # Errors raised mid-stream are reported in-band to the client;
            # without this log call the traceback would be lost server-side.
            logging.getLogger(__name__).exception(
                "conversation judge SSE stream failed (conversation_id=%s)",
                body.conversation_id,
            )
            yield sse_frame(
                {"event": "error", "phase": "server", "message": str(e)}
            )
        yield sse_frame({"event": "done"})

    return StreamingResponse(
        event_iter(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
@router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut)
async def judge_memoir_chapters_manual(
    body: ManualJudgeMemoirBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Run the manual memoir judge for ``body.user_id``.

    ``body.baseline_sections`` is passed along to the judge service; its
    payload is validated into ManualJudgeMemoirOut.

    Raises:
        HTTPException: 400 when the service raises EvaluationBadRequestError.
    """
    try:
        verdict = await judge_svc.judge_memoir_for_user(
            body.user_id,
            body.baseline_sections,
        )
    except EvaluationBadRequestError as err:
        raise _eval_http_exc(err) from err
    else:
        return ManualJudgeMemoirOut.model_validate(verdict)
@router.get("/users/{user_id}/memoir-snapshot", response_model=UserMemoirSnapshotOut)
async def get_user_memoir_snapshot(
    user_id: str,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Return the memoir snapshot the judge service produces for a user.

    Raises:
        HTTPException: 400 when the service raises EvaluationBadRequestError.
    """
    try:
        snapshot = await judge_svc.memoir_snapshot(user_id)
    except EvaluationBadRequestError as err:
        raise _eval_http_exc(err) from err
    else:
        return UserMemoirSnapshotOut.model_validate(snapshot)
@router.get(
    "/fixtures/user-exports",
    response_model=UserExportFixtureListOut,
)
async def list_user_export_fixtures(
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return the user-export fixture names reported by the admin service."""
    fixture_names = svc.list_user_export_fixture_names()
    return UserExportFixtureListOut(items=fixture_names)
@router.get(
    "/fixtures/user-exports/{filename}",
    response_model=UserExportFixtureDetailOut,
)
async def get_user_export_fixture(
    filename: str,
    _auth: InternalEvalAuth,
):
    """Return the parsed content of one user-export fixture file.

    The response contains the (user, ai) turns read from the fixture, plus the
    source user id and memoir chapter sections extracted from its raw
    markdown.

    Raises:
        HTTPException: 400 when reading the fixture raises ValueError, 404
            when it raises FileNotFoundError.
    """
    try:
        turn_pairs, markdown = read_user_export_fixture(filename)
    except ValueError:
        raise HTTPException(
            status_code=400, detail="invalid fixture filename"
        ) from None
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="fixture not found") from None

    section_pairs = extract_memoir_chapter_sections_from_export_md(markdown)
    turn_models = [
        UserExportFixtureTurnOut(user=user_text, ai=ai_text)
        for user_text, ai_text in turn_pairs
    ]
    source_user_id = extract_source_user_id_from_export_md(markdown)
    section_models = [
        MemoirSectionBaselineOut(title=title, body=section_body)
        for title, section_body in section_pairs
    ]
    return UserExportFixtureDetailOut(
        filename=filename,
        turns=turn_models,
        source_user_id=source_user_id,
        memoir_sections=section_models,
    )
@router.post("/regression-sets/{set_id}/import-markdown", response_model=CaseOut)
async def import_markdown_case(
    set_id: str,
    body: ImportMarkdownBody,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Import a markdown body as a case of the regression set ``set_id``.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        imported = await svc.import_markdown_case(set_id, body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(imported)
@router.post("/import/json-case", response_model=CaseOut)
async def import_json_case(
    body: ImportJsonCaseBody,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Import a case from a JSON request body through the admin service.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        imported = await svc.import_json_case(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return case_out(imported)
@router.get("/versions", response_model=list[VersionOut])
async def list_versions(
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return every version from the admin service, validated into VersionOut."""
    version_rows = await svc.list_versions()
    return list(map(VersionOut.model_validate, version_rows))
@router.post("/versions", response_model=VersionOut)
async def create_version(
    body: VersionCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Create a version from the request body.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        created = await svc.create_version(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return VersionOut.model_validate(created)
@router.get("/experiments", response_model=list[ExperimentOut])
async def list_experiments(
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
    limit: int = Query(50, ge=1, le=200),
):
    """Return up to ``limit`` experiments, validated into ExperimentOut.

    ``limit`` defaults to 50 and must be between 1 and 200.
    """
    experiment_rows = await svc.list_experiments(limit=limit)
    return list(map(ExperimentOut.model_validate, experiment_rows))
@router.post("/experiments", response_model=ExperimentOut)
async def create_experiment(
    body: ExperimentCreate,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Create an experiment from the request body.

    Raises:
        HTTPException: 404 / 400 when the service raises
            EvaluationNotFoundError / EvaluationBadRequestError.
    """
    try:
        created = await svc.create_experiment(body)
    except (EvaluationNotFoundError, EvaluationBadRequestError) as err:
        raise _eval_http_exc(err) from err
    else:
        return ExperimentOut.model_validate(created)
@router.get("/experiments/{experiment_id}", response_model=ExperimentDetailOut)
async def get_experiment_detail(
    experiment_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return one experiment with its runs and, when present, its gate verdict.

    Each ``(run, turns)`` pair in the service bundle is presented through
    ``run_out``; ``gate`` is None when the bundle carries no gate.

    Raises:
        HTTPException: 404 when the service raises EvaluationNotFoundError.
    """
    try:
        detail = await svc.get_experiment_detail(experiment_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err

    serialized_runs: list[EvalRunOut] = [
        run_out(run_row, turn_rows) for run_row, turn_rows in detail.run_rows
    ]
    if detail.gate:
        gate_out = GateVerdictOut.model_validate(detail.gate)
    else:
        gate_out = None
    return ExperimentDetailOut(
        experiment=ExperimentOut.model_validate(detail.experiment),
        runs=serialized_runs,
        gate=gate_out,
    )
@router.post("/experiments/{experiment_id}/run", response_model=ExperimentOut)
async def enqueue_experiment_run(
    experiment_id: str,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Enqueue a run of the given experiment and return the experiment.

    Raises:
        HTTPException: 404 when the service raises EvaluationNotFoundError.
    """
    try:
        experiment = await svc.enqueue_experiment_run(experiment_id)
    except EvaluationNotFoundError as err:
        raise _eval_http_exc(err) from err
    else:
        return ExperimentOut.model_validate(experiment)