From 78b61c076e9aa3be07b2d3e50c4a262a96066f4f Mon Sep 17 00:00:00 2001 From: Kevin Date: Wed, 8 Apr 2026 16:50:53 +0800 Subject: [PATCH] =?UTF-8?q?feat(eval):=20Playground=20GLM=20=E8=AF=84?= =?UTF-8?q?=E5=88=86=E8=90=BD=E5=BA=93=E5=B9=B6=E5=8F=AF=E6=81=A2=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在 conversations 表增加 playground_conversation_judge_json,流式/非流式对话评审结束后写入最近一次快照(整体分、逐轮分、对比文案、错误与基线文件名等)。新增只读 GET 供前端按会话拉取;评测台 Playground 切换会话时自动恢复,并提示基线是否和当时一致。 --- ...013_conversations_playground_judge_json.py | 39 +++ api/app/features/conversation/models.py | 2 + api/app/features/conversation/repo.py | 14 + .../evaluation/judge_manual_service.py | 243 ++++++++++++------ api/app/features/evaluation/router.py | 21 ++ api/app/features/evaluation/schemas.py | 7 + .../evaluation/session_catalog_service.py | 10 + app-eval-web/src/pages/PlaygroundPage.tsx | 116 ++++++++- 8 files changed, 361 insertions(+), 91 deletions(-) create mode 100644 api/alembic/versions/0013_conversations_playground_judge_json.py diff --git a/api/alembic/versions/0013_conversations_playground_judge_json.py b/api/alembic/versions/0013_conversations_playground_judge_json.py new file mode 100644 index 0000000..6223983 --- /dev/null +++ b/api/alembic/versions/0013_conversations_playground_judge_json.py @@ -0,0 +1,39 @@ +"""Persist Playground GLM conversation judge snapshot on conversations. + +Revision ID: 0013_playground_judge +Revises: 0012_mem_fact_tl_lineage +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +revision: str = "0013_playground_judge" +down_revision: Union[str, None] = "0012_mem_fact_tl_lineage" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _has_column(table: str, column: str) -> bool: + bind = op.get_bind() + return any(c["name"] == column for c in sa.inspect(bind).get_columns(table)) + + +def upgrade() -> None: + if not _has_column("conversations", "playground_conversation_judge_json"): + op.add_column( + "conversations", + sa.Column( + "playground_conversation_judge_json", + postgresql.JSON(astext_type=sa.Text()), + nullable=True, + ), + ) + + +def downgrade() -> None: + if _has_column("conversations", "playground_conversation_judge_json"): + op.drop_column("conversations", "playground_conversation_judge_json") diff --git a/api/app/features/conversation/models.py b/api/app/features/conversation/models.py index 20a97b2..3acac93 100644 --- a/api/app/features/conversation/models.py +++ b/api/app/features/conversation/models.py @@ -27,6 +27,8 @@ class Conversation(Base): current_topic = Column(String, nullable=True) conversation_stage = Column(String, nullable=True) deleted_at = Column(DateTime(timezone=True), nullable=True) + # 内部评测 Playground:最近一次 GLM 对话评分快照(含逐轮分与对比文案) + playground_conversation_judge_json = Column(JSON, nullable=True) user = relationship("User", back_populates="conversations") segments = relationship( diff --git a/api/app/features/conversation/repo.py b/api/app/features/conversation/repo.py index 0287277..ccfb126 100644 --- a/api/app/features/conversation/repo.py +++ b/api/app/features/conversation/repo.py @@ -1,5 +1,7 @@ """Conversation repository — Conversation, turn log, and Segment data access.""" +from typing import Any + from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession @@ -12,6 +14,18 @@ async def get_conversation( return await db.get(Conversation, conversation_id) +async def set_playground_conversation_judge_json( + conversation_id: str, + db: AsyncSession, + payload: dict[str, Any] | None, +) -> Conversation | None: + row = await get_conversation(conversation_id, db) + if row is None: + return None + row.playground_conversation_judge_json = payload + return row + + async def get_user_conversations(user_id: str, db: AsyncSession) -> list[Conversation]: stmt = ( select(Conversation) diff --git a/api/app/features/evaluation/judge_manual_service.py b/api/app/features/evaluation/judge_manual_service.py index d772888..98865a4 100644 --- a/api/app/features/evaluation/judge_manual_service.py +++ b/api/app/features/evaluation/judge_manual_service.py @@ -1,9 +1,10 @@ -"""手动触发 GLM-5 评审(不写 eval_runs)。""" +"""手动触发 GLM-5 评审(不写 eval_runs;Playground 对话评分写入 conversations 表)。""" from __future__ import annotations import re from collections.abc import AsyncIterator +from datetime import datetime, timezone from typing import Any from sqlalchemy.ext.asyncio import AsyncSession @@ -106,6 +107,21 @@ class EvalJudgeManualService: def __init__(self, db: AsyncSession) -> None: self._db = db + async def _persist_playground_conversation_judge( + self, conversation_id: str, bundle: dict[str, Any] + ) -> None: + try: + row = await conversation_repo.set_playground_conversation_judge_json( + conversation_id, self._db, bundle + ) + if row is not None: + await self._db.commit() + except Exception: + logger.exception( + "persist playground_conversation_judge_json failed conversation_id={}", + conversation_id, + ) + async def judge_conversation( self, conversation_id: str, @@ -165,6 +181,24 @@ class EvalJudgeManualService: f"replay_glm5_failed: {replay_result.error or 'unknown error'}" ) + bundle: dict[str, Any] = { + "version": 1, + "judged_at": datetime.now(timezone.utc).isoformat(), + "fixture_filename": fn, + "baseline_judge": baseline_judge_dict, + "replay_judge": replay_judge_dict, + "baseline_turn_judges": {}, + "replay_turn_judges": {}, + "compare_markdown": "", + "errors": list(errors), + "warnings": [], + "options": { + "include_turn_judges": False, + "include_baseline_turn_judges": False, + }, + } + await self._persist_playground_conversation_judge(cid, bundle) + return { "conversation_id": cid, "fixture_filename": fn, @@ -183,7 +217,22 @@ class EvalJudgeManualService: include_turn_judges: bool = False, include_baseline_turn_judges: bool = False, ) -> AsyncIterator[dict[str, Any]]: - """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议。""" + """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议;成功后写入 playground 字段。""" + acc: dict[str, Any] = { + "version": 1, + "fixture_filename": None, + "baseline_judge": None, + "replay_judge": None, + "baseline_turn_judges": {}, + "replay_turn_judges": {}, + "compare_markdown": "", + "errors": [], + "warnings": [], + "options": { + "include_turn_judges": include_turn_judges, + "include_baseline_turn_judges": include_baseline_turn_judges, + }, + } cid = (conversation_id or "").strip() if not cid: yield { @@ -238,99 +287,129 @@ class EvalJudgeManualService: } return - judge = EvalJudgeService(judge_llm) - yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn} + acc["fixture_filename"] = fn + persist = True + try: + judge = EvalJudgeService(judge_llm) + yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn} - if not baseline_transcript.strip(): - yield { - "event": "warning", - "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议", - } + if not baseline_transcript.strip(): + wmsg = "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议" + acc["warnings"].append(wmsg) + yield {"event": "warning", "message": wmsg} - baseline_judge = None - if baseline_transcript.strip(): - baseline_result = await judge.judge_conversation_result( - full_transcript=baseline_transcript - ) - baseline_judge = baseline_result.output - yield { - "event": "baseline_judge", - "ok": baseline_judge is not None, - "judge": baseline_judge.model_dump() if baseline_judge else None, - } - if not baseline_judge: + baseline_judge = None + if baseline_transcript.strip(): + baseline_result = await judge.judge_conversation_result( + full_transcript=baseline_transcript + ) + baseline_judge = baseline_result.output + acc["baseline_judge"] = ( + baseline_judge.model_dump() if baseline_judge else None + ) yield { - "event": "error", - "phase": "baseline_glm5", - "message": ( + "event": "baseline_judge", + "ok": baseline_judge is not None, + "judge": acc["baseline_judge"], + } + if not baseline_judge: + err = ( f"基准整体打分失败:{baseline_result.error}" if baseline_result.error else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)" - ), - } - elif ( - include_baseline_turn_judges - and export_turns - and baseline_judge is not None - ): - yield {"event": "meta", "phase": "baseline_turn_judges_start"} - async for row in _iter_turn_judgments_for_turns( - judge, - export_turns, - sse_event="baseline_turn_judge", + ) + acc["errors"].append(err) + yield { + "event": "error", + "phase": "baseline_glm5", + "message": err, + } + elif ( + include_baseline_turn_judges + and export_turns + and baseline_judge is not None ): - yield row - else: - yield { - "event": "baseline_judge", - "ok": False, - "skipped": True, - "judge": None, - } + yield {"event": "meta", "phase": "baseline_turn_judges_start"} + async for row in _iter_turn_judgments_for_turns( + judge, + export_turns, + sse_event="baseline_turn_judge", + ): + if row.get("event") == "baseline_turn_judge": + idx = row.get("turn_index") + if isinstance(idx, (int, float)): + acc["baseline_turn_judges"][str(int(idx))] = row.get( + "judge" + ) + yield row + else: + acc["baseline_judge"] = None + yield { + "event": "baseline_judge", + "ok": False, + "skipped": True, + "judge": None, + } - replay_result = await judge.judge_conversation_result( - full_transcript=replay_transcript - ) - replay_judge = replay_result.output - yield { - "event": "replay_judge", - "ok": replay_judge is not None, - "judge": replay_judge.model_dump() if replay_judge else None, - } - if not replay_judge: + replay_result = await judge.judge_conversation_result( + full_transcript=replay_transcript + ) + replay_judge = replay_result.output + acc["replay_judge"] = ( + replay_judge.model_dump() if replay_judge else None + ) yield { - "event": "error", - "phase": "replay_glm5", - "message": ( + "event": "replay_judge", + "ok": replay_judge is not None, + "judge": acc["replay_judge"], + } + if not replay_judge: + err = ( f"回放对话整体 GLM-5 打分失败:{replay_result.error}" if replay_result.error else "回放对话整体 GLM-5 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)" - ), - } + ) + acc["errors"].append(err) + yield { + "event": "error", + "phase": "replay_glm5", + "message": err, + } + yield {"event": "done"} + return + + if include_turn_judges: + replay_pairs = pair_session_messages_to_turns(list(dialogue.messages)) + if replay_pairs: + yield {"event": "meta", "phase": "replay_turn_judges_start"} + async for row in _iter_turn_judgments_for_turns( + judge, + replay_pairs, + sse_event="replay_turn_judge", + ): + if row.get("event") == "replay_turn_judge": + idx = row.get("turn_index") + if isinstance(idx, (int, float)): + acc["replay_turn_judges"][str(int(idx))] = row.get( + "judge" + ) + yield row + + async for piece in judge.stream_conversation_compare( + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + baseline_judge=baseline_judge, + replay_judge=replay_judge, + ): + if piece: + acc["compare_markdown"] += piece + yield {"event": "compare_delta", "text": piece} + yield {"event": "done"} - return - - if include_turn_judges: - replay_pairs = pair_session_messages_to_turns(list(dialogue.messages)) - if replay_pairs: - yield {"event": "meta", "phase": "replay_turn_judges_start"} - async for row in _iter_turn_judgments_for_turns( - judge, - replay_pairs, - sse_event="replay_turn_judge", - ): - yield row - - async for piece in judge.stream_conversation_compare( - baseline_transcript=baseline_transcript, - replay_transcript=replay_transcript, - baseline_judge=baseline_judge, - replay_judge=replay_judge, - ): - if piece: - yield {"event": "compare_delta", "text": piece} - - yield {"event": "done"} + finally: + if persist: + acc["judged_at"] = datetime.now(timezone.utc).isoformat() + await self._persist_playground_conversation_judge(cid, acc) async def judge_memoir_for_user( self, diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py index 453582a..7bb7ee7 100644 --- a/api/app/features/evaluation/router.py +++ b/api/app/features/evaluation/router.py @@ -46,6 +46,7 @@ from app.features.evaluation.schemas import ( ManualJudgeConversationStreamBody, ManualJudgeMemoirBody, ManualJudgeMemoirOut, + PlaygroundConversationJudgeOut, MemoirPhase1ReadyOut, MemoirSectionBaselineOut, RegressionSetCreate, @@ -225,6 +226,26 @@ async def get_session_transcript( ) +@router.get( + "/sessions/{conversation_id}/playground-conversation-judge", + response_model=PlaygroundConversationJudgeOut, +) +async def get_playground_conversation_judge( + conversation_id: str, + _auth: InternalEvalAuth, + db: Annotated[AsyncSession, Depends(get_async_db)], +): + catalog = SessionCatalogService(db) + tr = await catalog.get_transcript(conversation_id) + if not tr: + raise HTTPException(status_code=404, detail="conversation not found") + judge = await catalog.get_playground_conversation_judge_json(conversation_id) + return PlaygroundConversationJudgeOut( + conversation_id=conversation_id, + judge=judge, + ) + + @router.get( "/sessions/{conversation_id}/memoir-phase1-ready", response_model=MemoirPhase1ReadyOut, diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py index 328a235..9c5685b 100644 --- a/api/app/features/evaluation/schemas.py +++ b/api/app/features/evaluation/schemas.py @@ -211,6 +211,13 @@ class ManualJudgeConversationOut(BaseModel): errors: list[str] = Field(default_factory=list) +class PlaygroundConversationJudgeOut(BaseModel): + """`conversations.playground_conversation_judge_json` 的只读视图。""" + + conversation_id: str + judge: dict[str, Any] | None = None + + class ManualJudgeMemoirBody(BaseModel): user_id: str baseline_sections: list[MemoirSectionBaselineOut] | None = None diff --git a/api/app/features/evaluation/session_catalog_service.py b/api/app/features/evaluation/session_catalog_service.py index a37e153..37c907c 100644 --- a/api/app/features/evaluation/session_catalog_service.py +++ b/api/app/features/evaluation/session_catalog_service.py @@ -3,6 +3,7 @@ from __future__ import annotations from dataclasses import dataclass +from typing import Any from sqlalchemy.ext.asyncio import AsyncSession @@ -110,3 +111,12 @@ class SessionCatalogService: user_utterances_from_segments=from_segments, user_utterances_from_messages=from_messages, ) + + async def get_playground_conversation_judge_json( + self, conversation_id: str + ) -> dict[str, Any] | None: + c = await self._repo.get_conversation(conversation_id) + if not c or c.deleted_at: + return None + raw = c.playground_conversation_judge_json + return raw if isinstance(raw, dict) else None diff --git a/app-eval-web/src/pages/PlaygroundPage.tsx b/app-eval-web/src/pages/PlaygroundPage.tsx index 3fc8d86..c6923af 100644 --- a/app-eval-web/src/pages/PlaygroundPage.tsx +++ b/app-eval-web/src/pages/PlaygroundPage.tsx @@ -130,6 +130,10 @@ export default function PlaygroundPage() { const [convJudgeStreamText, setConvJudgeStreamText] = useState(""); const [convJudgeErrors, setConvJudgeErrors] = useState([]); const [convJudgePhase, setConvJudgePhase] = useState(""); + const [persistedJudgeMeta, setPersistedJudgeMeta] = useState<{ + judgedAt: string | null; + savedFixture: string | null; + } | null>(null); const [fixtureFiles, setFixtureFiles] = useState([]); const [fixtureName, setFixtureName] = useState(""); @@ -265,7 +269,49 @@ export default function PlaygroundPage() { })(); }, [fixtureName]); - function resetJudgeUi() { + const applyPersistedPlaygroundJudge = useCallback( + (j: Record) => { + setConvJudgeBaseline(j.baseline_judge ?? null); + setConvJudgeReplay(j.replay_judge ?? null); + setConvJudgeStreamText( + typeof j.compare_markdown === "string" ? j.compare_markdown : "", + ); + const errs: string[] = []; + if (Array.isArray(j.errors)) { + for (const x of j.errors) errs.push(String(x)); + } + if (Array.isArray(j.warnings)) { + for (const x of j.warnings) errs.push(String(x)); + } + setConvJudgeErrors(errs); + const bt = j.baseline_turn_judges; + if (bt && typeof bt === "object" && !Array.isArray(bt)) { + const o: Record = {}; + for (const [k, v] of Object.entries(bt)) { + const i = Number(k); + if (!Number.isNaN(i)) o[i] = v; + } + setBaselineTurnJudges(o); + } else { + setBaselineTurnJudges({}); + } + const rt = j.replay_turn_judges; + if (rt && typeof rt === "object" && !Array.isArray(rt)) { + const o: Record = {}; + for (const [k, v] of Object.entries(rt)) { + const i = Number(k); + if (!Number.isNaN(i)) o[i] = v; + } + setReplayTurnJudges(o); + } else { + setReplayTurnJudges({}); + } + setConvJudgePhase(""); + }, + [], + ); + + const resetJudgeUi = useCallback(() => { setConvJudgeBaseline(null); setConvJudgeReplay(null); setConvJudgeStreamText(""); @@ -273,7 +319,52 @@ export default function PlaygroundPage() { setConvJudgePhase(""); setBaselineTurnJudges({}); setReplayTurnJudges({}); - } + setPersistedJudgeMeta(null); + }, []); + + const playgroundPersistHint = useMemo(() => { + if (!persistedJudgeMeta) return null; + const { judgedAt, savedFixture } = persistedJudgeMeta; + let s = "已恢复上次保存在服务器上的 GLM 评分"; + if (judgedAt) s += `(${judgedAt})`; + if (savedFixture && fixtureName && savedFixture !== fixtureName) { + s += + "。当前所选基线导出文件与当时不一致,若要对照基线请重新跑一次自动评分"; + } + return s; + }, [persistedJudgeMeta, fixtureName]); + + useEffect(() => { + const cid = replayConversationId.trim(); + if (!cid) { + resetJudgeUi(); + return; + } + let cancelled = false; + void (async () => { + const r = await api<{ + conversation_id: string; + judge: Record | null; + }>( + `/internal/api/evaluation/sessions/${encodeURIComponent(cid)}/playground-conversation-judge`, + ); + if (cancelled) return; + if (!r.ok || !r.data?.judge) { + resetJudgeUi(); + return; + } + const j = r.data.judge; + applyPersistedPlaygroundJudge(j); + setPersistedJudgeMeta({ + judgedAt: typeof j.judged_at === "string" ? j.judged_at : null, + savedFixture: + typeof j.fixture_filename === "string" ? j.fixture_filename : null, + }); + })(); + return () => { + cancelled = true; + }; + }, [replayConversationId, applyPersistedPlaygroundJudge, resetJudgeUi]); async function runReplay(resume: boolean) { if (!fixtureName) { @@ -519,12 +610,7 @@ export default function PlaygroundPage() { return; } setJudgeConvBusy(true); - setConvJudgeBaseline(null); - setConvJudgeReplay(null); - setConvJudgeStreamText(""); - setConvJudgeErrors([]); - setBaselineTurnJudges({}); - setReplayTurnJudges({}); + resetJudgeUi(); setConvJudgePhase("连接评分服务…"); try { const url = `${apiBase}/internal/api/evaluation/judge/conversation-stream`; @@ -625,7 +711,11 @@ export default function PlaygroundPage() { } else if (ev === "done") { setConvJudgePhase(""); if (!judgeStreamHadError) { - pushNotice("自动评分流已结束", "success"); + pushNotice("自动评分流已结束(结果已写入服务器)", "success"); + setPersistedJudgeMeta({ + judgedAt: new Date().toISOString(), + savedFixture: fixtureName.trim() || null, + }); } } } @@ -1026,6 +1116,14 @@ export default function PlaygroundPage() { 先对基线 transcript 与当前会话做整体评分,可按需勾选逐轮 GLM-5;再流式输出差异与建议。请确保所选基线与当前会话一致。

+ {playgroundPersistHint ? ( +

+ {playgroundPersistHint} +

+ ) : null} {convJudgePhase ? (

{convJudgePhase}