From 78b61c076e9aa3be07b2d3e50c4a262a96066f4f Mon Sep 17 00:00:00 2001
From: Kevin
Date: Wed, 8 Apr 2026 16:50:53 +0800
Subject: [PATCH] =?UTF-8?q?feat(eval):=20Playground=20GLM=20=E8=AF=84?=
=?UTF-8?q?=E5=88=86=E8=90=BD=E5=BA=93=E5=B9=B6=E5=8F=AF=E6=81=A2=E5=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
在 conversations 表增加 playground_conversation_judge_json,流式/非流式对话评审结束后写入最近一次快照(整体分、逐轮分、对比文案、错误与基线文件名等)。新增只读 GET 供前端按会话拉取;评测台 Playground 切换会话时自动恢复,并提示基线是否和当时一致。
---
...013_conversations_playground_judge_json.py | 39 +++
api/app/features/conversation/models.py | 2 +
api/app/features/conversation/repo.py | 14 +
.../evaluation/judge_manual_service.py | 243 ++++++++++++------
api/app/features/evaluation/router.py | 21 ++
api/app/features/evaluation/schemas.py | 7 +
.../evaluation/session_catalog_service.py | 10 +
app-eval-web/src/pages/PlaygroundPage.tsx | 116 ++++++++-
8 files changed, 361 insertions(+), 91 deletions(-)
create mode 100644 api/alembic/versions/0013_conversations_playground_judge_json.py
diff --git a/api/alembic/versions/0013_conversations_playground_judge_json.py b/api/alembic/versions/0013_conversations_playground_judge_json.py
new file mode 100644
index 0000000..6223983
--- /dev/null
+++ b/api/alembic/versions/0013_conversations_playground_judge_json.py
@@ -0,0 +1,39 @@
+"""Persist Playground GLM conversation judge snapshot on conversations.
+
+Revision ID: 0013_playground_judge
+Revises: 0012_mem_fact_tl_lineage
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision: str = "0013_playground_judge"
+down_revision: Union[str, None] = "0012_mem_fact_tl_lineage"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _has_column(table: str, column: str) -> bool:
+ bind = op.get_bind()
+ return any(c["name"] == column for c in sa.inspect(bind).get_columns(table))
+
+
+def upgrade() -> None:
+ if not _has_column("conversations", "playground_conversation_judge_json"):
+ op.add_column(
+ "conversations",
+ sa.Column(
+ "playground_conversation_judge_json",
+ postgresql.JSON(astext_type=sa.Text()),
+ nullable=True,
+ ),
+ )
+
+
+def downgrade() -> None:
+ if _has_column("conversations", "playground_conversation_judge_json"):
+ op.drop_column("conversations", "playground_conversation_judge_json")
diff --git a/api/app/features/conversation/models.py b/api/app/features/conversation/models.py
index 20a97b2..3acac93 100644
--- a/api/app/features/conversation/models.py
+++ b/api/app/features/conversation/models.py
@@ -27,6 +27,8 @@ class Conversation(Base):
current_topic = Column(String, nullable=True)
conversation_stage = Column(String, nullable=True)
deleted_at = Column(DateTime(timezone=True), nullable=True)
+ # 内部评测 Playground:最近一次 GLM 对话评分快照(含逐轮分与对比文案)
+ playground_conversation_judge_json = Column(JSON, nullable=True)
user = relationship("User", back_populates="conversations")
segments = relationship(
diff --git a/api/app/features/conversation/repo.py b/api/app/features/conversation/repo.py
index 0287277..ccfb126 100644
--- a/api/app/features/conversation/repo.py
+++ b/api/app/features/conversation/repo.py
@@ -1,5 +1,7 @@
"""Conversation repository — Conversation, turn log, and Segment data access."""
+from typing import Any
+
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
@@ -12,6 +14,18 @@ async def get_conversation(
return await db.get(Conversation, conversation_id)
+async def set_playground_conversation_judge_json(
+ conversation_id: str,
+ db: AsyncSession,
+ payload: dict[str, Any] | None,
+) -> Conversation | None:
+ row = await get_conversation(conversation_id, db)
+ if row is None:
+ return None
+ row.playground_conversation_judge_json = payload
+ return row
+
+
async def get_user_conversations(user_id: str, db: AsyncSession) -> list[Conversation]:
stmt = (
select(Conversation)
diff --git a/api/app/features/evaluation/judge_manual_service.py b/api/app/features/evaluation/judge_manual_service.py
index d772888..98865a4 100644
--- a/api/app/features/evaluation/judge_manual_service.py
+++ b/api/app/features/evaluation/judge_manual_service.py
@@ -1,9 +1,10 @@
-"""手动触发 GLM-5 评审(不写 eval_runs)。"""
+"""手动触发 GLM-5 评审(不写 eval_runs;Playground 对话评分写入 conversations 表)。"""
from __future__ import annotations
import re
from collections.abc import AsyncIterator
+from datetime import datetime, timezone
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
@@ -106,6 +107,21 @@ class EvalJudgeManualService:
def __init__(self, db: AsyncSession) -> None:
self._db = db
+ async def _persist_playground_conversation_judge(
+ self, conversation_id: str, bundle: dict[str, Any]
+ ) -> None:
+ try:
+ row = await conversation_repo.set_playground_conversation_judge_json(
+ conversation_id, self._db, bundle
+ )
+ if row is not None:
+ await self._db.commit()
+ except Exception:
+ logger.exception(
+ "persist playground_conversation_judge_json failed conversation_id={}",
+ conversation_id,
+ )
+
async def judge_conversation(
self,
conversation_id: str,
@@ -165,6 +181,24 @@ class EvalJudgeManualService:
f"replay_glm5_failed: {replay_result.error or 'unknown error'}"
)
+ bundle: dict[str, Any] = {
+ "version": 1,
+ "judged_at": datetime.now(timezone.utc).isoformat(),
+ "fixture_filename": fn,
+ "baseline_judge": baseline_judge_dict,
+ "replay_judge": replay_judge_dict,
+ "baseline_turn_judges": {},
+ "replay_turn_judges": {},
+ "compare_markdown": "",
+ "errors": list(errors),
+ "warnings": [],
+ "options": {
+ "include_turn_judges": False,
+ "include_baseline_turn_judges": False,
+ },
+ }
+ await self._persist_playground_conversation_judge(cid, bundle)
+
return {
"conversation_id": cid,
"fixture_filename": fn,
@@ -183,7 +217,22 @@ class EvalJudgeManualService:
include_turn_judges: bool = False,
include_baseline_turn_judges: bool = False,
) -> AsyncIterator[dict[str, Any]]:
- """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议。"""
+ """供 SSE:先整体基准分、再整体回放分,可选逐轮分,再流式对比与建议;成功后写入 playground 字段。"""
+ acc: dict[str, Any] = {
+ "version": 1,
+ "fixture_filename": None,
+ "baseline_judge": None,
+ "replay_judge": None,
+ "baseline_turn_judges": {},
+ "replay_turn_judges": {},
+ "compare_markdown": "",
+ "errors": [],
+ "warnings": [],
+ "options": {
+ "include_turn_judges": include_turn_judges,
+ "include_baseline_turn_judges": include_baseline_turn_judges,
+ },
+ }
cid = (conversation_id or "").strip()
if not cid:
yield {
@@ -238,99 +287,129 @@ class EvalJudgeManualService:
}
return
- judge = EvalJudgeService(judge_llm)
- yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}
+ acc["fixture_filename"] = fn
+ persist = True
+ try:
+ judge = EvalJudgeService(judge_llm)
+ yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}
- if not baseline_transcript.strip():
- yield {
- "event": "warning",
- "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议",
- }
+ if not baseline_transcript.strip():
+ wmsg = "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议"
+ acc["warnings"].append(wmsg)
+ yield {"event": "warning", "message": wmsg}
- baseline_judge = None
- if baseline_transcript.strip():
- baseline_result = await judge.judge_conversation_result(
- full_transcript=baseline_transcript
- )
- baseline_judge = baseline_result.output
- yield {
- "event": "baseline_judge",
- "ok": baseline_judge is not None,
- "judge": baseline_judge.model_dump() if baseline_judge else None,
- }
- if not baseline_judge:
+ baseline_judge = None
+ if baseline_transcript.strip():
+ baseline_result = await judge.judge_conversation_result(
+ full_transcript=baseline_transcript
+ )
+ baseline_judge = baseline_result.output
+ acc["baseline_judge"] = (
+ baseline_judge.model_dump() if baseline_judge else None
+ )
yield {
- "event": "error",
- "phase": "baseline_glm5",
- "message": (
+ "event": "baseline_judge",
+ "ok": baseline_judge is not None,
+ "judge": acc["baseline_judge"],
+ }
+ if not baseline_judge:
+ err = (
f"基准整体打分失败:{baseline_result.error}"
if baseline_result.error
else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
- ),
- }
- elif (
- include_baseline_turn_judges
- and export_turns
- and baseline_judge is not None
- ):
- yield {"event": "meta", "phase": "baseline_turn_judges_start"}
- async for row in _iter_turn_judgments_for_turns(
- judge,
- export_turns,
- sse_event="baseline_turn_judge",
+ )
+ acc["errors"].append(err)
+ yield {
+ "event": "error",
+ "phase": "baseline_glm5",
+ "message": err,
+ }
+ elif (
+ include_baseline_turn_judges
+ and export_turns
+ and baseline_judge is not None
):
- yield row
- else:
- yield {
- "event": "baseline_judge",
- "ok": False,
- "skipped": True,
- "judge": None,
- }
+ yield {"event": "meta", "phase": "baseline_turn_judges_start"}
+ async for row in _iter_turn_judgments_for_turns(
+ judge,
+ export_turns,
+ sse_event="baseline_turn_judge",
+ ):
+ if row.get("event") == "baseline_turn_judge":
+ idx = row.get("turn_index")
+ if isinstance(idx, (int, float)):
+ acc["baseline_turn_judges"][str(int(idx))] = row.get(
+ "judge"
+ )
+ yield row
+ else:
+ acc["baseline_judge"] = None
+ yield {
+ "event": "baseline_judge",
+ "ok": False,
+ "skipped": True,
+ "judge": None,
+ }
- replay_result = await judge.judge_conversation_result(
- full_transcript=replay_transcript
- )
- replay_judge = replay_result.output
- yield {
- "event": "replay_judge",
- "ok": replay_judge is not None,
- "judge": replay_judge.model_dump() if replay_judge else None,
- }
- if not replay_judge:
+ replay_result = await judge.judge_conversation_result(
+ full_transcript=replay_transcript
+ )
+ replay_judge = replay_result.output
+ acc["replay_judge"] = (
+ replay_judge.model_dump() if replay_judge else None
+ )
yield {
- "event": "error",
- "phase": "replay_glm5",
- "message": (
+ "event": "replay_judge",
+ "ok": replay_judge is not None,
+ "judge": acc["replay_judge"],
+ }
+ if not replay_judge:
+ err = (
f"回放对话整体 GLM-5 打分失败:{replay_result.error}"
if replay_result.error
else "回放对话整体 GLM-5 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)"
- ),
- }
+ )
+ acc["errors"].append(err)
+ yield {
+ "event": "error",
+ "phase": "replay_glm5",
+ "message": err,
+ }
+ yield {"event": "done"}
+ return
+
+ if include_turn_judges:
+ replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
+ if replay_pairs:
+ yield {"event": "meta", "phase": "replay_turn_judges_start"}
+ async for row in _iter_turn_judgments_for_turns(
+ judge,
+ replay_pairs,
+ sse_event="replay_turn_judge",
+ ):
+ if row.get("event") == "replay_turn_judge":
+ idx = row.get("turn_index")
+ if isinstance(idx, (int, float)):
+ acc["replay_turn_judges"][str(int(idx))] = row.get(
+ "judge"
+ )
+ yield row
+
+ async for piece in judge.stream_conversation_compare(
+ baseline_transcript=baseline_transcript,
+ replay_transcript=replay_transcript,
+ baseline_judge=baseline_judge,
+ replay_judge=replay_judge,
+ ):
+ if piece:
+ acc["compare_markdown"] += piece
+ yield {"event": "compare_delta", "text": piece}
+
yield {"event": "done"}
- return
-
- if include_turn_judges:
- replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
- if replay_pairs:
- yield {"event": "meta", "phase": "replay_turn_judges_start"}
- async for row in _iter_turn_judgments_for_turns(
- judge,
- replay_pairs,
- sse_event="replay_turn_judge",
- ):
- yield row
-
- async for piece in judge.stream_conversation_compare(
- baseline_transcript=baseline_transcript,
- replay_transcript=replay_transcript,
- baseline_judge=baseline_judge,
- replay_judge=replay_judge,
- ):
- if piece:
- yield {"event": "compare_delta", "text": piece}
-
- yield {"event": "done"}
+ finally:
+ if persist:
+ acc["judged_at"] = datetime.now(timezone.utc).isoformat()
+ await self._persist_playground_conversation_judge(cid, acc)
async def judge_memoir_for_user(
self,
diff --git a/api/app/features/evaluation/router.py b/api/app/features/evaluation/router.py
index 453582a..7bb7ee7 100644
--- a/api/app/features/evaluation/router.py
+++ b/api/app/features/evaluation/router.py
@@ -46,6 +46,7 @@ from app.features.evaluation.schemas import (
ManualJudgeConversationStreamBody,
ManualJudgeMemoirBody,
ManualJudgeMemoirOut,
+ PlaygroundConversationJudgeOut,
MemoirPhase1ReadyOut,
MemoirSectionBaselineOut,
RegressionSetCreate,
@@ -225,6 +226,26 @@ async def get_session_transcript(
)
+@router.get(
+ "/sessions/{conversation_id}/playground-conversation-judge",
+ response_model=PlaygroundConversationJudgeOut,
+)
+async def get_playground_conversation_judge(
+ conversation_id: str,
+ _auth: InternalEvalAuth,
+ db: Annotated[AsyncSession, Depends(get_async_db)],
+):
+ catalog = SessionCatalogService(db)
+ tr = await catalog.get_transcript(conversation_id)
+ if not tr:
+ raise HTTPException(status_code=404, detail="conversation not found")
+ judge = await catalog.get_playground_conversation_judge_json(conversation_id)
+ return PlaygroundConversationJudgeOut(
+ conversation_id=conversation_id,
+ judge=judge,
+ )
+
+
@router.get(
"/sessions/{conversation_id}/memoir-phase1-ready",
response_model=MemoirPhase1ReadyOut,
diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py
index 328a235..9c5685b 100644
--- a/api/app/features/evaluation/schemas.py
+++ b/api/app/features/evaluation/schemas.py
@@ -211,6 +211,13 @@ class ManualJudgeConversationOut(BaseModel):
errors: list[str] = Field(default_factory=list)
+class PlaygroundConversationJudgeOut(BaseModel):
+ """Read-only view of `conversations.playground_conversation_judge_json`."""
+
+ # Id of the conversation the snapshot belongs to.
+ conversation_id: str
+ # Stored judge snapshot as a free-form dict; defaults to None.
+ judge: dict[str, Any] | None = None
+
+
class ManualJudgeMemoirBody(BaseModel):
user_id: str
baseline_sections: list[MemoirSectionBaselineOut] | None = None
diff --git a/api/app/features/evaluation/session_catalog_service.py b/api/app/features/evaluation/session_catalog_service.py
index a37e153..37c907c 100644
--- a/api/app/features/evaluation/session_catalog_service.py
+++ b/api/app/features/evaluation/session_catalog_service.py
@@ -3,6 +3,7 @@
from __future__ import annotations
from dataclasses import dataclass
+from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
@@ -110,3 +111,12 @@ class SessionCatalogService:
user_utterances_from_segments=from_segments,
user_utterances_from_messages=from_messages,
)
+
+ async def get_playground_conversation_judge_json(
+ self, conversation_id: str
+ ) -> dict[str, Any] | None:
+ c = await self._repo.get_conversation(conversation_id)
+ if not c or c.deleted_at:
+ return None
+ raw = c.playground_conversation_judge_json
+ return raw if isinstance(raw, dict) else None
diff --git a/app-eval-web/src/pages/PlaygroundPage.tsx b/app-eval-web/src/pages/PlaygroundPage.tsx
index 3fc8d86..c6923af 100644
--- a/app-eval-web/src/pages/PlaygroundPage.tsx
+++ b/app-eval-web/src/pages/PlaygroundPage.tsx
@@ -130,6 +130,10 @@ export default function PlaygroundPage() {
const [convJudgeStreamText, setConvJudgeStreamText] = useState("");
const [convJudgeErrors, setConvJudgeErrors] = useState([]);
const [convJudgePhase, setConvJudgePhase] = useState("");
+ const [persistedJudgeMeta, setPersistedJudgeMeta] = useState<{
+ judgedAt: string | null;
+ savedFixture: string | null;
+ } | null>(null);
const [fixtureFiles, setFixtureFiles] = useState([]);
const [fixtureName, setFixtureName] = useState("");
@@ -265,7 +269,49 @@ export default function PlaygroundPage() {
})();
}, [fixtureName]);
- function resetJudgeUi() {
+ // Hydrate the judge panel state from a persisted snapshot object `j`.
+ // Every field is read defensively: values of an unexpected type fall back to
+ // the same empty value that a reset would produce.
+ const applyPersistedPlaygroundJudge = useCallback(
+ (j: Record) => {
+ setConvJudgeBaseline(j.baseline_judge ?? null);
+ setConvJudgeReplay(j.replay_judge ?? null);
+ setConvJudgeStreamText(
+ typeof j.compare_markdown === "string" ? j.compare_markdown : "",
+ );
+ // Errors and warnings are merged into one list (errors first) and shown
+ // through the same errors state.
+ const errs: string[] = [];
+ if (Array.isArray(j.errors)) {
+ for (const x of j.errors) errs.push(String(x));
+ }
+ if (Array.isArray(j.warnings)) {
+ for (const x of j.warnings) errs.push(String(x));
+ }
+ setConvJudgeErrors(errs);
+ // Per-turn baseline judges: object keys are converted with Number();
+ // entries whose key converts to NaN are dropped.
+ const bt = j.baseline_turn_judges;
+ if (bt && typeof bt === "object" && !Array.isArray(bt)) {
+ const o: Record = {};
+ for (const [k, v] of Object.entries(bt)) {
+ const i = Number(k);
+ if (!Number.isNaN(i)) o[i] = v;
+ }
+ setBaselineTurnJudges(o);
+ } else {
+ setBaselineTurnJudges({});
+ }
+ // Per-turn replay judges: same key handling as the baseline map above.
+ const rt = j.replay_turn_judges;
+ if (rt && typeof rt === "object" && !Array.isArray(rt)) {
+ const o: Record = {};
+ for (const [k, v] of Object.entries(rt)) {
+ const i = Number(k);
+ if (!Number.isNaN(i)) o[i] = v;
+ }
+ setReplayTurnJudges(o);
+ } else {
+ setReplayTurnJudges({});
+ }
+ // Clear the phase label after applying the snapshot.
+ setConvJudgePhase("");
+ },
+ [],
+ );
+
+ const resetJudgeUi = useCallback(() => {
setConvJudgeBaseline(null);
setConvJudgeReplay(null);
setConvJudgeStreamText("");
@@ -273,7 +319,52 @@ export default function PlaygroundPage() {
setConvJudgePhase("");
setBaselineTurnJudges({});
setReplayTurnJudges({});
- }
+ setPersistedJudgeMeta(null);
+ }, []);
+
+  // Banner text shown when the judge panel was filled from a saved snapshot;
+  // null when there is no snapshot metadata.
+  const playgroundPersistHint = useMemo(() => {
+    if (!persistedJudgeMeta) return null;
+    const parts: string[] = ["已恢复上次保存在服务器上的 GLM 评分"];
+    if (persistedJudgeMeta.judgedAt) {
+      parts.push(`(${persistedJudgeMeta.judgedAt})`);
+    }
+    // Only warn when both names are known and they differ.
+    const fixtureChanged =
+      Boolean(persistedJudgeMeta.savedFixture) &&
+      Boolean(fixtureName) &&
+      persistedJudgeMeta.savedFixture !== fixtureName;
+    if (fixtureChanged) {
+      parts.push(
+        "。当前所选基线导出文件与当时不一致,若要对照基线请重新跑一次自动评分",
+      );
+    }
+    return parts.join("");
+  }, [persistedJudgeMeta, fixtureName]);
+
+ // When the replay conversation id changes, fetch the persisted judge
+ // snapshot for it and restore the judge panel; reset the panel when there is
+ // no id, the request is not ok, or no snapshot is returned.
+ useEffect(() => {
+ const cid = replayConversationId.trim();
+ if (!cid) {
+ resetJudgeUi();
+ return;
+ }
+ // Set by the cleanup function so a response that arrives after the effect
+ // was torn down is ignored.
+ let cancelled = false;
+ void (async () => {
+ const r = await api<{
+ conversation_id: string;
+ judge: Record | null;
+ }>(
+ `/internal/api/evaluation/sessions/${encodeURIComponent(cid)}/playground-conversation-judge`,
+ );
+ if (cancelled) return;
+ if (!r.ok || !r.data?.judge) {
+ resetJudgeUi();
+ return;
+ }
+ const j = r.data.judge;
+ applyPersistedPlaygroundJudge(j);
+ // Keep the timestamp and fixture name from the snapshot for the hint text;
+ // non-string values become null.
+ setPersistedJudgeMeta({
+ judgedAt: typeof j.judged_at === "string" ? j.judged_at : null,
+ savedFixture:
+ typeof j.fixture_filename === "string" ? j.fixture_filename : null,
+ });
+ })();
+ return () => {
+ cancelled = true;
+ };
+ }, [replayConversationId, applyPersistedPlaygroundJudge, resetJudgeUi]);
async function runReplay(resume: boolean) {
if (!fixtureName) {
@@ -519,12 +610,7 @@ export default function PlaygroundPage() {
return;
}
setJudgeConvBusy(true);
- setConvJudgeBaseline(null);
- setConvJudgeReplay(null);
- setConvJudgeStreamText("");
- setConvJudgeErrors([]);
- setBaselineTurnJudges({});
- setReplayTurnJudges({});
+ resetJudgeUi();
setConvJudgePhase("连接评分服务…");
try {
const url = `${apiBase}/internal/api/evaluation/judge/conversation-stream`;
@@ -625,7 +711,11 @@ export default function PlaygroundPage() {
} else if (ev === "done") {
setConvJudgePhase("");
if (!judgeStreamHadError) {
- pushNotice("自动评分流已结束", "success");
+ pushNotice("自动评分流已结束(结果已写入服务器)", "success");
+ setPersistedJudgeMeta({
+ judgedAt: new Date().toISOString(),
+ savedFixture: fixtureName.trim() || null,
+ });
}
}
}
@@ -1026,6 +1116,14 @@ export default function PlaygroundPage() {
先对基线 transcript 与当前会话做整体评分,可按需勾选逐轮
GLM-5;再流式输出差异与建议。请确保所选基线与当前会话一致。
+ {playgroundPersistHint ? (
+
+ {playgroundPersistHint}
+
+ ) : null}
{convJudgePhase ? (
{convJudgePhase}