diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py index 206c8a6..1c936da 100644 --- a/api/app/features/conversation/ws/pipeline.py +++ b/api/app/features/conversation/ws/pipeline.py @@ -629,6 +629,8 @@ async def process_user_message( db: AsyncSession, user: User = None, user_message_timestamp: Optional[datetime] = None, + *, + force_skip_tts: bool = False, ) -> None: """处理用户消息,生成 Agent 回应。由 ChatOrchestrator 路由到 ProfileAgent 或 InterviewAgent。""" store = ConversationHistoryStore(db) @@ -671,7 +673,7 @@ async def process_user_message( turn.skip_tts, ) responses = turn.messages - skip_tts = turn.skip_tts + skip_tts = bool(turn.skip_tts or force_skip_tts) segment.agent_response = AI_RESPONSE_SEGMENT_JOIN.join(responses) _mark_conversation_active(conversation) diff --git a/api/app/features/evaluation/deps.py b/api/app/features/evaluation/deps.py index abaaaad..60e3cc4 100644 --- a/api/app/features/evaluation/deps.py +++ b/api/app/features/evaluation/deps.py @@ -7,9 +7,26 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.core.db import get_async_db from app.features.evaluation.admin_service import EvaluationAdminService +from app.features.evaluation.judge_manual_service import EvalJudgeManualService +from app.features.evaluation.replay_service import ReplayConversationService +from app.features.quota.deps import get_quota_service +from app.features.quota.service import QuotaService def get_evaluation_admin_service( db: Annotated[AsyncSession, Depends(get_async_db)], ) -> EvaluationAdminService: return EvaluationAdminService(db) + + +def get_replay_conversation_service( + db: Annotated[AsyncSession, Depends(get_async_db)], + quota: Annotated[QuotaService, Depends(get_quota_service)], +) -> ReplayConversationService: + return ReplayConversationService(db, quota) + + +def get_eval_judge_manual_service( + db: Annotated[AsyncSession, Depends(get_async_db)], +) -> EvalJudgeManualService: + return 
EvalJudgeManualService(db) diff --git a/api/app/features/evaluation/importers/user_export_markdown.py b/api/app/features/evaluation/importers/user_export_markdown.py index 670590a..205d34b 100644 --- a/api/app/features/evaluation/importers/user_export_markdown.py +++ b/api/app/features/evaluation/importers/user_export_markdown.py @@ -49,3 +49,47 @@ def extract_dialogue_turns_from_export_md(text: str) -> list[tuple[str, str]]: raw_ai = ((ai_m.group(1) if ai_m else "") or "").strip() out.append((u, _normalize_export_ai_block(raw_ai))) return out + + +_MEMOIR_SECTION_HEADER = re.compile( + r"^##\s*回忆录章节(生成正文)\s*$", + re.MULTILINE | re.IGNORECASE, +) + +_IMAGE_REF = re.compile(r"\{\{IMAGE:[^}]*\}\}\s*", re.DOTALL) + + +def extract_source_user_id_from_export_md(text: str) -> str | None: + """匹配导出头 ``**User ID:** `uuid` ``。""" + m = re.search(r"\*\*User ID:\*\*\s*`([0-9a-fA-F-]{36})`", text) + if not m: + return None + return m.group(1).strip() + + +def extract_memoir_chapter_sections_from_export_md(text: str) -> list[tuple[str, str]]: + """从 ``## 回忆录章节(生成正文)`` 起按 ``##`` / ``###`` 标题切分基线正文(去掉 IMAGE 占位)。""" + m = _MEMOIR_SECTION_HEADER.search(text) + if not m: + return [] + tail = (text[m.end() :] or "").strip() + if not tail: + return [] + pieces = re.split(r"\n(?=(?:###\s|##\s+))", tail) + out: list[tuple[str, str]] = [] + for piece in pieces: + piece = piece.strip() + if not piece.startswith("#"): + continue + first_nl = piece.find("\n") + if first_nl == -1: + title = piece.lstrip("#").strip() + body = "" + else: + title = piece[:first_nl].lstrip("#").strip() + body = (piece[first_nl + 1 :] or "").strip() + body = _IMAGE_REF.sub("", body) + body = re.sub(r"\n{3,}", "\n\n", body).strip() + if title and body: + out.append((title, body)) + return out diff --git a/api/app/features/evaluation/judge_manual_service.py b/api/app/features/evaluation/judge_manual_service.py new file mode 100644 index 0000000..09ddedc --- /dev/null +++ 
# Upper bounds applied before content is handed to the judge model.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40


def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip *text* and cut it to *max_chars*, appending a truncation marker.

    Text that already fits is returned stripped and otherwise unchanged; the
    marker is added on top of the *max_chars* kept characters.
    """
    trimmed = (text or "").strip()
    if len(trimmed) > max_chars:
        return f"{trimmed[:max_chars]}\n\n…(已截断供评审)"
    return trimmed


def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    """Render exported ``(user, ai)`` turns as one blank-line separated transcript.

    Empty sides of a turn are omitted; AI text is passed through
    ``_assistant_text_for_eval_display`` before being labelled.
    """
    lines: list[str] = []
    for user_text, ai_text in turns:
        user_text = (user_text or "").strip()
        ai_text = (ai_text or "").strip()
        if user_text:
            lines.append(f"用户: {user_text}")
        if ai_text:
            lines.append(f"AI: {_assistant_text_for_eval_display(ai_text)}")
    return "\n\n".join(lines)


def _normalize_title_key(title: str) -> str:
    """Build a comparison key: lower-cased, no leading ``#`` run, single spaces."""
    lowered = (title or "").strip().lower()
    without_hashes = re.sub(r"^#+\s*", "", lowered)
    return re.sub(r"\s+", " ", without_hashes)
async def judge_conversation(
    self,
    conversation_id: str,
    fixture_filename: str | None,
) -> dict[str, Any]:
    """Score a stored conversation and, optionally, a baseline export fixture.

    Args:
        conversation_id: Conversation whose stored dialogue is judged.
        fixture_filename: Optional user-export fixture; when given, its turns
            form the baseline transcript that is judged as well.

    Returns:
        A dict with ``conversation_id``, ``fixture_filename``, both
        transcripts, ``baseline_judge`` / ``replay_judge`` (dumped judge
        output or ``None``) and ``errors``, a list of short codes describing
        what could not be produced.

    Raises:
        EvaluationBadRequestError: Blank id, nothing to judge, or the fixture
            reader rejected the filename.
        EvaluationNotFoundError: Conversation or fixture does not exist.
    """
    cid = (conversation_id or "").strip()
    if not cid:
        raise EvaluationBadRequestError("conversation_id is required")

    catalog = SessionCatalogService(self._db)
    dialogue = await catalog.get_session_dialogue(cid)
    if not dialogue:
        raise EvaluationNotFoundError("conversation not found")

    parts: list[str] = []
    for m in dialogue.messages:
        r = (m.role or "").lower()
        label = "用户" if r == "human" else "AI"
        raw = m.content or ""
        # Only non-human content goes through the display normalizer.
        out = _assistant_text_for_eval_display(raw) if r != "human" else raw
        parts.append(f"{label}: {out}")
    replay_transcript = "\n\n".join(parts)
    if not replay_transcript.strip():
        raise EvaluationBadRequestError("no messages to judge")

    fn = (fixture_filename or "").strip() or None
    baseline_transcript = ""
    if fn:
        try:
            turns, _ = read_user_export_fixture(fn)
            baseline_transcript = _transcript_from_export_turns(turns)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError as e:
            raise EvaluationNotFoundError("fixture not found") from e

    errors: list[str] = []
    judge_llm = get_eval_judge_langchain_llm()
    if not judge_llm:
        # The streaming variant reports a missing judge model explicitly;
        # surface the same root cause here. The existing "*_glm_failed" codes
        # below are still emitted, so current consumers are unaffected.
        errors.append("judge_llm_not_configured")
    judge = EvalJudgeService(judge_llm)

    baseline_judge_dict: dict[str, Any] | None = None
    if baseline_transcript.strip():
        bj = await judge.judge_conversation(full_transcript=baseline_transcript)
        if bj:
            baseline_judge_dict = bj.model_dump()
        else:
            errors.append("baseline_glm_failed")
    elif fn:
        # A fixture was named but contributed no transcript text.
        errors.append("baseline_transcript_empty")

    rj = await judge.judge_conversation(full_transcript=replay_transcript)
    replay_judge_dict = rj.model_dump() if rj else None
    if not rj:
        errors.append("replay_glm_failed")

    return {
        "conversation_id": cid,
        "fixture_filename": fn,
        "baseline_transcript": baseline_transcript,
        "replay_transcript": replay_transcript,
        "baseline_judge": baseline_judge_dict,
        "replay_judge": replay_judge_dict,
        "errors": errors,
    }
"fixture_filename": fn} + + if not baseline_transcript.strip(): + yield { + "event": "warning", + "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议", + } + + baseline_judge = None + if baseline_transcript.strip(): + baseline_judge = await judge.judge_conversation( + full_transcript=baseline_transcript + ) + yield { + "event": "baseline_judge", + "ok": baseline_judge is not None, + "judge": baseline_judge.model_dump() if baseline_judge else None, + } + if not baseline_judge: + yield { + "event": "error", + "phase": "baseline_glm", + "message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)", + } + else: + yield { + "event": "baseline_judge", + "ok": False, + "skipped": True, + "judge": None, + } + + replay_judge = await judge.judge_conversation(full_transcript=replay_transcript) + yield { + "event": "replay_judge", + "ok": replay_judge is not None, + "judge": replay_judge.model_dump() if replay_judge else None, + } + if not replay_judge: + yield { + "event": "error", + "phase": "replay_glm", + "message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)", + } + yield {"event": "done"} + return + + async for piece in judge.stream_conversation_compare( + baseline_transcript=baseline_transcript, + replay_transcript=replay_transcript, + baseline_judge=baseline_judge, + replay_judge=replay_judge, + ): + if piece: + yield {"event": "compare_delta", "text": piece} + + yield {"event": "done"} + + async def judge_memoir_for_user( + self, + user_id: str, + baseline_sections: list[MemoirSectionBaselineOut] | None, + ) -> dict[str, Any]: + uid = (user_id or "").strip() + if not uid: + raise EvaluationBadRequestError("user_id is required") + + judge_llm = get_eval_judge_langchain_llm() + judge = EvalJudgeService(judge_llm) + baselines = list(baseline_sections or []) + + chapter_results: list[dict[str, Any]] = [] + try: + chapters = await get_chapters_for_memoir_list( + uid, self._db, active_only=True, is_new_only=None + ) + for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]): + body = 
(ch.canonical_markdown or "").strip() + if not body: + continue + bl = _baseline_for_chapter_title(baselines, ch.title or "", i) + baseline_excerpt = "" + if bl and (bl.body or "").strip(): + baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000) + md = f"# 章节:{ch.title}\n\n" + if baseline_excerpt: + md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n" + md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}" + cj = await judge.judge_memoir(memoir_markdown=md) + chapter_results.append( + { + "id": ch.id, + "title": ch.title, + "order_index": ch.order_index, + "baseline_title": bl.title if bl else None, + "judge": cj.model_dump() if cj else None, + } + ) + except Exception as e: + logger.warning("manual memoir chapter judges failed: {}", e) + + story_results: list[dict[str, Any]] = [] + try: + stories = await get_stories_for_user(self._db, uid, status="active") + for st in stories[:_MAX_EVAL_STORIES]: + body = (st.canonical_markdown or "").strip() + if not body: + continue + md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" + sj = await judge.judge_memoir(memoir_markdown=md) + story_results.append( + { + "id": st.id, + "title": st.title, + "stage": st.stage, + "judge": sj.model_dump() if sj else None, + } + ) + except Exception as e: + logger.warning("manual memoir story judges failed: {}", e) + + return { + "user_id": uid, + "chapter_results": chapter_results, + "story_results": story_results, + } + + async def memoir_snapshot(self, user_id: str) -> dict[str, Any]: + uid = (user_id or "").strip() + if not uid: + raise EvaluationBadRequestError("user_id is required") + + chapters_out: list[dict[str, Any]] = [] + stories_out: list[dict[str, Any]] = [] + try: + chapters = await get_chapters_for_memoir_list( + uid, self._db, active_only=True, is_new_only=None + ) + for ch in chapters[:_MAX_EVAL_CHAPTERS]: + chapters_out.append( + { + "id": ch.id, + "title": ch.title, + "category": ch.category, + "order_index": ch.order_index, + "canonical_markdown": ch.canonical_markdown, 
async def stream_conversation_compare(
    self,
    *,
    baseline_transcript: str,
    replay_transcript: str,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
) -> AsyncIterator[str]:
    """Stream a Chinese free-text comparison and advice (not JSON).

    With both judge results the model compares baseline (A) and replay (B);
    with only the replay result it comments on the replay alone.

    Args:
        baseline_transcript: Exported baseline dialogue; may be empty.
        replay_transcript: Dialogue produced by the replay.
        baseline_judge: Overall score of the baseline, if any.
        replay_judge: Overall score of the replay; required.

    Yields:
        Text pieces as they arrive from the model. Configuration problems and
        streaming failures are yielded as bracketed messages rather than
        raised.
    """
    if not self._llm:
        yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)"
        return
    if not replay_judge:
        # Checked before any serialization so this path cannot fail.
        yield "[错误] 缺少回放对话评分,无法生成建议"
        return

    b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
    r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
    # No ``ensure_ascii`` keyword: it is presumably rejected by older
    # Pydantic v2 releases and redundant on newer ones, where non-ASCII text
    # stays unescaped by default. TODO confirm against the pinned version.
    r_json = replay_judge.model_dump_json()
    if baseline_judge:
        b_json = baseline_judge.model_dump_json()
        prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):

【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
{b_tr}

【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
{r_tr}

【A 的整体评分 JSON】
{b_json}

【B 的整体评分 JSON】
{r_json}

请依次撰写:
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。

笔调简洁、偏执行清单。"""
    else:
        prompt = f"""{COMPARE_CONV_STREAM_HINT}

【回放/新测 transcript】
{r_tr}

【整体评分 JSON】
{r_json}
"""

    llm = self._llm
    if hasattr(llm, "bind"):
        # Cap the streamed answer when the model object supports binding.
        llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
    try:
        async for chunk in llm.astream(prompt):
            piece = getattr(chunk, "content", None)
            if piece:
                yield piece
    except Exception as e:
        # Best effort: report the interruption in-band so the stream ends cleanly.
        logger.warning("conversation compare stream failed: {}", e)
        yield f"\n\n[流式输出中断:{e}]"
app.features.conversation.service import ConversationService +from app.features.conversation.ws.pipeline import ( + background_runner, + process_user_message, +) +from app.features.evaluation.errors import ( + EvaluationBadRequestError, + EvaluationNotFoundError, +) +from app.features.evaluation.user_export_fixtures import read_user_export_fixture +from app.features.quota.service import QuotaService +from app.features.user.models import User + +logger = get_logger(__name__) + + +class ReplayConversationService: + def __init__(self, db: AsyncSession, quota_service: QuotaService) -> None: + self._db = db + self._quota = quota_service + + async def create_eval_sandbox(self) -> tuple[str, str, str, str]: + """新建仅用于评测的临时用户(唯一伪手机号)+ 新会话。""" + user_id = str(uuid.uuid4()) + phone: str | None = None + for _ in range(8): + candidate = f"eval_{secrets.token_hex(10)}" + existing = await auth_repo.get_user_by_phone(candidate, self._db) + if not existing: + phone = candidate + break + if not phone: + raise EvaluationBadRequestError("could not allocate eval phone") + + user = User( + id=user_id, + phone=phone, + password_hash=hash_password(secrets.token_urlsafe(24)), + nickname="评测临时用户", + subscription_type="free", + created_at=utc_now(), + ) + await auth_repo.create_user(user, self._db) + await self._db.commit() + await self._db.refresh(user) + + conversation_id = str(uuid.uuid4()) + conv_service = ConversationService(self._db, self._quota) + conv, err = await conv_service.ensure_ws_connection(conversation_id, user_id) + if err or not conv: + raise EvaluationBadRequestError(err or "failed to create conversation") + + logger.info( + "eval sandbox user_id={} phone={} conversation_id={}", + user_id, + phone, + conversation_id, + ) + return user_id, conversation_id, phone, user.nickname + + async def bootstrap_conversation(self, user_id: str) -> str: + uid = (user_id or "").strip() + if not uid: + raise EvaluationBadRequestError("user_id is required") + user = await 
async def replay_utterances(
    self,
    *,
    conversation_id: str,
    utterances: list[str],
    flush_memoir_after: bool,
    skip_tts: bool,
) -> int:
    """Feed user utterances through the message pipeline one turn at a time.

    For every non-blank utterance a user segment is created, the segment is
    queued on ``background_runner`` and ``process_user_message`` is awaited.

    Args:
        conversation_id: Existing, non-deleted conversation to replay into.
        utterances: User texts in order; blank entries are skipped.
        flush_memoir_after: Flush the owner's pending background work at the end.
        skip_tts: Forwarded to the pipeline as ``force_skip_tts``.

    Returns:
        Number of turns actually replayed.

    Raises:
        EvaluationBadRequestError: Blank id, or the conversation has no user.
        EvaluationNotFoundError: Conversation missing or soft-deleted.
    """
    cid = (conversation_id or "").strip()
    if not cid:
        raise EvaluationBadRequestError("conversation_id is required")

    conv = await self._db.get(Conversation, cid)
    if not (conv and conv.deleted_at is None):
        raise EvaluationNotFoundError("conversation not found")

    user = await self._db.get(User, conv.user_id)
    if not user:
        raise EvaluationBadRequestError("user not found for conversation")

    service = ConversationService(self._db, self._quota)
    replayed = 0
    for utterance in utterances:
        text = (utterance or "").strip()
        if text:
            segment = await service.create_user_segment(conv, conv.user_id, text)
            # Fall back to the conversation's last activity when the segment
            # carries no creation time.
            sent_at = segment.created_at or conv.last_message_at
            await background_runner.queue_message(
                conv.user_id,
                segment.id,
                text_char_count=len(text),
            )
            await process_user_message(
                conversation_id=cid,
                user_message=text,
                conversation=conv,
                segment=segment,
                db=self._db,
                user=user,
                user_message_timestamp=sent_at,
                force_skip_tts=skip_tts,
            )
            replayed += 1

    if flush_memoir_after and conv.user_id:
        await background_runner.flush_pending(conv.user_id)

    logger.info(
        "eval replay done conversation_id={} turns={} flush={} skip_tts={}",
        cid,
        replayed,
        flush_memoir_after,
        skip_tts,
    )
    return replayed
app.features.evaluation.schemas import ( CaseCreate, CaseOut, EvalRunOut, + EvalSandboxOut, ExperimentCreate, ExperimentDetailOut, ExperimentOut, GateVerdictOut, ImportJsonCaseBody, ImportMarkdownBody, + ManualJudgeConversationBody, + ManualJudgeConversationOut, + ManualJudgeConversationStreamBody, + ManualJudgeMemoirBody, + ManualJudgeMemoirOut, + MemoirSectionBaselineOut, RegressionSetCreate, RegressionSetOut, + ReplayBootstrapBody, + ReplayBootstrapOut, + ReplayConversationBody, + ReplayConversationOut, SessionDialogueOut, SessionEvalRunsOut, SessionListItem, @@ -37,10 +60,12 @@ from app.features.evaluation.schemas import ( UserExportFixtureDetailOut, UserExportFixtureListOut, UserExportFixtureTurnOut, + UserMemoirSnapshotOut, VersionCreate, VersionOut, ) from app.features.evaluation.session_catalog_service import SessionCatalogService +from app.features.evaluation.user_export_fixtures import read_user_export_fixture router = APIRouter(tags=["internal-evaluation"]) @@ -209,6 +234,175 @@ async def list_session_evaluation_runs( return await svc.list_session_evaluation_runs(conversation_id) +@router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut) +async def replay_bootstrap( + body: ReplayBootstrapBody, + _auth: InternalEvalAuth, + replay: Annotated[ + ReplayConversationService, Depends(get_replay_conversation_service) + ], +): + try: + cid = await replay.bootstrap_conversation(body.user_id) + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e + return ReplayBootstrapOut(conversation_id=cid) + + +@router.post("/sessions/eval-sandbox", response_model=EvalSandboxOut) +async def create_eval_sandbox( + _auth: InternalEvalAuth, + replay: Annotated[ + ReplayConversationService, Depends(get_replay_conversation_service) + ], +): + try: + uid, cid, phone, nick = await replay.create_eval_sandbox() + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e + return EvalSandboxOut( + user_id=uid, + conversation_id=cid, 
@router.post("/replay/conversation", response_model=ReplayConversationOut)
async def replay_conversation(
    body: ReplayConversationBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Replay user turns into a conversation from a fixture or an inline list.

    Exactly one of ``fixture_filename`` / ``user_utterances`` must be given.
    Evaluation errors raised by the service are translated with
    ``_eval_http_exc``; the response echoes the utterances that were used.
    """
    if body.fixture_filename and body.user_utterances:
        raise HTTPException(
            status_code=400,
            detail="provide only one of fixture_filename or user_utterances",
        )

    shared_options = {
        "conversation_id": body.conversation_id,
        "flush_memoir_after": body.flush_memoir_after,
        "skip_tts": body.skip_tts,
    }
    try:
        if body.fixture_filename:
            turns_replayed, echoed = await replay.replay_fixture(
                fixture_filename=body.fixture_filename.strip(),
                **shared_options,
            )
        elif body.user_utterances is not None:
            # Keep only entries that contain something besides whitespace.
            echoed = [str(u) for u in body.user_utterances if str(u).strip()]
            if not echoed:
                raise EvaluationBadRequestError("user_utterances is empty")
            turns_replayed = await replay.replay_utterances(
                utterances=echoed,
                **shared_options,
            )
        else:
            raise EvaluationBadRequestError(
                "fixture_filename or user_utterances required"
            )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as e:
        raise _eval_http_exc(e) from e

    return ReplayConversationOut(
        conversation_id=body.conversation_id,
        turns_replayed=turns_replayed,
        utterances_echo=echoed,
    )
return ManualJudgeConversationOut.model_validate(payload) + + +@router.post("/judge/conversation-stream") +async def judge_conversation_manual_stream( + body: ManualJudgeConversationStreamBody, + _auth: InternalEvalAuth, + judge_svc: Annotated[ + EvalJudgeManualService, Depends(get_eval_judge_manual_service) + ], +): + async def event_iter(): + try: + async for evt in judge_svc.iter_conversation_judge_sse( + body.conversation_id, + body.fixture_filename, + ): + yield f"data: {json.dumps(evt, ensure_ascii=False)}\n\n" + except Exception as e: + err = json.dumps( + {"event": "error", "phase": "server", "message": str(e)}, + ensure_ascii=False, + ) + yield f"data: {err}\n\n" + yield f"data: {json.dumps({'event': 'done'}, ensure_ascii=False)}\n\n" + + return StreamingResponse( + event_iter(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +@router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut) +async def judge_memoir_chapters_manual( + body: ManualJudgeMemoirBody, + _auth: InternalEvalAuth, + judge_svc: Annotated[ + EvalJudgeManualService, Depends(get_eval_judge_manual_service) + ], +): + try: + payload = await judge_svc.judge_memoir_for_user( + body.user_id, + body.baseline_sections, + ) + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e + return ManualJudgeMemoirOut.model_validate(payload) + + +@router.get("/users/{user_id}/memoir-snapshot", response_model=UserMemoirSnapshotOut) +async def get_user_memoir_snapshot( + user_id: str, + _auth: InternalEvalAuth, + judge_svc: Annotated[ + EvalJudgeManualService, Depends(get_eval_judge_manual_service) + ], +): + try: + payload = await judge_svc.memoir_snapshot(user_id) + except EvaluationBadRequestError as e: + raise _eval_http_exc(e) from e + return UserMemoirSnapshotOut.model_validate(payload) + + @router.get( "/fixtures/user-exports", response_model=UserExportFixtureListOut, @@ 
-227,19 +421,23 @@ async def list_user_export_fixtures( async def get_user_export_fixture( filename: str, _auth: InternalEvalAuth, - svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)], ): try: - turns = svc.load_user_export_fixture_turns(filename) + turns, raw_md = read_user_export_fixture(filename) except ValueError: raise HTTPException( status_code=400, detail="invalid fixture filename" ) from None except FileNotFoundError: raise HTTPException(status_code=404, detail="fixture not found") from None + memoir_tuples = extract_memoir_chapter_sections_from_export_md(raw_md) return UserExportFixtureDetailOut( filename=filename, turns=[UserExportFixtureTurnOut(user=u, ai=a) for u, a in turns], + source_user_id=extract_source_user_id_from_export_md(raw_md), + memoir_sections=[ + MemoirSectionBaselineOut(title=t, body=b) for t, b in memoir_tuples + ], ) diff --git a/api/app/features/evaluation/rubrics/conversation_v1.py b/api/app/features/evaluation/rubrics/conversation_v1.py index f819eaa..a052119 100644 --- a/api/app/features/evaluation/rubrics/conversation_v1.py +++ b/api/app/features/evaluation/rubrics/conversation_v1.py @@ -22,3 +22,10 @@ CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 trans dimension_scores 建议至少包含:emotion, information, structure, repetition, naturalness(各 0-100 相对分量即可),用于反映整段是否重复盘问、是否自然;另可有 rationale。 只输出 JSON:total_score, dimension_scores, rationale。""" + + +COMPARE_CONV_STREAM_HINT = """你是访谈对话评测专家。下面给出一份「回放/新测」完整对话 transcript 及其整体评分(JSON)。请用中文直接写正文(不要用 JSON): +1) 对这段对话的整体评价与风险点; +2) 可操作的改进建议(提示词、流程、模型参数等)。 + +笔调简洁、可执行。""" diff --git a/api/app/features/evaluation/schemas.py b/api/app/features/evaluation/schemas.py index 386f68a..27e227a 100644 --- a/api/app/features/evaluation/schemas.py +++ b/api/app/features/evaluation/schemas.py @@ -133,9 +133,100 @@ class UserExportFixtureListOut(BaseModel): items: list[str] +class MemoirSectionBaselineOut(BaseModel): + title: str + body: str + + class 
UserExportFixtureDetailOut(BaseModel): filename: str turns: list[UserExportFixtureTurnOut] + source_user_id: str | None = None + memoir_sections: list[MemoirSectionBaselineOut] = Field(default_factory=list) + + +class ReplayBootstrapBody(BaseModel): + user_id: str + + +class ReplayBootstrapOut(BaseModel): + conversation_id: str + + +class EvalSandboxOut(BaseModel): + """内部评测专用:一次性临时账号 + 空白会话,不落真实手机号业务。""" + + user_id: str + conversation_id: str + phone: str + nickname: str + + +class ReplayConversationBody(BaseModel): + conversation_id: str + fixture_filename: str | None = None + user_utterances: list[str] | None = None + flush_memoir_after: bool = True + skip_tts: bool = True + + +class ReplayConversationOut(BaseModel): + conversation_id: str + turns_replayed: int + utterances_echo: list[str] = Field(default_factory=list) + + +class ManualJudgeConversationBody(BaseModel): + conversation_id: str + """与当前评测台选中的 MD 一致,供基准 transcript / 整体打分。""" + fixture_filename: str | None = None + + +class ManualJudgeConversationStreamBody(BaseModel): + conversation_id: str + fixture_filename: str | None = None + + +class ManualJudgeConversationOut(BaseModel): + conversation_id: str + fixture_filename: str | None = None + baseline_transcript: str = "" + replay_transcript: str + baseline_judge: dict[str, Any] | None = None + replay_judge: dict[str, Any] | None = None + errors: list[str] = Field(default_factory=list) + + +class ManualJudgeMemoirBody(BaseModel): + user_id: str + baseline_sections: list[MemoirSectionBaselineOut] | None = None + + +class ManualJudgeMemoirOut(BaseModel): + user_id: str + chapter_results: list[dict[str, Any]] = Field(default_factory=list) + story_results: list[dict[str, Any]] = Field(default_factory=list) + + +class MemoirChapterSnapOut(BaseModel): + id: str + title: str + category: str | None = None + order_index: int | None = None + canonical_markdown: str | None = None + + +class MemoirStorySnapOut(BaseModel): + id: str + title: str + stage: str | None 
= None + canonical_markdown: str | None = None + + +class UserMemoirSnapshotOut(BaseModel): + user_id: str + chapters: list[MemoirChapterSnapOut] + stories: list[MemoirStorySnapOut] class SnapshotFromConversationBody(BaseModel): diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md index 25b80b1..732d454 100644 --- a/api/docs/internal-eval.md +++ b/api/docs/internal-eval.md @@ -55,6 +55,39 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev 浏览器 `EventSource` 无法带自定义 Header,流式端点支持 **query** `?key=`,与 `X-Internal-Eval-Key` 等效。 +## 评测 Web:两大模块 + +- **对话评测**:选 `api/tests/user_exports/*.md` 为基准 →「新建评测会话」或填写已有 `conversation_id` →「执行回放」→「GLM 评审对话」。 +- **回忆录章节**:同一套 fixture 会带上导出 MD 中的 `source_user_id` 与 `memoir_sections`;「刷新库中章节/故事」拉 DB 快照 →「GLM 评审章节」(基线节选与当前成稿一并送评)。 + +## 真实链路透传回放(与 App 一致) + +| 方法 | 路径 | 说明 | +|------|------|------| +| `POST` | `/internal/api/evaluation/sessions/eval-sandbox` | 无 body:新建**临时用户**(`eval_` 伪手机号)+ 空白 `conversation_id` | +| `POST` | `/internal/api/evaluation/sessions/replay-bootstrap` | body:`{ "user_id" }`,在已有用户下返回新 `conversation_id` | +| `POST` | `/internal/api/evaluation/replay/conversation` | body:`conversation_id`、`fixture_filename` **或** `user_utterances`;可选 `flush_memoir_after`(默认 true)、`skip_tts`(默认 true) | + +每轮等价于 WebSocket 文本路径:`create_user_segment` → `process_user_message`(内部可 `force_skip_tts`)→ `background_runner.queue_message`。 + +- **TTS**:回放默认 `skip_tts: true`,不在评测台跑语音合成。 +- **Memory / 回忆录管线**:`queue_message` 与末尾 `flush_pending` 依赖 **Celery worker**(`process_memoir_phase1` 等);仅起 internal API 未起 worker 时,对话会落库但章节异步不会推进。 + +## 手动 GLM(不写 `eval_runs` 表) + +| 方法 | 路径 | 说明 | +|------|------|------| +| `POST` | `/internal/api/evaluation/judge/conversation` | body:`{ "conversation_id" }`,返回轮次分 + 全文对话分 | +| `POST` | `/internal/api/evaluation/judge/memoir-chapters` | body:`{ "user_id", "baseline_sections"? 
}`,Chapter/Story 分项 | +| `GET` | `/internal/api/evaluation/users/{user_id}/memoir-snapshot` | 只读章节与故事正文快照 | + +## Fixture 详情扩展 + +`GET /internal/api/evaluation/fixtures/user-exports/{filename}` 在原有 `turns` 外增加: + +- `source_user_id`:导出抬头中的 User ID +- `memoir_sections`:`## 回忆录章节(生成正文)` 下按标题切分的基线正文(已去掉 `{{IMAGE:...}}` 占位) + ## 门禁规则(v1) - 所有 case 的合成均分:候选须 **严格高于** 基线。 diff --git a/api/tests/evaluation/test_importers.py b/api/tests/evaluation/test_importers.py index 329f4ee..b88c152 100644 --- a/api/tests/evaluation/test_importers.py +++ b/api/tests/evaluation/test_importers.py @@ -5,6 +5,8 @@ import pytest from app.features.evaluation.importers.user_export_markdown import ( extract_dialogue_turns_from_export_md, + extract_memoir_chapter_sections_from_export_md, + extract_source_user_id_from_export_md, extract_user_utterances_from_export_md, ) @@ -72,3 +74,32 @@ def test_extract_dialogue_turns_from_repo_user_export() -> None: turns = extract_dialogue_turns_from_export_md(text) assert len(turns) >= 5 assert "你好" in turns[0][0] + + +def test_extract_source_user_id_from_export_md() -> None: + md = "- **User ID:** `e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0`\n" + assert ( + extract_source_user_id_from_export_md(md) + == "e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0" + ) + + +def test_extract_memoir_chapter_sections_from_export_md() -> None: + md = """ +## 回忆录章节(生成正文) + +### First chapter + +Line a. +{{IMAGE:foo}} + +### Second title + +Line b. +""" + sections = extract_memoir_chapter_sections_from_export_md(md) + assert len(sections) == 2 + assert sections[0][0] == "First chapter" + assert "Line a." 
in sections[0][1] + assert "{{IMAGE" not in sections[0][1] + assert sections[1][0] == "Second title" diff --git a/api/tests/evaluation/test_replay_router.py b/api/tests/evaluation/test_replay_router.py new file mode 100644 index 0000000..23724a7 --- /dev/null +++ b/api/tests/evaluation/test_replay_router.py @@ -0,0 +1,74 @@ +"""回放 / 评审路由参数校验(最小 HTTP)。""" + +import pytest +from httpx import ASGITransport, AsyncClient + +from app.features.evaluation.internal_auth import get_internal_eval_principal + + +@pytest.mark.asyncio +async def test_replay_conversation_requires_fixture_or_utterances( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from fastapi import FastAPI + + monkeypatch.setattr( + "app.core.config.settings.internal_eval_api_key", + "secret", + raising=False, + ) + from app.features.evaluation.router import router + + app = FastAPI() + app.include_router(router, prefix="/internal/api/evaluation") + + async def _override_auth(): + from app.features.evaluation.internal_auth import InternalEvalPrincipal + + return InternalEvalPrincipal() + + app.dependency_overrides[get_internal_eval_principal] = _override_auth + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://t") as client: + r = await client.post( + "/internal/api/evaluation/replay/conversation", + headers={"X-Internal-Eval-Key": "secret"}, + json={"conversation_id": "00000000-0000-0000-0000-000000000001"}, + ) + assert r.status_code == 400 + + +@pytest.mark.asyncio +async def test_replay_conversation_rejects_both_fixture_and_utterances( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from fastapi import FastAPI + + monkeypatch.setattr( + "app.core.config.settings.internal_eval_api_key", + "secret", + raising=False, + ) + from app.features.evaluation.router import router + + app = FastAPI() + app.include_router(router, prefix="/internal/api/evaluation") + + async def _override_auth(): + from app.features.evaluation.internal_auth import InternalEvalPrincipal 
+ + return InternalEvalPrincipal() + + app.dependency_overrides[get_internal_eval_principal] = _override_auth + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://t") as client: + r = await client.post( + "/internal/api/evaluation/replay/conversation", + headers={"X-Internal-Eval-Key": "secret"}, + json={ + "conversation_id": "00000000-0000-0000-0000-000000000001", + "fixture_filename": "x.md", + "user_utterances": ["a"], + }, + ) + assert r.status_code == 400 diff --git a/app-eval-web/src/App.tsx b/app-eval-web/src/App.tsx index ee4ebbd..b24cf10 100644 --- a/app-eval-web/src/App.tsx +++ b/app-eval-web/src/App.tsx @@ -1,4 +1,4 @@ -import { useCallback, useEffect, useState } from "react"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; const envApiBase = ( import.meta.env.VITE_EVAL_API_BASE as string | undefined @@ -21,8 +21,6 @@ const apiBaseHint = const SESSION_LIST_POLL_MS = 4000; /** 对比页左侧线上对话轮询 */ const DIALOGUE_POLL_MS = 3500; -/** 对比页右侧 GLM / 评测 run 轮询 */ -const SESSION_EVAL_POLL_MS = 8000; /** 高级页回归集 / 实验列表轮询 */ const ADMIN_POLL_MS = 8000; @@ -35,35 +33,94 @@ async function api( init?: RequestInit, ): Promise<{ ok: boolean; data?: T; error?: string; status: number }> { const url = `${apiBase}${path.startsWith("/") ? path : `/${path}`}`; - const r = await fetch(url, { - ...init, - headers: { - "X-Internal-Eval-Key": apiKey, - "Content-Type": "application/json", - ...(init?.headers ?? {}), - }, - }); - const text = await r.text(); - let data: T | undefined; try { - data = text ? (JSON.parse(text) as T) : undefined; - } catch { - /* ignore */ - } - if (!r.ok) { + const r = await fetch(url, { + ...init, + headers: { + "X-Internal-Eval-Key": apiKey, + "Content-Type": "application/json", + ...(init?.headers ?? {}), + }, + signal: init?.signal, + }); + const text = await r.text(); + let data: T | undefined; + try { + data = text ? 
(JSON.parse(text) as T) : undefined; + } catch { + /* ignore */ + } + if (!r.ok) { + return { + ok: false, + status: r.status, + error: + typeof data === "object" && + data && + "detail" in (data as object) && + data !== null + ? String((data as unknown as { detail: unknown }).detail) + : text || r.statusText, + }; + } + return { ok: true, data, status: r.status }; + } catch (e: unknown) { + const name = e instanceof Error ? e.name : ""; + if (name === "AbortError") { + return { ok: false, status: 0, error: "aborted" }; + } return { ok: false, - status: r.status, - error: - typeof data === "object" && - data && - "detail" in (data as object) && - data !== null - ? String((data as unknown as { detail: unknown }).detail) - : text || r.statusText, + status: 0, + error: e instanceof Error ? e.message : "network error", }; } - return { ok: true, data, status: r.status }; +} + +/** 与后端 replay 一致:strip 后非空的用户句。 */ +function utterancesForReplayFromTurns( + turns: { user: string; ai: string }[], +): string[] { + return turns + .map((t) => (t.user || "").trim()) + .filter((u) => u.length > 0 && u !== "(空)"); +} + +/** 将 DB 消息序合并为「一轮用户 + 拼接后的 AI」(多段 AGENT_RESPONSE)。 */ +function pairDialogueTurns( + messages: DialogueMessage[], +): { user: string; assistant: string }[] { + const out: { user: string; assistant: string }[] = []; + let currentUser: string | null = null; + const aiAccum: string[] = []; + + const closeTurn = () => { + if (currentUser !== null) { + out.push({ + user: currentUser, + assistant: aiAccum.join("\n\n").trim(), + }); + currentUser = null; + aiAccum.length = 0; + } + }; + + for (const m of messages) { + const r = (m.role || "").toLowerCase(); + if (r === "human") { + closeTurn(); + currentUser = (m.content || "").trim(); + } else { + const t = (m.content || "").trim(); + if (t) aiAccum.push(t); + } + } + closeTurn(); + return out; +} + +function normTurnText(s: string): string { + return (s || "").replace(/\r\n/g, "\n").trim(); } type SessionItem = { @@ 
-83,41 +140,6 @@ type DialogueMessage = { created_at?: string | null; }; -type RunTurnOut = { - id: string; - turn_index: number; - user_utterance: string; - assistant_reply: string | null; - duration_ms: number | null; - judge_scores_json: Record | null; - judge_rationale: string | null; -}; - -type EvalRunOut = { - id: string; - experiment_id: string; - case_id: string; - side: string; - status: string; - error_message: string | null; - memoir_markdown: string | null; - conversation_score_total: number | null; - memoir_score_total: number | null; - composite_score: number | null; - judge_bundle_json: Record | null; - turns: RunTurnOut[]; -}; - -type SessionEvalRunItem = { - experiment_name: string; - run: EvalRunOut; -}; - -function fmtScore(n: unknown): string { - if (typeof n === "number" && !Number.isNaN(n)) return n.toFixed(1); - return "—"; -} - function JsonPreview({ value }: { value: unknown }) { if (value == null) return ; return ( @@ -174,11 +196,16 @@ function formatTime(iso: string | null | undefined) { } } +type FixtureDetailResponse = { + turns: { user: string; ai: string }[]; + source_user_id?: string | null; + memoir_sections?: { title: string; body: string }[]; +}; + export default function App() { - const [view, setView] = useState<"home" | "session" | "admin">("home"); + const [mainView, setMainView] = useState<"conv" | "memoir" | "admin">("conv"); const [msg, setMsg] = useState(""); const [sessions, setSessions] = useState([]); - const [selectedId, setSelectedId] = useState(null); const [dialogue, setDialogue] = useState([]); const [fallbackUserLines, setFallbackUserLines] = useState([]); @@ -186,12 +213,25 @@ export default function App() { const [versions, setVersions] = useState<{ id: string; name: string }[]>([]); - const [sessionEvalItems, setSessionEvalItems] = useState( - [], - ); - const [sessionEvalUpdatedAt, setSessionEvalUpdatedAt] = useState( - null, - ); + const [evalUserId, setEvalUserId] = useState(""); + const 
[replayConversationId, setReplayConversationId] = useState(""); + const [replayBusy, setReplayBusy] = useState(false); + const [replayProgress, setReplayProgress] = useState<{ + current: number; + total: number; + } | null>(null); + const replayAbortRef = useRef(null); + const [judgeConvBusy, setJudgeConvBusy] = useState(false); + const [convJudgeBaseline, setConvJudgeBaseline] = useState(null); + const [convJudgeReplay, setConvJudgeReplay] = useState(null); + const [convJudgeStreamText, setConvJudgeStreamText] = useState(""); + const [convJudgeErrors, setConvJudgeErrors] = useState([]); + const [convJudgePhase, setConvJudgePhase] = useState(""); + const [memoirSnapshot, setMemoirSnapshot] = useState(null); + const [memoirSnapBusy, setMemoirSnapBusy] = useState(false); + const [memoirJudgeBusy, setMemoirJudgeBusy] = useState(false); + const [manualMemoirJudge, setManualMemoirJudge] = useState(null); + const [showSessionPicker, setShowSessionPicker] = useState(false); const [adminTab, setAdminTab] = useState< "sets" | "versions" | "experiments" @@ -218,6 +258,32 @@ export default function App() { const [fixtureTurns, setFixtureTurns] = useState< { user: string; ai: string }[] >([]); + const [fixtureMemoirSections, setFixtureMemoirSections] = useState< + { title: string; body: string }[] + >([]); + + const turnAlignment = useMemo(() => { + const base = utterancesForReplayFromTurns(fixtureTurns); + const pairs = pairDialogueTurns(dialogue); + const n = Math.max(base.length, pairs.length); + const rows: { + index: number; + baselineUser: string; + dbUser: string; + match: boolean; + }[] = []; + for (let i = 0; i < n; i++) { + const b = base[i] ?? ""; + const p = pairs[i]?.user ?? 
""; + rows.push({ + index: i + 1, + baselineUser: b, + dbUser: p, + match: normTurnText(b) === normTurnText(p), + }); + } + return rows; + }, [fixtureTurns, dialogue]); /** 近期全部:含已结束会话;仅进行中:status=active(多数字段在用户挂断后为 ended,列表会空) */ const [sessionFilter, setSessionFilter] = useState<"recent" | "active">( @@ -250,50 +316,51 @@ export default function App() { if (r.ok && r.data) setVersions(r.data); }, []); - const pullSessionEvalRuns = useCallback(async (conversationId: string) => { - const r = await api<{ items: SessionEvalRunItem[] }>( - `/internal/api/evaluation/sessions/${conversationId}/evaluation-runs`, - ); - if (r.ok && r.data?.items) setSessionEvalItems(r.data.items); - else setSessionEvalItems([]); - setSessionEvalUpdatedAt(new Date()); - }, []); - - const pullDialogue = useCallback(async (conversationId: string) => { - const d = await api<{ messages: DialogueMessage[] }>( - `/internal/api/evaluation/sessions/${conversationId}/dialogue`, - ); - if (d.ok && d.data?.messages?.length) { - setDialogue(d.data.messages); - setFallbackUserLines([]); - } else { - const t = await api<{ - user_utterances_from_messages: string[]; - user_utterances_from_segments: string[]; - }>(`/internal/api/evaluation/sessions/${conversationId}/transcript`); - if (t.ok && t.data) { - const lines = - t.data.user_utterances_from_messages.length > 0 - ? 
t.data.user_utterances_from_messages - : t.data.user_utterances_from_segments; - setDialogue([]); - setFallbackUserLines(lines); + const pullDialogue = useCallback( + async (conversationId: string, signal?: AbortSignal) => { + const d = await api<{ messages: DialogueMessage[] }>( + `/internal/api/evaluation/sessions/${conversationId}/dialogue`, + { signal }, + ); + if (d.error === "aborted") return; + if (d.ok && d.data?.messages?.length) { + setDialogue(d.data.messages); + setFallbackUserLines([]); + } else { + const t = await api<{ + user_utterances_from_messages: string[]; + user_utterances_from_segments: string[]; + }>(`/internal/api/evaluation/sessions/${conversationId}/transcript`, { + signal, + }); + if (t.error === "aborted") return; + if (t.ok && t.data) { + const lines = + t.data.user_utterances_from_messages.length > 0 + ? t.data.user_utterances_from_messages + : t.data.user_utterances_from_segments; + setDialogue([]); + setFallbackUserLines(lines); + } } - } - setDialogueUpdatedAt(new Date()); + setDialogueUpdatedAt(new Date()); + }, + [], + ); + + const stopReplay = useCallback(() => { + replayAbortRef.current?.abort(); }, []); - const loadSessionPageInitial = useCallback( - (conversationId: string) => { - setLoadingLeft(true); - setDialogue([]); - setFallbackUserLines([]); - setSessionEvalItems([]); - setSessionEvalUpdatedAt(null); - void pullDialogue(conversationId).finally(() => setLoadingLeft(false)); - }, - [pullDialogue], - ); + useEffect(() => { + const ac = replayAbortRef; + const onPageHide = () => ac.current?.abort(); + window.addEventListener("pagehide", onPageHide); + return () => { + window.removeEventListener("pagehide", onPageHide); + ac.current?.abort(); + }; + }, []); const refreshAdminData = useCallback(async () => { const rs = await api<{ id: string; name: string }[]>( @@ -328,34 +395,37 @@ export default function App() { setEvalReachable("bad"); } })(); + }, []); + + useEffect(() => { + if (mainView !== "conv") return; void 
refreshSessionList(); const t = setInterval(() => void refreshSessionList(), SESSION_LIST_POLL_MS); return () => clearInterval(t); - }, [refreshSessionList]); + }, [mainView, refreshSessionList]); useEffect(() => { void refreshVersions(); }, [refreshVersions]); useEffect(() => { - if (view !== "session" || !selectedId) return; + if (mainView !== "conv" || !replayConversationId.trim()) return; + let cancelled = false; + setLoadingLeft(true); + void pullDialogue(replayConversationId).finally(() => { + if (!cancelled) setLoadingLeft(false); + }); const t = setInterval(() => { - void pullDialogue(selectedId); + void pullDialogue(replayConversationId); }, DIALOGUE_POLL_MS); - return () => clearInterval(t); - }, [view, selectedId, pullDialogue]); + return () => { + cancelled = true; + clearInterval(t); + }; + }, [mainView, replayConversationId, pullDialogue]); useEffect(() => { - if (view !== "session" || !selectedId) return; - void pullSessionEvalRuns(selectedId); - const t = setInterval(() => { - void pullSessionEvalRuns(selectedId); - }, SESSION_EVAL_POLL_MS); - return () => clearInterval(t); - }, [view, selectedId, pullSessionEvalRuns]); - - useEffect(() => { - if (view !== "session" || !selectedId) return; + if (mainView !== "conv" && mainView !== "memoir") return; void (async () => { const r = await api<{ items: string[] }>( "/internal/api/evaluation/fixtures/user-exports", @@ -373,33 +443,328 @@ export default function App() { return items[0] ?? 
""; }); })(); - }, [view, selectedId]); + }, [mainView]); useEffect(() => { - if (view !== "session" || !fixtureName) { + if ((mainView !== "conv" && mainView !== "memoir") || !fixtureName) { setFixtureTurns([]); + setFixtureMemoirSections([]); return; } void (async () => { - const r = await api<{ turns: { user: string; ai: string }[] }>( + const r = await api( `/internal/api/evaluation/fixtures/user-exports/${encodeURIComponent(fixtureName)}`, ); - if (r.ok && r.data?.turns) setFixtureTurns(r.data.turns); - else setFixtureTurns([]); + if (r.ok && r.data?.turns) { + setFixtureTurns(r.data.turns); + setFixtureMemoirSections(r.data.memoir_sections ?? []); + const sid = r.data.source_user_id ?? null; + if (sid && mainView === "memoir") + setEvalUserId((prev) => (prev.trim() ? prev : sid)); + } else { + setFixtureTurns([]); + setFixtureMemoirSections([]); + } })(); - }, [view, fixtureName]); + }, [mainView, fixtureName]); useEffect(() => { - if (view !== "admin") return; + if (mainView !== "admin") return; void refreshAdminData(); const t = setInterval(() => void refreshAdminData(), ADMIN_POLL_MS); return () => clearInterval(t); - }, [view, refreshAdminData]); + }, [mainView, refreshAdminData]); - function openSession(id: string) { - setSelectedId(id); - setView("session"); - loadSessionPageInitial(id); + async function createEvalSandboxOnly() { + const r = await api<{ + user_id: string; + conversation_id: string; + phone: string; + nickname: string; + }>("/internal/api/evaluation/sessions/eval-sandbox", { + method: "POST", + body: "{}", + }); + if (r.ok && r.data) { + setEvalUserId(r.data.user_id); + setReplayConversationId(r.data.conversation_id); + setConvJudgeBaseline(null); + setConvJudgeReplay(null); + setConvJudgeStreamText(""); + setConvJudgeErrors([]); + setConvJudgePhase(""); + setDialogue([]); + setFallbackUserLines([]); + setMsg( + `评测沙箱就绪:临时手机号 ${r.data.phone},user_id / conversation_id 已填入(可随时「新沙箱」清空重来)。`, + ); + } else { + setMsg(r.error ?? 
"创建沙箱失败"); + } + } + + async function bootstrapReplaySession() { + const uid = evalUserId.trim(); + if (!uid) { + setMsg("高级选项:请先填写已有用户的 UUID"); + return; + } + const r = await api<{ conversation_id: string }>( + "/internal/api/evaluation/sessions/replay-bootstrap", + { method: "POST", body: JSON.stringify({ user_id: uid }) }, + ); + setMsg( + r.ok + ? `已在该用户下新建会话 ${r.data?.conversation_id ?? ""}` + : (r.error ?? "bootstrap 失败"), + ); + if (r.ok && r.data?.conversation_id) { + setReplayConversationId(r.data.conversation_id); + setConvJudgeBaseline(null); + setConvJudgeReplay(null); + setConvJudgeStreamText(""); + setConvJudgeErrors([]); + setConvJudgePhase(""); + setDialogue([]); + setFallbackUserLines([]); + } + } + + async function runReplay() { + if (!fixtureName) { + setMsg("请选择基准 MD"); + return; + } + const utts = utterancesForReplayFromTurns(fixtureTurns); + if (!utts.length) { + setMsg("当前基准 MD 没有可回放的用户句(请先加载轮次)"); + return; + } + + replayAbortRef.current?.abort(); + const ac = new AbortController(); + replayAbortRef.current = ac; + const { signal } = ac; + + setReplayBusy(true); + setReplayProgress(null); + try { + let cid = replayConversationId.trim(); + if (!cid) { + const sb = await api<{ + user_id: string; + conversation_id: string; + phone: string; + }>("/internal/api/evaluation/sessions/eval-sandbox", { + method: "POST", + body: "{}", + signal, + }); + if (sb.error === "aborted") { + setMsg("回放已中止(关闭/刷新页面或「停止回放」)"); + return; + } + if (!sb.ok || !sb.data) { + setMsg(sb.error ?? 
"自动创建沙箱失败"); + return; + } + setEvalUserId(sb.data.user_id); + setReplayConversationId(sb.data.conversation_id); + cid = sb.data.conversation_id; + setConvJudgeBaseline(null); + setConvJudgeReplay(null); + setConvJudgeStreamText(""); + setConvJudgeErrors([]); + setConvJudgePhase(""); + setDialogue([]); + setFallbackUserLines([]); + } + + let replayed = 0; + for (let i = 0; i < utts.length; i++) { + if (signal.aborted) { + setMsg("回放已中止(关闭/刷新页面或「停止回放」)"); + return; + } + setReplayProgress({ current: i + 1, total: utts.length }); + const last = i === utts.length - 1; + const r = await api<{ + turns_replayed: number; + utterances_echo: string[]; + }>("/internal/api/evaluation/replay/conversation", { + method: "POST", + signal, + body: JSON.stringify({ + conversation_id: cid, + user_utterances: [utts[i]], + flush_memoir_after: last, + skip_tts: true, + }), + }); + if (r.error === "aborted") { + setMsg("回放已中止(关闭/刷新页面或「停止回放」)"); + return; + } + if (!r.ok) { + setMsg(r.error ?? "回放失败"); + return; + } + replayed += r.data?.turns_replayed ?? 
0; + await pullDialogue(cid, signal); + } + + setMsg( + `回放完成:${replayed} 轮(分轮请求,避免长阻塞;当前会话 ${cid.slice(0, 8)}…;最后一轮已 flush 回忆录队列,成稿仍依赖 Celery)`, + ); + } finally { + setReplayBusy(false); + setReplayProgress(null); + } + } + + async function runJudgeConversationStream() { + const cid = replayConversationId.trim(); + if (!cid) { + setMsg("请先有一次会话(执行回放、仅建沙箱或粘贴会话 ID)"); + return; + } + setJudgeConvBusy(true); + setConvJudgeBaseline(null); + setConvJudgeReplay(null); + setConvJudgeStreamText(""); + setConvJudgeErrors([]); + setConvJudgePhase("连接评审服务…"); + try { + const url = `${apiBase}/internal/api/evaluation/judge/conversation-stream`; + const res = await fetch(url, { + method: "POST", + headers: { + "X-Internal-Eval-Key": apiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + conversation_id: cid, + fixture_filename: fixtureName.trim() || null, + }), + }); + if (!res.ok) { + const t = await res.text(); + setMsg(`评审流启动失败:HTTP ${res.status} ${t.slice(0, 240)}`); + setConvJudgePhase(""); + return; + } + const reader = res.body?.getReader(); + if (!reader) { + setMsg("当前环境无法读取响应流"); + setConvJudgePhase(""); + return; + } + const decoder = new TextDecoder(); + let buf = ""; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + const chunks = buf.split("\n\n"); + buf = chunks.pop() ?? ""; + for (const block of chunks) { + const line = block.trim(); + if (!line.startsWith("data: ")) continue; + let evt: Record; + try { + evt = JSON.parse(line.slice(6)) as Record; + } catch { + continue; + } + const ev = evt.event as string | undefined; + if (ev === "meta") { + setConvJudgePhase("GLM:基准整体打分…"); + } else if (ev === "warning") { + setConvJudgeErrors((prev) => [ + ...prev, + String(evt.message ?? "warning"), + ]); + } else if (ev === "baseline_judge") { + setConvJudgeBaseline(evt.judge ?? 
null); + setConvJudgePhase("GLM:回放对话整体打分…"); + } else if (ev === "replay_judge") { + setConvJudgeReplay(evt.judge ?? null); + setConvJudgePhase("GLM:对比与建议(流式输出)…"); + } else if (ev === "compare_delta") { + const piece = String(evt.text ?? ""); + if (piece) + setConvJudgeStreamText((prev) => prev + piece); + } else if (ev === "error") { + setConvJudgeErrors((prev) => [ + ...prev, + `${String(evt.phase ?? "error")}: ${String(evt.message ?? "")}`, + ]); + } else if (ev === "done") { + setConvJudgePhase(""); + setMsg("GLM 对话评审流已结束"); + } + } + } + } catch (e) { + setConvJudgeErrors((prev) => [ + ...prev, + e instanceof Error ? e.message : "评审流异常", + ]); + setMsg(e instanceof Error ? e.message : "评审流异常"); + } finally { + setJudgeConvBusy(false); + setConvJudgePhase(""); + } + } + + async function runMemoirSnapshot() { + const uid = evalUserId.trim(); + if (!uid) { + setMsg("请填写用户 ID"); + return; + } + setMemoirSnapBusy(true); + try { + const r = await api( + `/internal/api/evaluation/users/${encodeURIComponent(uid)}/memoir-snapshot`, + ); + setMsg(r.ok ? "已刷新库中章节 / 故事列表" : (r.error ?? "加载失败")); + if (r.ok) setMemoirSnapshot(r.data); + } finally { + setMemoirSnapBusy(false); + } + } + + async function runJudgeMemoir() { + const uid = evalUserId.trim(); + if (!uid) { + setMsg("请填写用户 ID"); + return; + } + setMemoirJudgeBusy(true); + try { + const r = await api("/internal/api/evaluation/judge/memoir-chapters", { + method: "POST", + body: JSON.stringify({ + user_id: uid, + baseline_sections: fixtureMemoirSections.length + ? fixtureMemoirSections + : null, + }), + }); + setMsg(r.ok ? "GLM 章节评审完成" : (r.error ?? 
"评审失败")); + if (r.ok) setManualMemoirJudge(r.data); + } finally { + setMemoirJudgeBusy(false); + } + } + + function pickSessionAsReplayTarget(id: string) { + setReplayConversationId(id); + setShowSessionPicker(false); + const s = sessions.find((x) => x.id === id); + if (s?.user_id) setEvalUserId((prev) => prev.trim() || s.user_id); + setMsg(`已选用会话 ${id.slice(0, 8)}… 为回放目标(将向该会话追加消息)`); } async function createSet() { @@ -437,12 +802,13 @@ export default function App() { } async function snapshotFromDetail() { - if (!selectedId || !selSet) { - setMsg("在「高级配置」中选回归集 ID"); + const cid = replayConversationId.trim(); + if (!cid || !selSet) { + setMsg("先在对话评测中填写 conversation_id 并在高级页选用回归集"); return; } const r = await api( - `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${selectedId}`, + `/internal/api/evaluation/regression-sets/${selSet}/snapshot-from-conversation/${cid}`, { method: "POST", body: JSON.stringify({ @@ -513,26 +879,22 @@ export default function App() { @@ -551,135 +913,23 @@ export default function App() { ) : null} - {view === "home" ? ( -
-
-

- {sessionFilter === "active" ? "进行中的会话" : "近期会话"} -

-
- - -
-
-

- 每 {SESSION_LIST_POLL_MS / 1000} 秒自动刷新列表 - {sessionsUpdatedAt - ? ` · 上次更新 ${sessionsUpdatedAt.toLocaleTimeString()}` - : ""} -

-

- 点选一条进入对比:左侧为线上落库对话,右侧为候选链路透传回放(流式)。 - {sessionFilter === "active" - ? " 若无数据,多半是会话已结束(status=ended),请切到「近期全部」。" - : ""} -

- {sessions.length === 0 ? ( -

- 暂无会话。请确认数据库有对话记录,且评测 API 与 App 共用同一 DATABASE_URL。 -

- ) : ( -
    - {sessions.map((s) => ( -
  • openSession(s.id)} - onKeyDown={(e) => - e.key === "Enter" ? openSession(s.id) : undefined - } - role="button" - tabIndex={0} - > -
    - - {s.user_phone ?? "无手机号"} - - {s.current_topic ? `· ${s.current_topic}` : ""} - - - - 最近消息 {formatTime(s.last_message_at)} - -
    -
    - - {s.status ?? "—"} - - {" · "} - 阶段 {s.conversation_stage ?? "—"} ·{" "} - - {s.id.slice(0, 12)}… - -
    -
  • - ))} -
- )} -
- ) : null} - - {view === "session" && selectedId ? ( + {mainView === "conv" ? (
+

+ 默认不填用户与会话:点「执行回放」会自动创建临时用户 + 新会话(伪手机号{" "} + eval_… + ),再按基准里的用户句逐轮请求后端(每轮一次 HTTP,界面可保持响应)。左侧列是导出 MD 里的用户 + 当时导出的 AI,仅作对照;中间「落库对话」里的 AI 是当前环境重新生成的,必然与左侧导出 AI 不同——这是预期。下方「逐轮用户句对齐」表可核对:每一轮写入 DB 的用户话是否和基准用户句一致。关闭或刷新本页会中止未完成的回放。 + 回忆录模块在「回忆录章节」页;若用沙箱用户看章节,请先在本页跑完回放(并开 Celery)。 +

- - {selectedId} + + + +
+
- 左①每 {DIALOGUE_POLL_MS / 1000}s 同步 · 右③每{" "} - {SESSION_EVAL_POLL_MS / 1000}s 拉取评测 + 当前 user_id{" "} + + {evalUserId ? `${evalUserId.slice(0, 10)}…` : "—"} + {" "} + · conversation_id{" "} + + {replayConversationId + ? `${replayConversationId.slice(0, 10)}…` + : "—"} + + + + + + + 对话同步每 {DIALOGUE_POLL_MS / 1000}s {dialogueUpdatedAt - ? ` · 对话 ${dialogueUpdatedAt.toLocaleTimeString()}` - : ""} - {sessionEvalUpdatedAt - ? ` · 评审 ${sessionEvalUpdatedAt.toLocaleTimeString()}` + ? ` · ${dialogueUpdatedAt.toLocaleTimeString()}` : ""}
+
+ + 高级:指定已有用户或粘贴 conversation_id + +
+ + + +
+
+ + {showSessionPicker ? ( +
+
+ + + + 列表每 {SESSION_LIST_POLL_MS / 1000}s 刷新 + {sessionsUpdatedAt + ? ` · ${sessionsUpdatedAt.toLocaleTimeString()}` + : ""} + +
+
    + {sessions.map((s) => ( +
  • + {" "} + {s.id.slice(0, 10)}…{" "} + {s.user_phone ?? s.user_id.slice(0, 8)} +
  • + ))} +
+
+ ) : null} +
-
-

- ① 线上 / 历史(DB · 自动同步) -

- {loadingLeft ? ( -

加载中…

- ) : dialogue.length > 0 ? ( -
- {dialogue.map((m, i) => ( -
-
- {m.role === "human" ? "用户" : "AI"} ·{" "} - {formatTime(m.created_at ?? null)} -
- {m.content} -
- ))} -
- ) : fallbackUserLines.length > 0 ? ( -
-

- 无线上消息表记录,仅展示抽取的用户轮次(transcript): -

- {fallbackUserLines.map((line, i) => ( -
- {i + 1}. - {line} -
- ))} -
- ) : ( -

暂无左侧数据。

- )} -
-
-

- ② 导出快照(tests/user_exports · 只读对照) +

+ 基准(导出 MD:用户 + AI 对照)

- {!fixtureName ? ( -

- 仓库内未找到{" "} - api/tests/user_exports/*.md,或 API 无法读取该目录。 -

- ) : fixtureTurns.length === 0 ? ( -

正在加载 {fixtureName}…

+ {!fixtureName || fixtureTurns.length === 0 ? ( +

选择 MD 后加载轮次

) : (
{fixtureTurns.map((row, i) => (
-
+
用户 · 轮次 {i + 1}
{row.user} @@ -881,312 +1223,440 @@ export default function App() {
-
+
导出中的 AI
- {row.ai || "(空)"} + {row.ai?.trim() ? row.ai : "(空)"}
))}
)}
-
-

- ③ GLM 评审(回归实验) +

+ 落库对话(DB · 用户句应与基准一致,AI 为当前后端新生成)

- {sessionEvalItems.length === 0 ? ( -

- 尚无命中本会话的评测 run。请先将该会话快照进回归集(case 需带{" "} - source_conversation_id - ),在高级页 enqueue 实验;跑完后此处会显示对话分、访谈摘录稿分、以及该用户名下各{" "} - Chapter / Story{" "} - 正文(canonical_markdown)的成稿分项分。baseline 与 candidate - 各一条 run,可对比综合分与 bundle。 + {loadingLeft ? ( +

加载中…

+ ) : !replayConversationId.trim() ? ( +

+ 执行回放或「仅建沙箱」后将自动拉取本轮会话的落库消息

- ) : ( -
- {sessionEvalItems.map(({ experiment_name, run }) => { - const bundle = run.judge_bundle_json; - const chapters = - bundle && - typeof bundle === "object" && - Array.isArray( - (bundle as Record).chapters, - ) - ? ((bundle as Record).chapters as unknown[]) - : []; - const stories = - bundle && - typeof bundle === "object" && - Array.isArray((bundle as Record).stories) - ? ((bundle as Record).stories as unknown[]) - : []; - const convJ = - bundle && - typeof bundle === "object" && - "conversation_judge" in bundle - ? (bundle as Record).conversation_judge - : null; - const memJ = - bundle && - typeof bundle === "object" && - "memoir_judge" in bundle - ? (bundle as Record).memoir_judge - : null; - return ( -
-
- {experiment_name} -
-
- {run.side} · {run.status} - {run.error_message ? ` · ${run.error_message}` : ""} -
-
- - 综合{" "} - - {fmtScore(run.composite_score)} - - - - 对话{" "} - {fmtScore(run.conversation_score_total)} - - - 成稿均值{" "} - {fmtScore(run.memoir_score_total)} - - - (含摘录稿 + Chapter + Story 分项之平均,见实验执行逻辑) - -
- -

- 整段对话评审 -

- - -

- 访谈摘录稿(候选回放拼接稿) -

- - -

- Chapter(DB · 每章 GLM) -

- {chapters.length === 0 ? ( -

无或未跑分

- ) : ( -
    - {chapters.map((row, idx) => { - const r = row as Record; - const j = r.judge as Record | undefined; - return ( -
  • - - {String(r.title ?? "")} - {" "} - · 总分{" "} - {fmtScore(j?.total_score)} - {typeof j?.rationale === "string" && - j.rationale.trim() ? ( -
    - {j.rationale} -
    - ) : null} -
  • - ); - })} -
- )} - -

- Story(DB · 每篇 GLM) -

- {stories.length === 0 ? ( -

无或未跑分

- ) : ( -
    - {stories.map((row, idx) => { - const r = row as Record; - const j = r.judge as Record | undefined; - return ( -
  • - - {String(r.title ?? "")} - {" "} - · 总分{" "} - {fmtScore(j?.total_score)} - {typeof j?.rationale === "string" && - j.rationale.trim() ? ( -
    - {j.rationale} -
    - ) : null} -
  • - ); - })} -
- )} - -

- 各轮对话分(候选回放) -

- {run.turns.length === 0 ? ( -

- ) : ( -
    - {run.turns.map((t) => ( -
  • - 轮 {t.turn_index + 1} ·{" "} - {fmtScore( - t.judge_scores_json && - typeof t.judge_scores_json === "object" && - "total_score" in t.judge_scores_json - ? (t.judge_scores_json as Record) - .total_score - : null, - )} - {t.judge_rationale - ? ` — ${t.judge_rationale.slice(0, 120)}${t.judge_rationale.length > 120 ? "…" : ""}` - : ""} -
  • - ))} -
- )} + ) : dialogue.length > 0 ? ( +
+ {dialogue.map((m, i) => ( +
+
+ {m.role === "human" ? "用户" : "AI"}
- ); - })} + {m.content} +
+ ))}
+ ) : fallbackUserLines.length > 0 ? ( +
+

仅 transcript(无 messages 表)

+ {fallbackUserLines.map((line, i) => ( +
+ {i + 1}. {line} +
+ ))} +
+ ) : ( +

暂无消息

)}
+ + {turnAlignment.length > 0 ? ( +
+

+ 逐轮用户句对齐(基准 vs DB 合并后的「每轮一条用户」) +

+

+ 绿色表示与基准用户句一致;若不一致,多为会话里混入手动输入、或未清空旧会话就再次回放。 +

+
+ + + + + + + + + + + {turnAlignment.map((row) => { + const clip = (s: string, n: number) => { + const t = (s || "").replace(/\s+/g, " ").trim(); + return t.length > n ? `${t.slice(0, n)}…` : t || "—"; + }; + return ( + + + + + + + ); + })} + +
+ 轮次 + + 状态 + + 基准用户句(节选) + + DB 用户句(节选) +
+ {row.index} + + {row.match ? "一致" : "不一致"} + + {clip(row.baselineUser, 120)} + + {clip(row.dbUser, 120)} +
+
+
+ ) : null} + +
+

+ 手动 GLM · 对话评审(页面底部) +

+

+ 流程:两次整体打分(导出基准全文 transcript 一次、当前落库回放 transcript + 一次),再流式输出中文对比与改进建议。请在上文选择与本会话一致的基准 MD; + 未配置服务端 eval_judge_api_key / zhipu_api_key 时会报错。若某一侧 GLM + JSON 解析失败,见服务端日志中的 conversation judge failed。 +

+ {convJudgePhase ? ( +

{convJudgePhase}

+ ) : null} + {convJudgeErrors.length > 0 ? ( +
    + {convJudgeErrors.map((e, i) => ( +
  • {e}
  • + ))} +
+ ) : null} +
+
+
+ 基准(导出 MD)整体分 +
+ {convJudgeBaseline && + typeof convJudgeBaseline === "object" && + convJudgeBaseline !== null ? ( + <> +
+ {typeof (convJudgeBaseline as { total_score?: number }).total_score === + "number" + ? (convJudgeBaseline as { total_score: number }).total_score.toFixed(1) + : "—"} +
+ ) ?? {}} + /> + + ) : ( +

+ {!fixtureName.trim() + ? "未选择基准 MD:服务端仅对回放 transcript 做整体分与单侧建议。" + : "等待基准整体分…(若失败见上方红色错误与服务端日志)"} +

+ )} +
+
+
+ 回放 / 新测(DB)整体分 +
+ {convJudgeReplay && + typeof convJudgeReplay === "object" && + convJudgeReplay !== null ? ( + <> +
+ {typeof (convJudgeReplay as { total_score?: number }).total_score === + "number" + ? (convJudgeReplay as { total_score: number }).total_score.toFixed(1) + : "—"} +
+ ) ?? {}} /> + + ) : ( +

等待打分结果…

+ )} +
+
+
+
+ 对比与建议(流式) +
+
+ {convJudgeStreamText || ( + + 点击工具栏「GLM 评审对话(流式)」后,此处逐字显示模型输出。 + + )} +
+
+
) : null} - {view === "admin" ? ( + {mainView === "memoir" ? ( +
+

回忆录章节评测

+

+ 基准正文来自同一套 MD 的「回忆录章节」段落;与库中 Chapter/Story 对照后由 GLM 按 rubric 打分。 +

+
+ + + + +
+

+ 基线条目:{fixtureMemoirSections.length} 段(自 MD 解析) +

+
+
+

导出基线(节选)

+
    + {fixtureMemoirSections.map((s, i) => ( +
  • + {s.title} +
    + {(s.body || "").slice(0, 400)} + {(s.body || "").length > 400 ? "…" : ""} +
    +
  • + ))} +
+
+
+

数据库快照

+ {memoirSnapshot && + typeof memoirSnapshot === "object" && + memoirSnapshot !== null ? ( + + ) : ( +

点击「刷新库中章节/故事」

+ )} +
+
+
+

手动 GLM · 章节/故事

+ {manualMemoirJudge ? : ( +

点击「GLM 评审章节」

+ )} +
+
+ ) : null} + + {mainView === "admin" ? (

在此页停留时,回归集与实验列表每 {ADMIN_POLL_MS / 1000}{" "}