feat/ eval
This commit is contained in:
@@ -629,6 +629,8 @@ async def process_user_message(
|
||||
db: AsyncSession,
|
||||
user: User = None,
|
||||
user_message_timestamp: Optional[datetime] = None,
|
||||
*,
|
||||
force_skip_tts: bool = False,
|
||||
) -> None:
|
||||
"""处理用户消息,生成 Agent 回应。由 ChatOrchestrator 路由到 ProfileAgent 或 InterviewAgent。"""
|
||||
store = ConversationHistoryStore(db)
|
||||
@@ -671,7 +673,7 @@ async def process_user_message(
|
||||
turn.skip_tts,
|
||||
)
|
||||
responses = turn.messages
|
||||
skip_tts = turn.skip_tts
|
||||
skip_tts = bool(turn.skip_tts or force_skip_tts)
|
||||
|
||||
segment.agent_response = AI_RESPONSE_SEGMENT_JOIN.join(responses)
|
||||
_mark_conversation_active(conversation)
|
||||
|
||||
@@ -7,9 +7,26 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.db import get_async_db
|
||||
from app.features.evaluation.admin_service import EvaluationAdminService
|
||||
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
|
||||
from app.features.evaluation.replay_service import ReplayConversationService
|
||||
from app.features.quota.deps import get_quota_service
|
||||
from app.features.quota.service import QuotaService
|
||||
|
||||
|
||||
def get_evaluation_admin_service(
    db: Annotated[AsyncSession, Depends(get_async_db)],
) -> EvaluationAdminService:
    """FastAPI dependency that builds an ``EvaluationAdminService`` on the injected DB session."""
    service = EvaluationAdminService(db)
    return service
|
||||
|
||||
|
||||
def get_replay_conversation_service(
    db: Annotated[AsyncSession, Depends(get_async_db)],
    quota: Annotated[QuotaService, Depends(get_quota_service)],
) -> ReplayConversationService:
    """FastAPI dependency that builds a ``ReplayConversationService``.

    The service receives the injected DB session and the injected quota
    service, in that order.
    """
    service = ReplayConversationService(db, quota)
    return service
|
||||
|
||||
|
||||
def get_eval_judge_manual_service(
    db: Annotated[AsyncSession, Depends(get_async_db)],
) -> EvalJudgeManualService:
    """FastAPI dependency that builds an ``EvalJudgeManualService`` on the injected DB session."""
    service = EvalJudgeManualService(db)
    return service
|
||||
|
||||
@@ -49,3 +49,47 @@ def extract_dialogue_turns_from_export_md(text: str) -> list[tuple[str, str]]:
|
||||
raw_ai = ((ai_m.group(1) if ai_m else "") or "").strip()
|
||||
out.append((u, _normalize_export_ai_block(raw_ai)))
|
||||
return out
|
||||
|
||||
|
||||
_MEMOIR_SECTION_HEADER = re.compile(
|
||||
r"^##\s*回忆录章节(生成正文)\s*$",
|
||||
re.MULTILINE | re.IGNORECASE,
|
||||
)
|
||||
|
||||
_IMAGE_REF = re.compile(r"\{\{IMAGE:[^}]*\}\}\s*", re.DOTALL)
|
||||
|
||||
|
||||
def extract_source_user_id_from_export_md(text: str) -> str | None:
    """Return the id from the export header ``**User ID:** `uuid` ``.

    The id must be exactly 36 hex digits / hyphens inside backticks.
    ``None`` is returned when no such header is present.
    """
    found = re.search(r"\*\*User ID:\*\*\s*`([0-9a-fA-F-]{36})`", text)
    return found.group(1).strip() if found else None
|
||||
|
||||
|
||||
def extract_memoir_chapter_sections_from_export_md(text: str) -> list[tuple[str, str]]:
    """Split the memoir part of an export into ``(title, body)`` pairs.

    Everything after the ``## 回忆录章节(生成正文)`` heading is cut at every
    line that starts a ``##`` or ``###`` heading. ``{{IMAGE:...}}``
    placeholders are removed from each body and runs of three or more
    newlines are collapsed to a single blank line. Sections that end up with
    an empty title or an empty body are dropped. An empty list is returned
    when the heading is missing or nothing follows it.
    """
    header = _MEMOIR_SECTION_HEADER.search(text)
    if header is None:
        return []
    remainder = text[header.end():].strip()
    if not remainder:
        return []

    sections: list[tuple[str, str]] = []
    for chunk in re.split(r"\n(?=(?:###\s|##\s+))", remainder):
        chunk = chunk.strip()
        # Text that precedes the first heading is not a section.
        if not chunk.startswith("#"):
            continue
        heading, _, rest = chunk.partition("\n")
        title = heading.lstrip("#").strip()
        body = _IMAGE_REF.sub("", rest.strip())
        body = re.sub(r"\n{3,}", "\n\n", body).strip()
        if title and body:
            sections.append((title, body))
    return sections
|
||||
|
||||
372
api/app/features/evaluation/judge_manual_service.py
Normal file
372
api/app/features/evaluation/judge_manual_service.py
Normal file
@@ -0,0 +1,372 @@
|
||||
"""手动触发 GLM 评审(不写 eval_runs)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.dependencies import get_eval_judge_langchain_llm
|
||||
from app.core.logging import get_logger
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.execution_service import _assistant_text_for_eval_display
|
||||
from app.features.evaluation.judge_service import EvalJudgeService
|
||||
from app.features.evaluation.schemas import MemoirSectionBaselineOut
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||||
from app.features.memoir.repo import get_chapters_for_memoir_list
|
||||
from app.features.story.repo import get_stories_for_user
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Default character budget for markdown handed to the judge.
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
# Upper bounds on how many chapters / stories one manual run iterates over.
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40


def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
    """Strip ``text`` and keep at most its first ``max_chars`` characters.

    When something is cut off, a truncation marker is appended after the kept
    prefix, so the returned string is then longer than ``max_chars``.
    A falsy ``text`` yields an empty string.
    """
    stripped = (text or "").strip()
    if len(stripped) > max_chars:
        return f"{stripped[:max_chars]}\n\n…(已截断供评审)"
    return stripped
|
||||
|
||||
|
||||
def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str:
    """Render exported ``(user, ai)`` turns as one transcript string.

    Each non-empty user text becomes a ``用户: ...`` paragraph. Each non-empty
    AI text is passed through ``_assistant_text_for_eval_display`` and becomes
    an ``AI: ...`` paragraph. Paragraphs are joined by a blank line.
    """
    paragraphs: list[str] = []
    for user_raw, ai_raw in turns:
        user_text = (user_raw or "").strip()
        ai_text = (ai_raw or "").strip()
        if user_text:
            paragraphs.append(f"用户: {user_text}")
        if ai_text:
            paragraphs.append(f"AI: {_assistant_text_for_eval_display(ai_text)}")
    return "\n\n".join(paragraphs)
|
||||
|
||||
|
||||
def _normalize_title_key(title: str) -> str:
    """Build a comparison key from a chapter title.

    The title is trimmed and lower-cased, leading ``#`` heading markers are
    removed, and every whitespace run is collapsed to one space.
    """
    lowered = (title or "").strip().lower()
    without_marker = re.sub(r"^#+\s*", "", lowered)
    return re.sub(r"\s+", " ", without_marker)
|
||||
|
||||
|
||||
def _baseline_for_chapter_title(
    baselines: list[MemoirSectionBaselineOut],
    chapter_title: str,
    index: int,
) -> MemoirSectionBaselineOut | None:
    """Pick the baseline section that corresponds to a chapter.

    The first baseline whose normalized title equals the chapter's normalized
    title wins. Without a title match, the baseline at position ``index`` is
    used when that position exists. ``None`` is returned otherwise, including
    when ``baselines`` is empty.
    """
    if not baselines:
        return None
    wanted = _normalize_title_key(chapter_title)
    for candidate in baselines:
        if _normalize_title_key(candidate.title) == wanted:
            return candidate
    # No title match: fall back to pairing by position.
    return baselines[index] if 0 <= index < len(baselines) else None
|
||||
|
||||
|
||||
class EvalJudgeManualService:
    """Manually triggered judge runs over conversations and memoir content.

    Results are returned to the caller as plain dicts / event dicts.
    """

    def __init__(self, db: AsyncSession) -> None:
        self._db = db

    @staticmethod
    def _replay_transcript(dialogue: Any) -> str:
        """Render ``dialogue.messages`` as a ``用户:`` / ``AI:`` transcript.

        Messages whose role is ``human`` (case-insensitive) are kept verbatim;
        every other role is labelled ``AI`` and passed through
        ``_assistant_text_for_eval_display``.
        """
        paragraphs: list[str] = []
        for message in dialogue.messages:
            role = (message.role or "").lower()
            content = message.content or ""
            if role == "human":
                paragraphs.append(f"用户: {content}")
            else:
                paragraphs.append(f"AI: {_assistant_text_for_eval_display(content)}")
        return "\n\n".join(paragraphs)

    async def judge_conversation(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> dict[str, Any]:
        """Score a stored conversation and, optionally, a baseline fixture.

        Returns a dict with the ids, both transcripts, both judge results
        (``None`` when unavailable) and an ``errors`` list of short codes.

        Raises:
            EvaluationBadRequestError: blank ``conversation_id``, no messages
                to judge, or a ``ValueError`` while reading the fixture.
            EvaluationNotFoundError: unknown conversation or missing fixture.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")

        dialogue = await SessionCatalogService(self._db).get_session_dialogue(cid)
        if not dialogue:
            raise EvaluationNotFoundError("conversation not found")

        replay_transcript = self._replay_transcript(dialogue)
        if not replay_transcript.strip():
            raise EvaluationBadRequestError("no messages to judge")

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = _transcript_from_export_turns(turns)
            except ValueError as e:
                raise EvaluationBadRequestError(str(e)) from e
            except FileNotFoundError as e:
                raise EvaluationNotFoundError("fixture not found") from e

        judge = EvalJudgeService(get_eval_judge_langchain_llm())
        errors: list[str] = []

        baseline_judge_dict: dict[str, Any] | None = None
        if baseline_transcript.strip():
            baseline_result = await judge.judge_conversation(
                full_transcript=baseline_transcript
            )
            if baseline_result:
                baseline_judge_dict = baseline_result.model_dump()
            else:
                errors.append("baseline_glm_failed")
        elif fn:
            # A fixture was named but produced no text to judge.
            errors.append("baseline_transcript_empty")

        replay_result = await judge.judge_conversation(
            full_transcript=replay_transcript
        )
        if replay_result:
            replay_judge_dict = replay_result.model_dump()
        else:
            replay_judge_dict = None
            errors.append("replay_glm_failed")

        return {
            "conversation_id": cid,
            "fixture_filename": fn,
            "baseline_transcript": baseline_transcript,
            "replay_transcript": replay_transcript,
            "baseline_judge": baseline_judge_dict,
            "replay_judge": replay_judge_dict,
            "errors": errors,
        }

    async def iter_conversation_judge_sse(
        self,
        conversation_id: str,
        fixture_filename: str | None,
    ) -> AsyncIterator[dict[str, Any]]:
        """Yield event dicts for SSE: baseline score, replay score, then streamed comparison.

        Normal order is ``meta``, an optional ``warning``, ``baseline_judge``,
        ``replay_judge``, any number of ``compare_delta`` and finally ``done``.
        Validation, loading, fixture and configuration problems yield a single
        ``error`` event and end the stream without ``done``. A failed replay
        score yields ``error`` followed by ``done``. A failed baseline score
        yields ``error`` and the stream continues.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            yield {
                "event": "error",
                "phase": "validate",
                "message": "conversation_id is required",
            }
            return

        dialogue = await SessionCatalogService(self._db).get_session_dialogue(cid)
        if not dialogue:
            yield {
                "event": "error",
                "phase": "load",
                "message": "conversation not found",
            }
            return

        replay_transcript = self._replay_transcript(dialogue)
        if not replay_transcript.strip():
            yield {"event": "error", "phase": "load", "message": "no messages to judge"}
            return

        fn = (fixture_filename or "").strip() or None
        baseline_transcript = ""
        if fn:
            try:
                turns, _ = read_user_export_fixture(fn)
                baseline_transcript = _transcript_from_export_turns(turns)
            except ValueError as e:
                yield {"event": "error", "phase": "fixture", "message": str(e)}
                return
            except FileNotFoundError:
                yield {
                    "event": "error",
                    "phase": "fixture",
                    "message": "fixture not found",
                }
                return

        judge_llm = get_eval_judge_langchain_llm()
        if not judge_llm:
            yield {
                "event": "error",
                "phase": "config",
                "message": "评审 LLM 未配置(eval_judge_api_key / zhipu_api_key)",
            }
            return

        judge = EvalJudgeService(judge_llm)
        yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}

        baseline_judge = None
        if not baseline_transcript.strip():
            # No usable baseline: announce it and report the baseline step as skipped.
            yield {
                "event": "warning",
                "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议",
            }
            yield {
                "event": "baseline_judge",
                "ok": False,
                "skipped": True,
                "judge": None,
            }
        else:
            baseline_judge = await judge.judge_conversation(
                full_transcript=baseline_transcript
            )
            yield {
                "event": "baseline_judge",
                "ok": baseline_judge is not None,
                "judge": baseline_judge.model_dump() if baseline_judge else None,
            }
            if not baseline_judge:
                yield {
                    "event": "error",
                    "phase": "baseline_glm",
                    "message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)",
                }

        replay_judge = await judge.judge_conversation(full_transcript=replay_transcript)
        yield {
            "event": "replay_judge",
            "ok": replay_judge is not None,
            "judge": replay_judge.model_dump() if replay_judge else None,
        }
        if not replay_judge:
            yield {
                "event": "error",
                "phase": "replay_glm",
                "message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)",
            }
            yield {"event": "done"}
            return

        async for delta in judge.stream_conversation_compare(
            baseline_transcript=baseline_transcript,
            replay_transcript=replay_transcript,
            baseline_judge=baseline_judge,
            replay_judge=replay_judge,
        ):
            if delta:
                yield {"event": "compare_delta", "text": delta}

        yield {"event": "done"}

    async def judge_memoir_for_user(
        self,
        user_id: str,
        baseline_sections: list[MemoirSectionBaselineOut] | None,
    ) -> dict[str, Any]:
        """Score a user's active chapters and stories one by one.

        At most ``_MAX_EVAL_CHAPTERS`` chapters and ``_MAX_EVAL_STORIES``
        stories are visited, and items without markdown are skipped. A chapter
        prompt includes a clipped baseline excerpt when a baseline section can
        be paired with it. Any exception while processing chapters (or
        stories) is logged as a warning and the results gathered so far for
        that group are kept.

        Raises:
            EvaluationBadRequestError: blank ``user_id``.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        judge = EvalJudgeService(get_eval_judge_langchain_llm())
        baselines = list(baseline_sections or [])

        chapter_results: list[dict[str, Any]] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for position, chapter in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
                current = (chapter.canonical_markdown or "").strip()
                if not current:
                    continue
                baseline = _baseline_for_chapter_title(
                    baselines, chapter.title or "", position
                )
                excerpt = ""
                if baseline and (baseline.body or "").strip():
                    excerpt = _clip_md_for_judge(baseline.body, max_chars=6000)
                prompt_parts = [f"# 章节:{chapter.title}\n\n"]
                if excerpt:
                    prompt_parts.append(f"## 导出基线(节选)\n\n{excerpt}\n\n")
                prompt_parts.append(f"## 当前成稿\n\n{_clip_md_for_judge(current)}")
                verdict = await judge.judge_memoir(
                    memoir_markdown="".join(prompt_parts)
                )
                chapter_results.append(
                    {
                        "id": chapter.id,
                        "title": chapter.title,
                        "order_index": chapter.order_index,
                        "baseline_title": baseline.title if baseline else None,
                        "judge": verdict.model_dump() if verdict else None,
                    }
                )
        except Exception as e:
            logger.warning("manual memoir chapter judges failed: {}", e)

        story_results: list[dict[str, Any]] = []
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for story in stories[:_MAX_EVAL_STORIES]:
                current = (story.canonical_markdown or "").strip()
                if not current:
                    continue
                verdict = await judge.judge_memoir(
                    memoir_markdown=f"# 故事:{story.title}\n\n{_clip_md_for_judge(current)}"
                )
                story_results.append(
                    {
                        "id": story.id,
                        "title": story.title,
                        "stage": story.stage,
                        "judge": verdict.model_dump() if verdict else None,
                    }
                )
        except Exception as e:
            logger.warning("manual memoir story judges failed: {}", e)

        return {
            "user_id": uid,
            "chapter_results": chapter_results,
            "story_results": story_results,
        }

    async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
        """Return the user's active chapters and stories without judging them.

        The same chapter / story limits as ``judge_memoir_for_user`` apply.
        An exception while loading either group is logged as a warning and
        whatever was collected for that group so far is returned.

        Raises:
            EvaluationBadRequestError: blank ``user_id``.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")

        chapters_out: list[dict[str, Any]] = []
        try:
            chapters = await get_chapters_for_memoir_list(
                uid, self._db, active_only=True, is_new_only=None
            )
            for chapter in chapters[:_MAX_EVAL_CHAPTERS]:
                chapters_out.append(
                    {
                        "id": chapter.id,
                        "title": chapter.title,
                        "category": chapter.category,
                        "order_index": chapter.order_index,
                        "canonical_markdown": chapter.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot chapters failed: {}", e)

        stories_out: list[dict[str, Any]] = []
        try:
            stories = await get_stories_for_user(self._db, uid, status="active")
            for story in stories[:_MAX_EVAL_STORIES]:
                stories_out.append(
                    {
                        "id": story.id,
                        "title": story.title,
                        "stage": story.stage,
                        "canonical_markdown": story.canonical_markdown,
                    }
                )
        except Exception as e:
            logger.warning("memoir snapshot stories failed: {}", e)

        return {
            "user_id": uid,
            "chapters": chapters_out,
            "stories": stories_out,
        }
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
|
||||
from app.core.llm_call import LLMCallError, allm_json_call
|
||||
@@ -12,6 +13,7 @@ from app.features.evaluation.judge_schemas import (
|
||||
TurnJudgeOutput,
|
||||
)
|
||||
from app.features.evaluation.rubrics.conversation_v1 import (
|
||||
COMPARE_CONV_STREAM_HINT,
|
||||
CONV_JUDGE_INSTRUCTIONS,
|
||||
TURN_JUDGE_INSTRUCTIONS,
|
||||
)
|
||||
@@ -21,7 +23,9 @@ logger = get_logger(__name__)
|
||||
|
||||
_TURN_MAX = 768
|
||||
_CONV_MAX = 8192
|
||||
_CONV_JUDGE_JSON_MAX = 2048
|
||||
_MEMOIR_MAX = 12000
|
||||
_COMPARE_STREAM_MAX = 6144
|
||||
|
||||
|
||||
class EvalJudgeService:
|
||||
@@ -75,13 +79,81 @@ class EvalJudgeService:
|
||||
self._llm,
|
||||
prompt,
|
||||
ConversationJudgeOutput,
|
||||
max_tokens=_TURN_MAX,
|
||||
max_tokens=_CONV_JUDGE_JSON_MAX,
|
||||
agent="EvalJudgeService.judge_conversation",
|
||||
)
|
||||
except LLMCallError as e:
|
||||
logger.warning("conversation judge failed: {}", e)
|
||||
return None
|
||||
|
||||
async def stream_conversation_compare(
    self,
    *,
    baseline_transcript: str,
    replay_transcript: str,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
) -> AsyncIterator[str]:
    """Stream a Chinese plain-text comparison and suggestions (not JSON).

    With both scores present, the prompt contrasts baseline (A) and replay
    (B). With only the replay score, a single-sided prompt based on
    ``COMPARE_CONV_STREAM_HINT`` is used. Both transcripts are cut to
    ``_CONV_MAX`` characters before being placed in the prompt.

    Problems are reported in-band rather than raised: a missing model or a
    missing replay score yields one ``[错误] ...`` line, and an exception
    while streaming yields a trailing interruption notice.
    """
    if not self._llm:
        yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)"
        return
    # Checked before any serialization so this case never depends on it.
    if not replay_judge:
        yield "[错误] 缺少回放对话评分,无法生成建议"
        return

    def _scores_json(judge: ConversationJudgeOutput | None) -> str:
        """Serialize a judge result for the prompt; ``"null"`` when absent."""
        if not judge:
            return "null"
        try:
            return judge.model_dump_json(ensure_ascii=False)
        except TypeError:
            # Some Pydantic releases do not accept ``ensure_ascii`` here; the
            # plain call still produces valid JSON for the prompt.
            return judge.model_dump_json()

    b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
    r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
    r_json = _scores_json(replay_judge)

    if baseline_judge:
        b_json = _scores_json(baseline_judge)
        prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):

【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
{b_tr}

【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
{r_tr}

【A 的整体评分 JSON】
{b_json}

【B 的整体评分 JSON】
{r_json}

请依次撰写:
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。

笔调简洁、偏执行清单。"""
    else:
        prompt = f"""{COMPARE_CONV_STREAM_HINT}

【回放/新测 transcript】
{r_tr}

【整体评分 JSON】
{r_json}
"""

    llm = self._llm
    if hasattr(llm, "bind"):
        # Apply the output-token cap when the model object supports binding.
        llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
    try:
        async for chunk in llm.astream(prompt):
            piece = getattr(chunk, "content", None)
            if piece:
                yield piece
    except Exception as e:
        logger.warning("conversation compare stream failed: {}", e)
        yield f"\n\n[流式输出中断:{e}]"
|
||||
|
||||
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
|
||||
if not self._llm:
|
||||
return None
|
||||
|
||||
172
api/app/features/evaluation/replay_service.py
Normal file
172
api/app/features/evaluation/replay_service.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""内部评测:按 App 一致路径回放用户轮次(segment + orchestrator + memoir 队列)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
import uuid
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.db import utc_now
|
||||
from app.core.logging import get_logger
|
||||
from app.core.security import hash_password
|
||||
from app.features.auth import repo as auth_repo
|
||||
from app.features.conversation.models import Conversation
|
||||
from app.features.conversation.service import ConversationService
|
||||
from app.features.conversation.ws.pipeline import (
|
||||
background_runner,
|
||||
process_user_message,
|
||||
)
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||||
from app.features.quota.service import QuotaService
|
||||
from app.features.user.models import User
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class ReplayConversationService:
    """Internal evaluation helper that replays user turns through the conversation pipeline.

    Each replayed turn creates a user segment, queues it on
    ``background_runner`` and is then handed to ``process_user_message``.
    """

    def __init__(self, db: AsyncSession, quota_service: QuotaService) -> None:
        self._db = db
        self._quota = quota_service

    async def _open_conversation(self, user_id: str) -> str:
        """Create a conversation with a fresh UUID for ``user_id`` and return its id.

        Raises:
            EvaluationBadRequestError: ``ensure_ws_connection`` reported an
                error or returned no conversation.
        """
        conversation_id = str(uuid.uuid4())
        service = ConversationService(self._db, self._quota)
        conv, err = await service.ensure_ws_connection(conversation_id, user_id)
        if err or not conv:
            raise EvaluationBadRequestError(err or "failed to create conversation")
        return conversation_id

    async def create_eval_sandbox(self) -> tuple[str, str, str, str]:
        """Create an evaluation-only temporary user (unique fake phone) plus a new conversation.

        Returns:
            ``(user_id, conversation_id, phone, nickname)``.

        Raises:
            EvaluationBadRequestError: no unused fake phone was found in eight
                attempts, or the conversation could not be created.
        """
        user_id = str(uuid.uuid4())
        for _ in range(8):
            candidate = f"eval_{secrets.token_hex(10)}"
            if not await auth_repo.get_user_by_phone(candidate, self._db):
                phone = candidate
                break
        else:
            raise EvaluationBadRequestError("could not allocate eval phone")

        user = User(
            id=user_id,
            phone=phone,
            password_hash=hash_password(secrets.token_urlsafe(24)),
            nickname="评测临时用户",
            subscription_type="free",
            created_at=utc_now(),
        )
        await auth_repo.create_user(user, self._db)
        await self._db.commit()
        await self._db.refresh(user)

        conversation_id = await self._open_conversation(user_id)

        logger.info(
            "eval sandbox user_id={} phone={} conversation_id={}",
            user_id,
            phone,
            conversation_id,
        )
        return user_id, conversation_id, phone, user.nickname

    async def bootstrap_conversation(self, user_id: str) -> str:
        """Create a new conversation for an existing user and return its id.

        Raises:
            EvaluationBadRequestError: blank ``user_id``, unknown user, or the
                conversation could not be created.
        """
        uid = (user_id or "").strip()
        if not uid:
            raise EvaluationBadRequestError("user_id is required")
        if not await self._db.get(User, uid):
            raise EvaluationBadRequestError("user not found")

        conversation_id = await self._open_conversation(uid)
        logger.info(
            "eval replay bootstrap conversation_id={} user_id={}",
            conversation_id,
            uid,
        )
        return conversation_id

    async def replay_fixture(
        self,
        *,
        conversation_id: str,
        fixture_filename: str,
        flush_memoir_after: bool,
        skip_tts: bool,
    ) -> tuple[int, list[str]]:
        """Replay the non-empty user utterances of an export fixture.

        Returns:
            ``(turns_replayed, utterances)``, where ``utterances`` are the
            stripped user texts taken from the fixture.

        Raises:
            EvaluationBadRequestError: the fixture reader raised
                ``ValueError`` or the fixture holds no user utterances.
            EvaluationNotFoundError: the fixture file does not exist.
        """
        try:
            turns, _ = read_user_export_fixture(fixture_filename)
        except ValueError as e:
            raise EvaluationBadRequestError(str(e)) from e
        except FileNotFoundError:
            raise EvaluationNotFoundError("fixture not found") from None

        utterances = [user.strip() for user, _ in turns if (user or "").strip()]
        if not utterances:
            raise EvaluationBadRequestError("fixture produced no user utterances")

        replayed = await self.replay_utterances(
            conversation_id=conversation_id,
            utterances=utterances,
            flush_memoir_after=flush_memoir_after,
            skip_tts=skip_tts,
        )
        return replayed, utterances

    async def replay_utterances(
        self,
        *,
        conversation_id: str,
        utterances: list[str],
        flush_memoir_after: bool,
        skip_tts: bool,
    ) -> int:
        """Feed each non-blank utterance through the message pipeline in order.

        ``skip_tts`` is forwarded as ``force_skip_tts``. When
        ``flush_memoir_after`` is set, pending background work for the
        conversation's user is flushed after the last turn.

        Returns:
            The number of utterances that were processed.

        Raises:
            EvaluationBadRequestError: blank ``conversation_id`` or the
                conversation's user does not exist.
            EvaluationNotFoundError: conversation missing or soft-deleted.
        """
        cid = (conversation_id or "").strip()
        if not cid:
            raise EvaluationBadRequestError("conversation_id is required")
        conv = await self._db.get(Conversation, cid)
        if not conv or conv.deleted_at is not None:
            raise EvaluationNotFoundError("conversation not found")
        user = await self._db.get(User, conv.user_id)
        if not user:
            raise EvaluationBadRequestError("user not found for conversation")

        conv_service = ConversationService(self._db, self._quota)
        replayed = 0
        for utterance in utterances:
            text = (utterance or "").strip()
            if not text:
                continue
            segment = await conv_service.create_user_segment(conv, conv.user_id, text)
            # Prefer the segment's own timestamp, else the conversation's last activity.
            timestamp = segment.created_at or conv.last_message_at
            await background_runner.queue_message(
                conv.user_id,
                segment.id,
                text_char_count=len(text),
            )
            await process_user_message(
                conversation_id=cid,
                user_message=text,
                conversation=conv,
                segment=segment,
                db=self._db,
                user=user,
                user_message_timestamp=timestamp,
                force_skip_tts=skip_tts,
            )
            replayed += 1

        if flush_memoir_after and conv.user_id:
            await background_runner.flush_pending(conv.user_id)

        logger.info(
            "eval replay done conversation_id={} turns={} flush={} skip_tts={}",
            cid,
            replayed,
            flush_memoir_after,
            skip_tts,
        )
        return replayed
|
||||
@@ -2,32 +2,55 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.db import get_async_db
|
||||
from app.features.evaluation.admin_service import EvaluationAdminService
|
||||
from app.features.evaluation.deps import get_evaluation_admin_service
|
||||
from app.features.evaluation.deps import (
|
||||
get_eval_judge_manual_service,
|
||||
get_evaluation_admin_service,
|
||||
get_replay_conversation_service,
|
||||
)
|
||||
from app.features.evaluation.errors import (
|
||||
EvaluationBadRequestError,
|
||||
EvaluationNotFoundError,
|
||||
)
|
||||
from app.features.evaluation.importers.user_export_markdown import (
|
||||
extract_memoir_chapter_sections_from_export_md,
|
||||
extract_source_user_id_from_export_md,
|
||||
)
|
||||
from app.features.evaluation.internal_auth import InternalEvalAuth
|
||||
from app.features.evaluation.judge_manual_service import EvalJudgeManualService
|
||||
from app.features.evaluation.presenters import case_out, run_out
|
||||
from app.features.evaluation.replay_service import ReplayConversationService
|
||||
from app.features.evaluation.schemas import (
|
||||
CaseCreate,
|
||||
CaseOut,
|
||||
EvalRunOut,
|
||||
EvalSandboxOut,
|
||||
ExperimentCreate,
|
||||
ExperimentDetailOut,
|
||||
ExperimentOut,
|
||||
GateVerdictOut,
|
||||
ImportJsonCaseBody,
|
||||
ImportMarkdownBody,
|
||||
ManualJudgeConversationBody,
|
||||
ManualJudgeConversationOut,
|
||||
ManualJudgeConversationStreamBody,
|
||||
ManualJudgeMemoirBody,
|
||||
ManualJudgeMemoirOut,
|
||||
MemoirSectionBaselineOut,
|
||||
RegressionSetCreate,
|
||||
RegressionSetOut,
|
||||
ReplayBootstrapBody,
|
||||
ReplayBootstrapOut,
|
||||
ReplayConversationBody,
|
||||
ReplayConversationOut,
|
||||
SessionDialogueOut,
|
||||
SessionEvalRunsOut,
|
||||
SessionListItem,
|
||||
@@ -37,10 +60,12 @@ from app.features.evaluation.schemas import (
|
||||
UserExportFixtureDetailOut,
|
||||
UserExportFixtureListOut,
|
||||
UserExportFixtureTurnOut,
|
||||
UserMemoirSnapshotOut,
|
||||
VersionCreate,
|
||||
VersionOut,
|
||||
)
|
||||
from app.features.evaluation.session_catalog_service import SessionCatalogService
|
||||
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
|
||||
|
||||
router = APIRouter(tags=["internal-evaluation"])
|
||||
|
||||
@@ -209,6 +234,175 @@ async def list_session_evaluation_runs(
|
||||
return await svc.list_session_evaluation_runs(conversation_id)
|
||||
|
||||
|
||||
@router.post("/sessions/replay-bootstrap", response_model=ReplayBootstrapOut)
async def replay_bootstrap(
    body: ReplayBootstrapBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Create a new conversation for ``body.user_id`` and return its id.

    ``EvaluationBadRequestError`` from the service is converted with
    ``_eval_http_exc``.
    """
    try:
        conversation_id = await replay.bootstrap_conversation(body.user_id)
    except EvaluationBadRequestError as e:
        raise _eval_http_exc(e) from e
    return ReplayBootstrapOut(conversation_id=conversation_id)
|
||||
|
||||
|
||||
@router.post("/sessions/eval-sandbox", response_model=EvalSandboxOut)
async def create_eval_sandbox(
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Create a temporary evaluation user with a fresh conversation.

    ``EvaluationBadRequestError`` from the service is converted with
    ``_eval_http_exc``.
    """
    try:
        sandbox = await replay.create_eval_sandbox()
    except EvaluationBadRequestError as e:
        raise _eval_http_exc(e) from e
    user_id, conversation_id, phone, nickname = sandbox
    return EvalSandboxOut(
        user_id=user_id,
        conversation_id=conversation_id,
        phone=phone,
        nickname=nickname,
    )
|
||||
|
||||
|
||||
@router.post("/replay/conversation", response_model=ReplayConversationOut)
async def replay_conversation(
    body: ReplayConversationBody,
    _auth: InternalEvalAuth,
    replay: Annotated[
        ReplayConversationService, Depends(get_replay_conversation_service)
    ],
):
    """Replay user turns into ``body.conversation_id``.

    The turns come either from a fixture file or from an explicit utterance
    list. Supplying both is rejected with HTTP 400. Supplying neither, or
    only blank utterances, raises ``EvaluationBadRequestError``. Evaluation
    errors are converted with ``_eval_http_exc``.
    """
    if body.fixture_filename and body.user_utterances:
        raise HTTPException(
            status_code=400,
            detail="provide only one of fixture_filename or user_utterances",
        )
    try:
        if body.fixture_filename:
            turns_replayed, echoed = await replay.replay_fixture(
                conversation_id=body.conversation_id,
                fixture_filename=body.fixture_filename.strip(),
                flush_memoir_after=body.flush_memoir_after,
                skip_tts=body.skip_tts,
            )
        elif body.user_utterances is not None:
            # Keep only utterances that are non-blank once stringified.
            echoed = [str(u) for u in body.user_utterances if str(u).strip()]
            if not echoed:
                raise EvaluationBadRequestError("user_utterances is empty")
            turns_replayed = await replay.replay_utterances(
                conversation_id=body.conversation_id,
                utterances=echoed,
                flush_memoir_after=body.flush_memoir_after,
                skip_tts=body.skip_tts,
            )
        else:
            raise EvaluationBadRequestError(
                "fixture_filename or user_utterances required"
            )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as e:
        raise _eval_http_exc(e) from e
    return ReplayConversationOut(
        conversation_id=body.conversation_id,
        turns_replayed=turns_replayed,
        utterances_echo=echoed,
    )
|
||||
|
||||
|
||||
@router.post("/judge/conversation", response_model=ManualJudgeConversationOut)
async def judge_conversation_manual(
    body: ManualJudgeConversationBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Run the manual conversation judge and return its result in one response.

    Evaluation errors from the service are converted with ``_eval_http_exc``.
    """
    try:
        result = await judge_svc.judge_conversation(
            body.conversation_id,
            body.fixture_filename,
        )
    except (EvaluationNotFoundError, EvaluationBadRequestError) as e:
        raise _eval_http_exc(e) from e
    return ManualJudgeConversationOut.model_validate(result)
|
||||
|
||||
|
||||
@router.post("/judge/conversation-stream")
async def judge_conversation_manual_stream(
    body: ManualJudgeConversationStreamBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Stream the manual conversation judge as ``text/event-stream``.

    Every event dict from the service is sent as one ``data: <json>`` frame.
    If the service iterator raises, a ``server``-phase ``error`` frame
    followed by a ``done`` frame is sent instead of propagating the exception.
    """

    def _frame(payload: dict) -> str:
        # One SSE frame: JSON payload with non-ASCII text kept readable.
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    async def event_iter():
        try:
            async for evt in judge_svc.iter_conversation_judge_sse(
                body.conversation_id,
                body.fixture_filename,
            ):
                yield _frame(evt)
        except Exception as e:
            yield _frame({"event": "error", "phase": "server", "message": str(e)})
            yield _frame({"event": "done"})

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_iter(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
|
||||
|
||||
|
||||
@router.post("/judge/memoir-chapters", response_model=ManualJudgeMemoirOut)
async def judge_memoir_chapters_manual(
    body: ManualJudgeMemoirBody,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Judge a user's memoir content on demand.

    Passes ``body.user_id`` and ``body.baseline_sections`` to
    ``judge_svc.judge_memoir_for_user`` and validates the returned payload as
    ``ManualJudgeMemoirOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError``.
    """
    try:
        verdict = await judge_svc.judge_memoir_for_user(
            body.user_id,
            body.baseline_sections,
        )
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return ManualJudgeMemoirOut.model_validate(verdict)
|
||||
|
||||
|
||||
@router.get("/users/{user_id}/memoir-snapshot", response_model=UserMemoirSnapshotOut)
async def get_user_memoir_snapshot(
    user_id: str,
    _auth: InternalEvalAuth,
    judge_svc: Annotated[
        EvalJudgeManualService, Depends(get_eval_judge_manual_service)
    ],
):
    """Return the memoir snapshot that ``judge_svc.memoir_snapshot`` builds for a user.

    The service payload is validated as ``UserMemoirSnapshotOut``.

    Raises:
        Whatever ``_eval_http_exc`` builds when the service raises
        ``EvaluationBadRequestError``.
    """
    try:
        snapshot = await judge_svc.memoir_snapshot(user_id)
    except EvaluationBadRequestError as exc:
        raise _eval_http_exc(exc) from exc
    else:
        return UserMemoirSnapshotOut.model_validate(snapshot)
|
||||
|
||||
|
||||
@router.get(
|
||||
"/fixtures/user-exports",
|
||||
response_model=UserExportFixtureListOut,
|
||||
@@ -227,19 +421,23 @@ async def list_user_export_fixtures(
|
||||
async def get_user_export_fixture(
    filename: str,
    _auth: InternalEvalAuth,
    svc: Annotated[EvaluationAdminService, Depends(get_evaluation_admin_service)],
):
    """Return one user-export fixture: turns, source user id and memoir sections.

    The fixture is read once through ``read_user_export_fixture``, which yields
    both the dialogue turns and the raw markdown. The source user id and the
    baseline memoir sections are then extracted from that same markdown.

    Args:
        filename: Fixture file name taken from the request path.
        svc: Not used by the body; kept so the signature stays unchanged.

    Raises:
        HTTPException: 400 when the reader raises ``ValueError``;
            404 when it raises ``FileNotFoundError``.
    """
    try:
        # Single read: the earlier svc.load_user_export_fixture_turns() call
        # was immediately overwritten by this one, so it was dropped.
        turns, raw_md = read_user_export_fixture(filename)
    except ValueError:
        raise HTTPException(
            status_code=400, detail="invalid fixture filename"
        ) from None
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="fixture not found") from None
    memoir_tuples = extract_memoir_chapter_sections_from_export_md(raw_md)
    return UserExportFixtureDetailOut(
        filename=filename,
        turns=[UserExportFixtureTurnOut(user=u, ai=a) for u, a in turns],
        source_user_id=extract_source_user_id_from_export_md(raw_md),
        memoir_sections=[
            MemoirSectionBaselineOut(title=t, body=b) for t, b in memoir_tuples
        ],
    )
|
||||
|
||||
|
||||
|
||||
@@ -22,3 +22,10 @@ CONV_JUDGE_INSTRUCTIONS = """你是访谈整段对话评审。给定完整 trans
|
||||
dimension_scores 建议至少包含:emotion, information, structure, repetition, naturalness(各 0-100 相对分量即可),用于反映整段是否重复盘问、是否自然;另可有 rationale。
|
||||
|
||||
只输出 JSON:total_score, dimension_scores, rationale。"""
|
||||
|
||||
|
||||
# Prompt text (Chinese): asks for a plain-prose, non-JSON review of a replayed
# conversation transcript and its overall score, plus actionable improvements.
COMPARE_CONV_STREAM_HINT = """你是访谈对话评测专家。下面给出一份「回放/新测」完整对话 transcript 及其整体评分(JSON)。请用中文直接写正文(不要用 JSON):
1) 对这段对话的整体评价与风险点;
2) 可操作的改进建议(提示词、流程、模型参数等)。

笔调简洁、可执行。"""
|
||||
|
||||
@@ -133,9 +133,100 @@ class UserExportFixtureListOut(BaseModel):
|
||||
items: list[str]
|
||||
|
||||
|
||||
class MemoirSectionBaselineOut(BaseModel):
    """One baseline memoir section, held as a title/body pair."""

    # Section heading.
    title: str
    # Section text.
    body: str
|
||||
|
||||
|
||||
class UserExportFixtureDetailOut(BaseModel):
    """Detail view of one user-export fixture file."""

    # Name of the fixture file this detail describes.
    filename: str
    # Dialogue turns read from the fixture.
    turns: list[UserExportFixtureTurnOut]
    # User id extracted from the export markdown; None when not available.
    source_user_id: str | None = None
    # Baseline memoir sections extracted from the export markdown.
    memoir_sections: list[MemoirSectionBaselineOut] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ReplayBootstrapBody(BaseModel):
    """Request body naming the user for a replay-bootstrap call."""

    user_id: str
|
||||
|
||||
|
||||
class ReplayBootstrapOut(BaseModel):
    """Response carrying the conversation id produced by a replay-bootstrap call."""

    conversation_id: str
|
||||
|
||||
|
||||
class EvalSandboxOut(BaseModel):
    """Internal evaluation only: a throwaway temporary account plus a blank
    conversation, not tied to any real phone-number business flow."""

    # Id of the temporary user.
    user_id: str
    # Id of the blank conversation.
    conversation_id: str
    # Phone value stored on the temporary account.
    phone: str
    # Nickname stored on the temporary account.
    nickname: str
|
||||
|
||||
|
||||
class ReplayConversationBody(BaseModel):
    """Request body for replaying user utterances into a conversation.

    The utterances come either from a fixture file or from an explicit list.
    The replay route raises a bad-request error when neither is supplied.
    """

    # Conversation that receives the replayed turns.
    conversation_id: str
    # Fixture file to take the utterances from.
    fixture_filename: str | None = None
    # Explicit utterances to replay.
    user_utterances: list[str] | None = None
    # Forwarded to the replay service as ``flush_memoir_after``.
    flush_memoir_after: bool = True
    # Forwarded to the replay service as ``skip_tts``.
    skip_tts: bool = True
|
||||
|
||||
|
||||
class ReplayConversationOut(BaseModel):
    """Result of a replay request."""

    # Conversation the turns were replayed into.
    conversation_id: str
    # Count returned by the replay service.
    turns_replayed: int
    # The utterances that were submitted for replay.
    utterances_echo: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ManualJudgeConversationBody(BaseModel):
    """Request body for the one-shot manual conversation judge endpoint."""

    # Conversation to judge.
    conversation_id: str
    # Should match the MD fixture currently selected in the evaluation console;
    # used for the baseline transcript / overall scoring.
    # (Was a bare string statement placed after ``conversation_id``, where
    # attribute-docstring tooling would attach it to the wrong field.)
    fixture_filename: str | None = None
|
||||
|
||||
|
||||
class ManualJudgeConversationStreamBody(BaseModel):
    """Request body for the streaming manual conversation judge endpoint."""

    # Conversation to judge.
    conversation_id: str
    # Optional fixture file, passed to the judge service with the conversation id.
    fixture_filename: str | None = None
|
||||
|
||||
|
||||
class ManualJudgeConversationOut(BaseModel):
    """Response of the one-shot manual conversation judge endpoint."""

    # Conversation that was judged.
    conversation_id: str
    # Fixture file associated with the request, if any.
    fixture_filename: str | None = None
    # Baseline transcript text; empty string by default.
    baseline_transcript: str = ""
    # Transcript text of the replayed conversation.
    replay_transcript: str
    # Judge output for the baseline transcript, when present.
    baseline_judge: dict[str, Any] | None = None
    # Judge output for the replay transcript, when present.
    replay_judge: dict[str, Any] | None = None
    # Error messages reported alongside the result.
    errors: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ManualJudgeMemoirBody(BaseModel):
    """Request body for the manual memoir judge endpoint."""

    # User whose memoir content is judged.
    user_id: str
    # Optional baseline sections, passed to the judge service with the user id.
    baseline_sections: list[MemoirSectionBaselineOut] | None = None
|
||||
|
||||
|
||||
class ManualJudgeMemoirOut(BaseModel):
    """Response of the manual memoir judge endpoint."""

    # User whose memoir content was judged.
    user_id: str
    # Chapter-level results, as free-form dicts.
    chapter_results: list[dict[str, Any]] = Field(default_factory=list)
    # Story-level results, as free-form dicts.
    story_results: list[dict[str, Any]] = Field(default_factory=list)
|
||||
|
||||
|
||||
class MemoirChapterSnapOut(BaseModel):
    """Snapshot of one memoir chapter."""

    # Chapter identifier.
    id: str
    # Chapter title.
    title: str
    # Optional category label.
    category: str | None = None
    # Optional ordering position.
    order_index: int | None = None
    # Optional markdown text of the chapter.
    canonical_markdown: str | None = None
|
||||
|
||||
|
||||
class MemoirStorySnapOut(BaseModel):
    """Snapshot of one memoir story."""

    # Story identifier.
    id: str
    # Story title.
    title: str
    # Optional stage label.
    stage: str | None = None
    # Optional markdown text of the story.
    canonical_markdown: str | None = None
|
||||
|
||||
|
||||
class UserMemoirSnapshotOut(BaseModel):
    """Memoir snapshot for one user: chapter and story snapshots."""

    # User the snapshot belongs to.
    user_id: str
    # Chapter snapshots.
    chapters: list[MemoirChapterSnapOut]
    # Story snapshots.
    stories: list[MemoirStorySnapOut]
|
||||
|
||||
|
||||
class SnapshotFromConversationBody(BaseModel):
|
||||
|
||||
@@ -55,6 +55,39 @@ VITE_EVAL_API_BASE=http://127.0.0.1:8001 VITE_EVAL_API_KEY=与上同 npm run dev
|
||||
|
||||
浏览器 `EventSource` 无法带自定义 Header,流式端点支持 **query** `?key=`,与 `X-Internal-Eval-Key` 等效。
|
||||
|
||||
## 评测 Web:两大模块
|
||||
|
||||
- **对话评测**:选 `api/tests/user_exports/*.md` 为基准 →「新建评测会话」或填写已有 `conversation_id` →「执行回放」→「GLM 评审对话」。
|
||||
- **回忆录章节**:同一套 fixture 会带上导出 MD 中的 `source_user_id` 与 `memoir_sections`;「刷新库中章节/故事」拉 DB 快照 →「GLM 评审章节」(基线节选与当前成稿一并送评)。
|
||||
|
||||
## 真实链路透传回放(与 App 一致)
|
||||
|
||||
| 方法 | 路径 | 说明 |
|
||||
|------|------|------|
|
||||
| `POST` | `/internal/api/evaluation/sessions/eval-sandbox` | 无 body:新建**临时用户**(`eval_` 伪手机号)+ 空白 `conversation_id` |
|
||||
| `POST` | `/internal/api/evaluation/sessions/replay-bootstrap` | body:`{ "user_id" }`,在已有用户下返回新 `conversation_id` |
|
||||
| `POST` | `/internal/api/evaluation/replay/conversation` | body:`conversation_id`,以及 `fixture_filename` **或** `user_utterances`(二选一,缺失或同时提供均返回 400);可选 `flush_memoir_after`(默认 true)、`skip_tts`(默认 true) |
|
||||
|
||||
每轮等价于 WebSocket 文本路径:`create_user_segment` → `process_user_message`(内部可 `force_skip_tts`)→ `background_runner.queue_message`。
|
||||
|
||||
- **TTS**:回放默认 `skip_tts: true`,不在评测台跑语音合成。
|
||||
- **Memory / 回忆录管线**:`queue_message` 与末尾 `flush_pending` 依赖 **Celery worker**(`process_memoir_phase1` 等);仅起 internal API 未起 worker 时,对话会落库但章节异步不会推进。
|
||||
|
||||
## 手动 GLM(不写 `eval_runs` 表)
|
||||
|
||||
| 方法 | 路径 | 说明 |
|
||||
|------|------|------|
|
||||
| `POST` | `/internal/api/evaluation/judge/conversation` | body:`{ "conversation_id", "fixture_filename"? }`,返回轮次分 + 全文对话分 |
|
||||
| `POST` | `/internal/api/evaluation/judge/memoir-chapters` | body:`{ "user_id", "baseline_sections"? }`,Chapter/Story 分项 |
|
||||
| `GET` | `/internal/api/evaluation/users/{user_id}/memoir-snapshot` | 只读章节与故事正文快照 |
|
||||
|
||||
## Fixture 详情扩展
|
||||
|
||||
`GET /internal/api/evaluation/fixtures/user-exports/{filename}` 在原有 `turns` 外增加:
|
||||
|
||||
- `source_user_id`:导出抬头中的 User ID
|
||||
- `memoir_sections`:`## 回忆录章节(生成正文)` 下按标题切分的基线正文(已去掉 `{{IMAGE:...}}` 占位)
|
||||
|
||||
## 门禁规则(v1)
|
||||
|
||||
- 所有 case 的合成均分:候选须 **严格高于** 基线。
|
||||
|
||||
@@ -5,6 +5,8 @@ import pytest
|
||||
|
||||
from app.features.evaluation.importers.user_export_markdown import (
|
||||
extract_dialogue_turns_from_export_md,
|
||||
extract_memoir_chapter_sections_from_export_md,
|
||||
extract_source_user_id_from_export_md,
|
||||
extract_user_utterances_from_export_md,
|
||||
)
|
||||
|
||||
@@ -72,3 +74,32 @@ def test_extract_dialogue_turns_from_repo_user_export() -> None:
|
||||
turns = extract_dialogue_turns_from_export_md(text)
|
||||
assert len(turns) >= 5
|
||||
assert "你好" in turns[0][0]
|
||||
|
||||
|
||||
def test_extract_source_user_id_from_export_md() -> None:
    """The user id inside the backticks of a ``User ID`` bullet is returned."""
    md = "- **User ID:** `e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0`\n"
    expected_id = "e27fcd97-fefa-43b8-a7a3-3ecd49ebf5f0"
    extracted = extract_source_user_id_from_export_md(md)
    assert extracted == expected_id
|
||||
|
||||
|
||||
def test_extract_memoir_chapter_sections_from_export_md() -> None:
    """Two ``###`` headings give two (title, body) sections without image placeholders."""
    md = """
## 回忆录章节(生成正文)

### First chapter

Line a.
{{IMAGE:foo}}

### Second title

Line b.
"""
    sections = extract_memoir_chapter_sections_from_export_md(md)
    assert len(sections) == 2

    first, second = sections[0], sections[1]
    assert first[0] == "First chapter"
    assert "Line a." in first[1]
    # The {{IMAGE:...}} placeholder must not appear in the extracted body.
    assert "{{IMAGE" not in first[1]
    assert second[0] == "Second title"
|
||||
|
||||
74
api/tests/evaluation/test_replay_router.py
Normal file
74
api/tests/evaluation/test_replay_router.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""回放 / 评审路由参数校验(最小 HTTP)。"""
|
||||
|
||||
import pytest
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
from app.features.evaluation.internal_auth import get_internal_eval_principal
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_replay_conversation_requires_fixture_or_utterances(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Posting only ``conversation_id`` to the replay route yields HTTP 400."""
    from fastapi import FastAPI

    # The key is patched before the router module is imported.
    monkeypatch.setattr(
        "app.core.config.settings.internal_eval_api_key",
        "secret",
        raising=False,
    )
    from app.features.evaluation.router import router

    app = FastAPI()
    app.include_router(router, prefix="/internal/api/evaluation")

    async def _override_auth():
        from app.features.evaluation.internal_auth import InternalEvalPrincipal

        return InternalEvalPrincipal()

    app.dependency_overrides[get_internal_eval_principal] = _override_auth

    payload = {"conversation_id": "00000000-0000-0000-0000-000000000001"}
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://t") as client:
        response = await client.post(
            "/internal/api/evaluation/replay/conversation",
            headers={"X-Internal-Eval-Key": "secret"},
            json=payload,
        )
    assert response.status_code == 400
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_replay_conversation_rejects_both_fixture_and_utterances(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Posting both ``fixture_filename`` and ``user_utterances`` yields HTTP 400."""
    from fastapi import FastAPI

    # The key is patched before the router module is imported.
    monkeypatch.setattr(
        "app.core.config.settings.internal_eval_api_key",
        "secret",
        raising=False,
    )
    from app.features.evaluation.router import router

    app = FastAPI()
    app.include_router(router, prefix="/internal/api/evaluation")

    async def _override_auth():
        from app.features.evaluation.internal_auth import InternalEvalPrincipal

        return InternalEvalPrincipal()

    app.dependency_overrides[get_internal_eval_principal] = _override_auth

    payload = {
        "conversation_id": "00000000-0000-0000-0000-000000000001",
        "fixture_filename": "x.md",
        "user_utterances": ["a"],
    }
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://t") as client:
        response = await client.post(
            "/internal/api/evaluation/replay/conversation",
            headers={"X-Internal-Eval-Key": "secret"},
            json=payload,
        )
    assert response.status_code == 400
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user