Files
life-echo/api/app/features/evaluation/judge_manual_service.py
Kevin 99543d04c6 feat(eval): internal-eval stack, judge fixes, and eval web overhaul
- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.
2026-04-07 17:18:47 +08:00

480 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""手动触发 GLM 评审(不写 eval_runs"""
from __future__ import annotations
import re
from collections.abc import AsyncIterator
from typing import Any
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.dependencies import get_eval_judge_langchain_llm
from app.core.logging import get_logger
from app.features.conversation import repo as conversation_repo
from app.features.evaluation.errors import (
EvaluationBadRequestError,
EvaluationNotFoundError,
)
from app.features.evaluation.judge_service import EvalJudgeService
from app.features.evaluation.transcript_for_judge import (
assistant_text_for_eval_display,
format_eval_turn_block,
format_export_turns_with_labels,
format_session_messages_with_turn_labels,
pair_session_messages_to_turns,
)
from app.features.evaluation.schemas import MemoirSectionBaselineOut
from app.features.evaluation.session_catalog_service import SessionCatalogService
from app.features.evaluation.user_export_fixtures import read_user_export_fixture
from app.features.memoir.repo import get_chapters_for_memoir_list
from app.features.story.repo import get_stories_for_user
logger = get_logger(__name__)
_MAX_JUDGE_MARKDOWN_CHARS = 20_000
_MAX_EVAL_CHAPTERS = 30
_MAX_EVAL_STORIES = 40
_MAX_EVIDENCE_CONVERSATIONS = 8
_MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000
_PRIOR_TRANSCRIPT_MAX_CHARS = 8000
async def _iter_turn_judgments_for_turns(
judge: EvalJudgeService,
turns: list[tuple[str, str]],
*,
sse_event: str,
) -> AsyncIterator[dict[str, Any]]:
"""与 `execute_eval_run` 相同的逐轮 prior 截断与块累积。"""
prior_blocks: list[str] = []
for idx, (u_raw, ai_raw) in enumerate(turns):
u = (u_raw or "").strip()
reply = assistant_text_for_eval_display(str(ai_raw))
prior = "\n\n".join(prior_blocks)
if len(prior) > _PRIOR_TRANSCRIPT_MAX_CHARS:
prior = prior[-_PRIOR_TRANSCRIPT_MAX_CHARS:]
tj = await judge.judge_turn(
prior_transcript=prior,
user_utterance=u,
assistant_reply=reply,
turn_index_0=idx,
)
yield {
"event": sse_event,
"turn_index": idx,
"ok": tj is not None,
"judge": tj.model_dump() if tj else None,
}
prior_blocks.append(format_eval_turn_block(idx, u, reply))
def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str:
s = (text or "").strip()
if len(s) <= max_chars:
return s
return f"{s[:max_chars]}\n\n…(已截断供评审)"
def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str:
s = (text or "").strip()
if len(s) <= max_chars:
return s
return f"{s[:max_chars]}\n\n…(访谈证据已截断)"
async def _conversation_transcript_for_manual(
db: AsyncSession, conversation_id: str
) -> str:
rows = await conversation_repo.get_conversation_messages(conversation_id, db)
return format_session_messages_with_turn_labels(rows)
async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str:
conversations = await conversation_repo.get_user_conversations(user_id, db)
if not conversations:
return ""
parts: list[str] = []
for conv in reversed(conversations[:_MAX_EVIDENCE_CONVERSATIONS]):
transcript = await _conversation_transcript_for_manual(db, str(conv.id))
if transcript:
parts.append(f"## 会话 {str(conv.id)}\n{transcript}")
return _trim_evidence_text("\n\n".join(parts))
def _normalize_title_key(title: str) -> str:
t = (title or "").strip().lower()
t = re.sub(r"^#+\s*", "", t)
return re.sub(r"\s+", " ", t)
def _baseline_for_chapter_title(
baselines: list[MemoirSectionBaselineOut],
chapter_title: str,
index: int,
) -> MemoirSectionBaselineOut | None:
if baselines:
key = _normalize_title_key(chapter_title)
for b in baselines:
if _normalize_title_key(b.title) == key:
return b
if 0 <= index < len(baselines):
return baselines[index]
return None
class EvalJudgeManualService:
def __init__(self, db: AsyncSession) -> None:
self._db = db
async def judge_conversation(
self,
conversation_id: str,
fixture_filename: str | None,
) -> dict[str, Any]:
cid = (conversation_id or "").strip()
if not cid:
raise EvaluationBadRequestError("conversation_id is required")
catalog = SessionCatalogService(self._db)
dialogue = await catalog.get_session_dialogue(cid)
if not dialogue:
raise EvaluationNotFoundError("conversation not found")
replay_transcript = format_session_messages_with_turn_labels(
list(dialogue.messages)
)
if not replay_transcript.strip():
raise EvaluationBadRequestError("no messages to judge")
fn = (fixture_filename or "").strip() or None
baseline_transcript = ""
if fn:
try:
turns, _ = read_user_export_fixture(fn)
baseline_transcript = format_export_turns_with_labels(turns)
except ValueError as e:
raise EvaluationBadRequestError(str(e)) from e
except FileNotFoundError as e:
raise EvaluationNotFoundError("fixture not found") from e
errors: list[str] = []
judge_llm = get_eval_judge_langchain_llm()
judge = EvalJudgeService(judge_llm)
baseline_judge_dict: dict[str, Any] | None = None
if baseline_transcript.strip():
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
bj = baseline_result.output
if bj:
baseline_judge_dict = bj.model_dump()
else:
errors.append(
f"baseline_glm_failed: {baseline_result.error or 'unknown error'}"
)
elif fn:
errors.append("baseline_transcript_empty")
replay_result = await judge.judge_conversation_result(
full_transcript=replay_transcript
)
rj = replay_result.output
replay_judge_dict = rj.model_dump() if rj else None
if not rj:
errors.append(
f"replay_glm_failed: {replay_result.error or 'unknown error'}"
)
return {
"conversation_id": cid,
"fixture_filename": fn,
"baseline_transcript": baseline_transcript,
"replay_transcript": replay_transcript,
"baseline_judge": baseline_judge_dict,
"replay_judge": replay_judge_dict,
"errors": errors,
}
async def iter_conversation_judge_sse(
self,
conversation_id: str,
fixture_filename: str | None,
*,
include_turn_judges: bool = False,
include_baseline_turn_judges: bool = False,
) -> AsyncIterator[dict[str, Any]]:
"""供 SSE先整体基准分、再整体回放分可选逐轮分再流式对比与建议。"""
cid = (conversation_id or "").strip()
if not cid:
yield {
"event": "error",
"phase": "validate",
"message": "conversation_id is required",
}
return
catalog = SessionCatalogService(self._db)
dialogue = await catalog.get_session_dialogue(cid)
if not dialogue:
yield {
"event": "error",
"phase": "load",
"message": "conversation not found",
}
return
replay_transcript = format_session_messages_with_turn_labels(
list(dialogue.messages)
)
if not replay_transcript.strip():
yield {"event": "error", "phase": "load", "message": "no messages to judge"}
return
fn = (fixture_filename or "").strip() or None
baseline_transcript = ""
export_turns: list[tuple[str, str]] | None = None
if fn:
try:
turns, _ = read_user_export_fixture(fn)
export_turns = list(turns)
baseline_transcript = format_export_turns_with_labels(turns)
except ValueError as e:
yield {"event": "error", "phase": "fixture", "message": str(e)}
return
except FileNotFoundError:
yield {
"event": "error",
"phase": "fixture",
"message": "fixture not found",
}
return
judge_llm = get_eval_judge_langchain_llm()
if not judge_llm:
yield {
"event": "error",
"phase": "config",
"message": "评审 LLM 未配置eval_judge_api_key / zhipu_api_key",
}
return
judge = EvalJudgeService(judge_llm)
yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn}
if not baseline_transcript.strip():
yield {
"event": "warning",
"message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议",
}
baseline_judge = None
if baseline_transcript.strip():
baseline_result = await judge.judge_conversation_result(
full_transcript=baseline_transcript
)
baseline_judge = baseline_result.output
yield {
"event": "baseline_judge",
"ok": baseline_judge is not None,
"judge": baseline_judge.model_dump() if baseline_judge else None,
}
if not baseline_judge:
yield {
"event": "error",
"phase": "baseline_glm",
"message": (
f"基准整体打分失败:{baseline_result.error}"
if baseline_result.error
else "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)"
),
}
elif (
include_baseline_turn_judges
and export_turns
and baseline_judge is not None
):
yield {"event": "meta", "phase": "baseline_turn_judges_start"}
async for row in _iter_turn_judgments_for_turns(
judge,
export_turns,
sse_event="baseline_turn_judge",
):
yield row
else:
yield {
"event": "baseline_judge",
"ok": False,
"skipped": True,
"judge": None,
}
replay_result = await judge.judge_conversation_result(
full_transcript=replay_transcript
)
replay_judge = replay_result.output
yield {
"event": "replay_judge",
"ok": replay_judge is not None,
"judge": replay_judge.model_dump() if replay_judge else None,
}
if not replay_judge:
yield {
"event": "error",
"phase": "replay_glm",
"message": (
f"回放对话整体 GLM 打分失败:{replay_result.error}"
if replay_result.error
else "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)"
),
}
yield {"event": "done"}
return
if include_turn_judges:
replay_pairs = pair_session_messages_to_turns(list(dialogue.messages))
if replay_pairs:
yield {"event": "meta", "phase": "replay_turn_judges_start"}
async for row in _iter_turn_judgments_for_turns(
judge,
replay_pairs,
sse_event="replay_turn_judge",
):
yield row
async for piece in judge.stream_conversation_compare(
baseline_transcript=baseline_transcript,
replay_transcript=replay_transcript,
baseline_judge=baseline_judge,
replay_judge=replay_judge,
):
if piece:
yield {"event": "compare_delta", "text": piece}
yield {"event": "done"}
async def judge_memoir_for_user(
self,
user_id: str,
baseline_sections: list[MemoirSectionBaselineOut] | None,
) -> dict[str, Any]:
uid = (user_id or "").strip()
if not uid:
raise EvaluationBadRequestError("user_id is required")
judge_llm = get_eval_judge_langchain_llm()
judge = EvalJudgeService(judge_llm)
baselines = list(baseline_sections or [])
evidence_transcript = await _user_transcript_evidence(self._db, uid)
chapter_results: list[dict[str, Any]] = []
try:
chapters = await get_chapters_for_memoir_list(
uid, self._db, active_only=True, is_new_only=None
)
for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]):
body = (ch.canonical_markdown or "").strip()
if not body:
continue
bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i)
baseline_excerpt = ""
if bl and (bl.body or "").strip():
baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000)
md = f"# 章节:{ch.title}\n\n"
if baseline_excerpt:
md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n"
md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}"
cj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
reference_memoir_markdown=baseline_excerpt,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
chapter_results.append(
{
"id": ch.id,
"title": ch.title,
"order_index": ch.order_index,
"baseline_title": bl.title if bl else None,
"judge": cj.model_dump() if cj else None,
}
)
except Exception as e:
logger.warning("manual memoir chapter judges failed: {}", e)
story_results: list[dict[str, Any]] = []
try:
stories = await get_stories_for_user(self._db, uid, status="active")
for st in stories[:_MAX_EVAL_STORIES]:
body = (st.canonical_markdown or "").strip()
if not body:
continue
md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}"
sj = await judge.judge_memoir(
memoir_markdown=md,
source_transcript=evidence_transcript,
evidence_notes=(
"严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。"
),
)
story_results.append(
{
"id": st.id,
"title": st.title,
"stage": st.stage,
"judge": sj.model_dump() if sj else None,
}
)
except Exception as e:
logger.warning("manual memoir story judges failed: {}", e)
return {
"user_id": uid,
"chapter_results": chapter_results,
"story_results": story_results,
}
async def memoir_snapshot(self, user_id: str) -> dict[str, Any]:
uid = (user_id or "").strip()
if not uid:
raise EvaluationBadRequestError("user_id is required")
chapters_out: list[dict[str, Any]] = []
stories_out: list[dict[str, Any]] = []
try:
chapters = await get_chapters_for_memoir_list(
uid, self._db, active_only=True, is_new_only=None
)
for ch in chapters[:_MAX_EVAL_CHAPTERS]:
chapters_out.append(
{
"id": ch.id,
"title": ch.title,
"category": ch.category,
"order_index": ch.order_index,
"canonical_markdown": ch.canonical_markdown,
}
)
except Exception as e:
logger.warning("memoir snapshot chapters failed: {}", e)
try:
stories = await get_stories_for_user(self._db, uid, status="active")
for st in stories[:_MAX_EVAL_STORIES]:
stories_out.append(
{
"id": st.id,
"title": st.title,
"stage": st.stage,
"canonical_markdown": st.canonical_markdown,
}
)
except Exception as e:
logger.warning("memoir snapshot stories failed: {}", e)
return {
"user_id": uid,
"chapters": chapters_out,
"stories": stories_out,
}