"""手动触发 GLM 评审(不写 eval_runs)。""" from __future__ import annotations import re from collections.abc import AsyncIterator from typing import Any from sqlalchemy.ext.asyncio import AsyncSession from app.core.dependencies import get_eval_judge_langchain_llm from app.core.logging import get_logger from app.features.conversation import repo as conversation_repo from app.features.evaluation.errors import ( EvaluationBadRequestError, EvaluationNotFoundError, ) from app.features.evaluation.execution_service import _assistant_text_for_eval_display from app.features.evaluation.judge_service import EvalJudgeService from app.features.evaluation.schemas import MemoirSectionBaselineOut from app.features.evaluation.session_catalog_service import SessionCatalogService from app.features.evaluation.user_export_fixtures import read_user_export_fixture from app.features.memoir.repo import get_chapters_for_memoir_list from app.features.story.repo import get_stories_for_user logger = get_logger(__name__) _MAX_JUDGE_MARKDOWN_CHARS = 20_000 _MAX_EVAL_CHAPTERS = 30 _MAX_EVAL_STORIES = 40 _MAX_EVIDENCE_CONVERSATIONS = 8 _MAX_EVIDENCE_TRANSCRIPT_CHARS = 16_000 def _clip_md_for_judge(text: str, max_chars: int = _MAX_JUDGE_MARKDOWN_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(已截断供评审)" def _transcript_from_export_turns(turns: list[tuple[str, str]]) -> str: parts: list[str] = [] for u, ai in turns: u = (u or "").strip() ai = (ai or "").strip() if u: parts.append(f"用户: {u}") if ai: parts.append(f"AI: {_assistant_text_for_eval_display(ai)}") return "\n\n".join(parts) def _trim_evidence_text(text: str, max_chars: int = _MAX_EVIDENCE_TRANSCRIPT_CHARS) -> str: s = (text or "").strip() if len(s) <= max_chars: return s return f"{s[:max_chars]}\n\n…(访谈证据已截断)" async def _conversation_transcript_for_manual( db: AsyncSession, conversation_id: str ) -> str: rows = await conversation_repo.get_conversation_messages(conversation_id, db) parts: list[str] = [] for row in rows: role = (row.role or "").lower() body = (row.content or "").strip() if not body: continue label = "用户" if role == "human" else "AI" out = _assistant_text_for_eval_display(body) if role != "human" else body parts.append(f"{label}: {out}") return "\n\n".join(parts) async def _user_transcript_evidence(db: AsyncSession, user_id: str) -> str: conversations = await conversation_repo.get_user_conversations(user_id, db) if not conversations: return "" parts: list[str] = [] for conv in reversed(conversations[:_MAX_EVIDENCE_CONVERSATIONS]): transcript = await _conversation_transcript_for_manual(db, str(conv.id)) if transcript: parts.append(f"## 会话 {str(conv.id)}\n{transcript}") return _trim_evidence_text("\n\n".join(parts)) def _normalize_title_key(title: str) -> str: t = (title or "").strip().lower() t = re.sub(r"^#+\s*", "", t) return re.sub(r"\s+", " ", t) def _baseline_for_chapter_title( baselines: list[MemoirSectionBaselineOut], chapter_title: str, index: int, ) -> MemoirSectionBaselineOut | None: if baselines: key = _normalize_title_key(chapter_title) for b in baselines: if _normalize_title_key(b.title) == key: return b if 0 <= index < len(baselines): return baselines[index] return None class EvalJudgeManualService: def __init__(self, db: AsyncSession) -> None: self._db = db async def judge_conversation( self, conversation_id: str, fixture_filename: str | None, ) -> dict[str, Any]: cid = (conversation_id or "").strip() if not cid: raise EvaluationBadRequestError("conversation_id is required") catalog = SessionCatalogService(self._db) dialogue = await catalog.get_session_dialogue(cid) if not dialogue: raise EvaluationNotFoundError("conversation not found") parts: list[str] = [] for m in dialogue.messages: r = (m.role or "").lower() label = "用户" if r == "human" else "AI" raw = m.content or "" out = _assistant_text_for_eval_display(raw) if r != "human" else raw parts.append(f"{label}: {out}") replay_transcript = "\n\n".join(parts) if not replay_transcript.strip(): raise EvaluationBadRequestError("no messages to judge") fn = (fixture_filename or "").strip() or None baseline_transcript = "" if fn: try: turns, _ = read_user_export_fixture(fn) baseline_transcript = _transcript_from_export_turns(turns) except ValueError as e: raise EvaluationBadRequestError(str(e)) from e except FileNotFoundError as e: raise EvaluationNotFoundError("fixture not found") from e errors: list[str] = [] judge_llm = get_eval_judge_langchain_llm() judge = EvalJudgeService(judge_llm) baseline_judge_dict: dict[str, Any] | None = None if baseline_transcript.strip(): bj = await judge.judge_conversation(full_transcript=baseline_transcript) if bj: baseline_judge_dict = bj.model_dump() else: errors.append("baseline_glm_failed") elif fn: errors.append("baseline_transcript_empty") rj = await judge.judge_conversation(full_transcript=replay_transcript) replay_judge_dict = rj.model_dump() if rj else None if not rj: errors.append("replay_glm_failed") return { "conversation_id": cid, "fixture_filename": fn, "baseline_transcript": baseline_transcript, "replay_transcript": replay_transcript, "baseline_judge": baseline_judge_dict, "replay_judge": replay_judge_dict, "errors": errors, } async def iter_conversation_judge_sse( self, conversation_id: str, fixture_filename: str | None, ) -> AsyncIterator[dict[str, Any]]: """供 SSE:先整体基准分、再整体回放分,再流式对比与建议。""" cid = (conversation_id or "").strip() if not cid: yield { "event": "error", "phase": "validate", "message": "conversation_id is required", } return catalog = SessionCatalogService(self._db) dialogue = await catalog.get_session_dialogue(cid) if not dialogue: yield { "event": "error", "phase": "load", "message": "conversation not found", } return parts: list[str] = [] for m in dialogue.messages: r = (m.role or "").lower() label = "用户" if r == "human" else "AI" raw = m.content or "" out = _assistant_text_for_eval_display(raw) if r != "human" else raw parts.append(f"{label}: {out}") replay_transcript = "\n\n".join(parts) if not replay_transcript.strip(): yield {"event": "error", "phase": "load", "message": "no messages to judge"} return fn = (fixture_filename or "").strip() or None baseline_transcript = "" if fn: try: turns, _ = read_user_export_fixture(fn) baseline_transcript = _transcript_from_export_turns(turns) except ValueError as e: yield {"event": "error", "phase": "fixture", "message": str(e)} return except FileNotFoundError: yield { "event": "error", "phase": "fixture", "message": "fixture not found", } return judge_llm = get_eval_judge_langchain_llm() if not judge_llm: yield { "event": "error", "phase": "config", "message": "评审 LLM 未配置(eval_judge_api_key / zhipu_api_key)", } return judge = EvalJudgeService(judge_llm) yield {"event": "meta", "conversation_id": cid, "fixture_filename": fn} if not baseline_transcript.strip(): yield { "event": "warning", "message": "未提供基准 MD 或基准无文本:仅对回放对话打分并输出单侧改进建议", } baseline_judge = None if baseline_transcript.strip(): baseline_judge = await judge.judge_conversation( full_transcript=baseline_transcript ) yield { "event": "baseline_judge", "ok": baseline_judge is not None, "judge": baseline_judge.model_dump() if baseline_judge else None, } if not baseline_judge: yield { "event": "error", "phase": "baseline_glm", "message": "基准整体打分失败(密钥、限流或 JSON 解析失败,见服务端日志)", } else: yield { "event": "baseline_judge", "ok": False, "skipped": True, "judge": None, } replay_judge = await judge.judge_conversation(full_transcript=replay_transcript) yield { "event": "replay_judge", "ok": replay_judge is not None, "judge": replay_judge.model_dump() if replay_judge else None, } if not replay_judge: yield { "event": "error", "phase": "replay_glm", "message": "回放对话整体 GLM 打分失败(空密钥、限流或 JSON 解析失败,见服务端日志)", } yield {"event": "done"} return async for piece in judge.stream_conversation_compare( baseline_transcript=baseline_transcript, replay_transcript=replay_transcript, baseline_judge=baseline_judge, replay_judge=replay_judge, ): if piece: yield {"event": "compare_delta", "text": piece} yield {"event": "done"} async def judge_memoir_for_user( self, user_id: str, baseline_sections: list[MemoirSectionBaselineOut] | None, ) -> dict[str, Any]: uid = (user_id or "").strip() if not uid: raise EvaluationBadRequestError("user_id is required") judge_llm = get_eval_judge_langchain_llm() judge = EvalJudgeService(judge_llm) baselines = list(baseline_sections or []) evidence_transcript = await _user_transcript_evidence(self._db, uid) chapter_results: list[dict[str, Any]] = [] try: chapters = await get_chapters_for_memoir_list( uid, self._db, active_only=True, is_new_only=None ) for i, ch in enumerate(chapters[:_MAX_EVAL_CHAPTERS]): body = (ch.canonical_markdown or "").strip() if not body: continue bl = _baseline_for_chapter_title(baselines, str(ch.title or ""), i) baseline_excerpt = "" if bl and (bl.body or "").strip(): baseline_excerpt = _clip_md_for_judge(bl.body, max_chars=6000) md = f"# 章节:{ch.title}\n\n" if baseline_excerpt: md += f"## 导出基线(节选)\n\n{baseline_excerpt}\n\n" md += f"## 当前成稿\n\n{_clip_md_for_judge(body)}" cj = await judge.judge_memoir( memoir_markdown=md, source_transcript=evidence_transcript, reference_memoir_markdown=baseline_excerpt, evidence_notes=( "严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。" ), ) chapter_results.append( { "id": ch.id, "title": ch.title, "order_index": ch.order_index, "baseline_title": bl.title if bl else None, "judge": cj.model_dump() if cj else None, } ) except Exception as e: logger.warning("manual memoir chapter judges failed: {}", e) story_results: list[dict[str, Any]] = [] try: stories = await get_stories_for_user(self._db, uid, status="active") for st in stories[:_MAX_EVAL_STORIES]: body = (st.canonical_markdown or "").strip() if not body: continue md = f"# 故事:{st.title}\n\n{_clip_md_for_judge(body)}" sj = await judge.judge_memoir( memoir_markdown=md, source_transcript=evidence_transcript, evidence_notes=( "严格按文档打分;真实性、事实覆盖率、可追溯性必须优先对照该用户历史访谈证据。" ), ) story_results.append( { "id": st.id, "title": st.title, "stage": st.stage, "judge": sj.model_dump() if sj else None, } ) except Exception as e: logger.warning("manual memoir story judges failed: {}", e) return { "user_id": uid, "chapter_results": chapter_results, "story_results": story_results, } async def memoir_snapshot(self, user_id: str) -> dict[str, Any]: uid = (user_id or "").strip() if not uid: raise EvaluationBadRequestError("user_id is required") chapters_out: list[dict[str, Any]] = [] stories_out: list[dict[str, Any]] = [] try: chapters = await get_chapters_for_memoir_list( uid, self._db, active_only=True, is_new_only=None ) for ch in chapters[:_MAX_EVAL_CHAPTERS]: chapters_out.append( { "id": ch.id, "title": ch.title, "category": ch.category, "order_index": ch.order_index, "canonical_markdown": ch.canonical_markdown, } ) except Exception as e: logger.warning("memoir snapshot chapters failed: {}", e) try: stories = await get_stories_for_user(self._db, uid, status="active") for st in stories[:_MAX_EVAL_STORIES]: stories_out.append( { "id": st.id, "title": st.title, "stage": st.stage, "canonical_markdown": st.canonical_markdown, } ) except Exception as e: logger.warning("memoir snapshot stories failed: {}", e) return { "user_id": uid, "chapters": chapters_out, "stories": stories_out, }