"""智谱 GLM-5 评审调用(结构化 JSON)。""" from __future__ import annotations from collections.abc import AsyncIterator from dataclasses import dataclass from typing import Any, Generic, TypeVar from app.core.config import settings from app.core.llm_call import LLMCallError, allm_json_call from app.core.logging import get_logger from app.features.evaluation.judge_schemas import ( ConversationJudgeOutput, MemoirJudgeOutput, TurnJudgeOutput, ) from app.features.evaluation.rubrics.conversation_v1 import ( COMPARE_CONV_STREAM_HINT, CONV_JUDGE_INSTRUCTIONS, TURN_JUDGE_INSTRUCTIONS, ) from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS logger = get_logger(__name__) TJudgeOutput = TypeVar( "TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput ) _TURN_MAX = 768 _CONV_JUDGE_JSON_MAX = 2048 _CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n" _MEMOIR_MAX = 12000 _MEMOIR_JSON_MAX = 1536 _COMPARE_STREAM_MAX = 6144 _MEMOIR_EVIDENCE_MAX = 12000 def _eval_judge_prompt_char_pool() -> int: """整段请求的字符预算(由评审模型 context window 推导,保守)。""" toks = ( settings.eval_judge_context_window_tokens - settings.eval_judge_completion_reserve_tokens - settings.eval_judge_prompt_budget_safety_tokens ) toks = max(1, toks) return max(1, int(toks / settings.eval_judge_approx_tokens_per_char)) def eval_judge_conversation_transcript_max_chars() -> int: """整段对话评审:【完整对话】transcript 最大字符数。""" if settings.eval_judge_max_transcript_chars > 0: return settings.eval_judge_max_transcript_chars overhead = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32 return max(1, _eval_judge_prompt_char_pool() - overhead) def eval_judge_turn_prior_transcript_max_chars() -> int: """逐轮评审:截至上一轮的 transcript 节选上限(含与用户/助手正文头的固定开销)。""" if settings.eval_judge_max_transcript_chars > 0: return settings.eval_judge_max_transcript_chars static = len(TURN_JUDGE_INSTRUCTIONS) + 8800 return max(1, _eval_judge_prompt_char_pool() - static) def eval_judge_compare_transcript_each_max_chars() -> int: """A/B 两段 transcript 同 prompt 时,每条 transcript 的上限(均分字符预算)。""" if settings.eval_judge_max_compare_transcript_chars_each > 0: return settings.eval_judge_max_compare_transcript_chars_each pool = _eval_judge_prompt_char_pool() - settings.eval_judge_compare_prompt_overhead_chars return max(1, pool // 2) @dataclass(slots=True) class JudgeCallResult(Generic[TJudgeOutput]): output: TJudgeOutput | None error: str | None = None def _judge_error_message(e: LLMCallError) -> str: prefix = { "invoke": "模型调用失败", "decode": "JSON 解析失败", "validation": "结果校验失败", }.get(e.kind, "评审失败") detail = str(e).strip() return f"{prefix}: {detail}" if detail else prefix def _build_memoir_judge_prompt( *, memoir_markdown: str, source_transcript: str = "", reference_memoir_markdown: str = "", evidence_notes: str = "", ) -> str: """Assemble an evidence-aware memoir judging prompt.""" source = (source_transcript or "").strip() reference = (reference_memoir_markdown or "").strip() notes = (evidence_notes or "").strip() sections = [ MEMOIR_JUDGE_INSTRUCTIONS, "", "【证据与输入顺序】以下区块按优先级给出:评审说明(若有)→ 原始访谈证据 → 参考基线(若有)→ 待评成稿。**真实性相关细项必须以原始访谈证据为准。**", "", ] if notes: sections.extend(["【评审说明】", notes[:1200], ""]) if source: sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""]) else: sections.extend( [ "【原始访谈/证据】", "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。", "", ] ) if reference: sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""]) sections.extend(["【当前回忆录正文】", memoir_markdown[:_MEMOIR_MAX]]) return "\n".join(sections) class EvalJudgeService: def __init__(self, judge_llm: Any | None) -> None: self._llm = judge_llm async def judge_turn( self, *, prior_transcript: str, user_utterance: str, assistant_reply: str, turn_index_0: int = 0, ) -> TurnJudgeOutput | None: if not self._llm: return None t = max(0, int(turn_index_0)) prompt = f"""{TURN_JUDGE_INSTRUCTIONS} 【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。 【截至上一轮的对话节选】(含 `[Turn k]` 标签) {prior_transcript[: eval_judge_turn_prior_transcript_max_chars()]} 【本轮用户】 {user_utterance[:4000]} 【本轮 AI】 {assistant_reply[:4000]} """ try: return await allm_json_call( self._llm, prompt, TurnJudgeOutput, max_tokens=_TURN_MAX, agent="EvalJudgeService.judge_turn", ) except LLMCallError as e: logger.warning("turn judge failed: {}", e) return None async def judge_conversation_result( self, *, full_transcript: str ) -> JudgeCallResult[ConversationJudgeOutput]: if not self._llm: return JudgeCallResult(output=None, error="评审模型未配置") prompt = f"""{CONV_JUDGE_INSTRUCTIONS} 【完整对话】(每轮以 `[Turn k]` 开头) {full_transcript[: eval_judge_conversation_transcript_max_chars()]} """ try: out = await allm_json_call( self._llm, prompt, ConversationJudgeOutput, max_tokens=_CONV_JUDGE_JSON_MAX, agent="EvalJudgeService.judge_conversation", ) return JudgeCallResult(output=out) except LLMCallError as e: error = _judge_error_message(e) logger.warning("conversation judge failed: {}", error) return JudgeCallResult(output=None, error=error) async def judge_conversation( self, *, full_transcript: str ) -> ConversationJudgeOutput | None: result = await self.judge_conversation_result(full_transcript=full_transcript) return result.output async def stream_conversation_compare( self, *, baseline_transcript: str, replay_transcript: str, baseline_judge: ConversationJudgeOutput | None, replay_judge: ConversationJudgeOutput | None, ) -> AsyncIterator[str]: """流式输出中文对比与建议(非 JSON)。""" if not self._llm: yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)" return cap_each = eval_judge_compare_transcript_each_max_chars() cap_single = eval_judge_conversation_transcript_max_chars() b_tr = (baseline_transcript or "").strip()[:cap_each] r_tr = (replay_transcript or "").strip()[:cap_each] b_json = ( baseline_judge.model_dump_json(ensure_ascii=False) if baseline_judge else "null" ) r_json = ( replay_judge.model_dump_json(ensure_ascii=False) if replay_judge else "null" ) if baseline_judge and replay_judge: prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块): 【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇) {b_tr} 【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成) {r_tr} 【A 的整体评分 JSON】 {b_json} 【B 的整体评分 JSON】 {r_json} 请依次撰写: 1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等); 2) B 相对 A 的优点与不足; 3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。 笔调简洁、偏执行清单。""" elif replay_judge: r_one = (replay_transcript or "").strip()[:cap_single] prompt = f"""{COMPARE_CONV_STREAM_HINT} 【回放/新测 transcript】 {r_one} 【整体评分 JSON】 {r_json} """ else: yield "[错误] 缺少回放对话评分,无法生成建议" return llm = self._llm if hasattr(llm, "bind"): llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX) try: async for chunk in llm.astream(prompt): piece = getattr(chunk, "content", None) if piece: yield piece except Exception as e: logger.warning("conversation compare stream failed: {}", e) yield f"\n\n[流式输出中断:{e}]" async def judge_memoir( self, *, memoir_markdown: str, source_transcript: str = "", reference_memoir_markdown: str = "", evidence_notes: str = "", ) -> MemoirJudgeOutput | None: result = await self.judge_memoir_result( memoir_markdown=memoir_markdown, source_transcript=source_transcript, reference_memoir_markdown=reference_memoir_markdown, evidence_notes=evidence_notes, ) return result.output async def judge_memoir_result( self, *, memoir_markdown: str, source_transcript: str = "", reference_memoir_markdown: str = "", evidence_notes: str = "", ) -> JudgeCallResult[MemoirJudgeOutput]: if not self._llm: return JudgeCallResult(output=None, error="评审模型未配置") prompt = _build_memoir_judge_prompt( memoir_markdown=memoir_markdown, source_transcript=source_transcript, reference_memoir_markdown=reference_memoir_markdown, evidence_notes=evidence_notes, ) try: out = await allm_json_call( self._llm, prompt, MemoirJudgeOutput, max_tokens=_MEMOIR_JSON_MAX, agent="EvalJudgeService.judge_memoir", ) return JudgeCallResult(output=out) except LLMCallError as e: error = _judge_error_message(e) logger.warning("memoir judge failed: {}", error) return JudgeCallResult(output=None, error=error)