# life-echo/api/app/features/evaluation/judge_service.py
"""Zhipu GLM-5 judge calls (structured JSON)."""
from __future__ import annotations
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Any, Generic, TypeVar
from app.core.config import settings
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
ConversationJudgeOutput,
MemoirJudgeOutput,
TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
COMPARE_CONV_STREAM_HINT,
CONV_JUDGE_INSTRUCTIONS,
TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
# Module-level logger, named after this module.
logger = get_logger(__name__)

# Type variable constrained to the three judge output schemas. It is the
# type parameter of JudgeCallResult, so a result carries the schema of the
# judge call that produced it.
TJudgeOutput = TypeVar(
    "TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
)
# max_tokens for the per-turn judge JSON response.
_TURN_MAX = 768
# max_tokens for the whole-conversation judge JSON response.
_CONV_JUDGE_JSON_MAX = 2048
# Transcript header whose length is counted as fixed overhead when the
# conversation transcript budget is derived from the context window.
_CONV_HEADER = "【完整对话】(每轮以 `[Turn k]` 开头)\n\n"
# Character cap applied to the memoir body placed in the memoir judge prompt.
_MEMOIR_MAX = 12000
# max_tokens for the memoir judge JSON response.
_MEMOIR_JSON_MAX = 1536
# max_tokens bound onto the LLM for the streamed A/B comparison text.
_COMPARE_STREAM_MAX = 6144
# Character cap applied to the source transcript and to the reference memoir
# in the memoir judge prompt.
_MEMOIR_EVIDENCE_MAX = 12000
def _eval_judge_prompt_char_pool() -> int:
    """Return the character budget for one whole judge request.

    The budget starts from the judge model's configured context window,
    subtracts the completion reserve and the safety margin (all in tokens),
    and converts the remainder to characters using the configured
    tokens-per-character ratio. Both the token count and the resulting
    character count are floored at 1.
    """
    prompt_tokens = settings.eval_judge_context_window_tokens
    prompt_tokens = prompt_tokens - settings.eval_judge_completion_reserve_tokens
    prompt_tokens = prompt_tokens - settings.eval_judge_prompt_budget_safety_tokens
    if prompt_tokens < 1:
        # Misconfigured reserves must not produce a zero or negative budget.
        prompt_tokens = 1
    approx_chars = int(prompt_tokens / settings.eval_judge_approx_tokens_per_char)
    return max(1, approx_chars)
def eval_judge_conversation_transcript_max_chars() -> int:
    """Return the maximum transcript length for the whole-conversation judge.

    A positive ``eval_judge_max_transcript_chars`` setting is used as-is.
    Otherwise the cap is the request character pool minus the fixed prompt
    overhead (judge instructions, transcript header and a 32-character
    allowance), floored at 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    fixed_chars = len(CONV_JUDGE_INSTRUCTIONS) + len(_CONV_HEADER) + 32
    remaining = _eval_judge_prompt_char_pool() - fixed_chars
    return max(1, remaining)
def eval_judge_turn_prior_transcript_max_chars() -> int:
    """Return the cap for the prior-transcript excerpt in a per-turn judge prompt.

    A positive ``eval_judge_max_transcript_chars`` setting is used as-is.
    Otherwise the cap is the request character pool minus the turn judge
    instructions and a fixed 8800-character allowance, floored at 1.
    """
    configured_cap = settings.eval_judge_max_transcript_chars
    if configured_cap > 0:
        return configured_cap
    # NOTE(review): 8800 presumably covers the two 4000-char utterance caps
    # used by judge_turn plus their labels; confirm before changing either.
    fixed_chars = len(TURN_JUDGE_INSTRUCTIONS) + 8800
    remaining = _eval_judge_prompt_char_pool() - fixed_chars
    return max(1, remaining)
def eval_judge_compare_transcript_each_max_chars() -> int:
    """Return the per-transcript cap when A and B share one comparison prompt.

    A positive ``eval_judge_max_compare_transcript_chars_each`` setting is
    used as-is. Otherwise the request character pool, minus the configured
    comparison-prompt overhead, is split evenly between the two transcripts
    and floored at 1.
    """
    configured_cap = settings.eval_judge_max_compare_transcript_chars_each
    if configured_cap > 0:
        return configured_cap
    shared_chars = (
        _eval_judge_prompt_char_pool()
        - settings.eval_judge_compare_prompt_overhead_chars
    )
    per_transcript = shared_chars // 2
    return max(1, per_transcript)
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of one judge call: the parsed output or an error message.

    The ``*_result`` methods of the judge service in this module return it
    so that callers can surface why a judgement is missing instead of
    receiving a bare ``None``.
    """

    # Parsed judge output; the service sets it to None when the call fails
    # or when no judge model is configured.
    output: TJudgeOutput | None
    # Human-readable failure description; stays None unless a caller sets it.
    error: str | None = None
def _judge_error_message(e: LLMCallError) -> str:
    """Render an LLM call error as a short user-facing message.

    The error's ``kind`` selects a fixed label (with a generic fallback for
    unknown kinds). The stripped exception text, when non-empty, is appended
    after a colon.
    """
    labels = {
        "invoke": "模型调用失败",
        "decode": "JSON 解析失败",
        "validation": "结果校验失败",
    }
    label = labels.get(e.kind, "评审失败")
    text = str(e).strip()
    if not text:
        return label
    return f"{label}: {text}"
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    Sections are emitted in priority order: judge instructions, an ordering
    note, optional reviewer notes, the source interview evidence (or an
    explicit "no evidence" warning), an optional reference memoir, and
    finally the memoir under review. Every free-text input is truncated to a
    fixed character cap.

    Args:
        memoir_markdown: Memoir text to be judged; ``None`` is treated as
            empty, like the other inputs.
        source_transcript: Original interview transcript used as evidence.
        reference_memoir_markdown: Baseline/exported memoir for comparison.
        evidence_notes: Extra reviewer notes placed before the evidence.

    Returns:
        The prompt, with sections joined by newlines.
    """
    notes_max_chars = 1200  # cap on the free-form reviewer notes

    # Normalize every input the same way so a None never reaches a slice.
    memoir = memoir_markdown or ""
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()

    sections = [
        MEMOIR_JUDGE_INSTRUCTIONS,
        "",
        "【证据与输入顺序】以下区块按优先级给出:评审说明(若有)→ 原始访谈证据 → 参考基线(若有)→ 待评成稿。**真实性相关细项必须以原始访谈证据为准。**",
        "",
    ]
    if notes:
        sections.extend(["【评审说明】", notes[:notes_max_chars], ""])
    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # Without evidence, tell the judge explicitly to score the
        # faithfulness-related items conservatively.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])
    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
    """Run judge-model evaluations of turns, conversations and memoirs.

    Structured judgements go through ``allm_json_call`` with a schema and a
    ``max_tokens`` cap. The A/B comparison is streamed as free text. When no
    judge LLM is configured, every method degrades gracefully (``None``, an
    error result, or an error chunk) instead of raising.
    """

    def __init__(self, judge_llm: Any | None) -> None:
        """Store the judge LLM client; a falsy value disables judging."""
        self._llm = judge_llm

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
        turn_index_0: int = 0,
    ) -> TurnJudgeOutput | None:
        """Judge a single turn given the conversation so far.

        Args:
            prior_transcript: Transcript up to the previous turn; truncated
                to ``eval_judge_turn_prior_transcript_max_chars()``.
            user_utterance: User text of this turn (first 4000 chars used).
            assistant_reply: AI reply of this turn (first 4000 chars used).
            turn_index_0: Zero-based turn index; negative values clamp to 0.
                The prompt shows it one-based.

        Returns:
            The parsed judgement, or ``None`` when no judge LLM is configured
            or the call fails (the failure is logged as a warning).
        """
        if not self._llm:
            return None
        t = max(0, int(turn_index_0))
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
本轮位置完整对话中当前轮次为 Turn {t + 1}与下方节选及全量 transcript `[Turn ...]` 编号一致evidence_refs.turn_index 请使用该编号
截至上一轮的对话节选 `[Turn k]` 标签
{prior_transcript[: eval_judge_turn_prior_transcript_max_chars()]}
本轮用户
{user_utterance[:4000]}
本轮 AI
{assistant_reply[:4000]}
"""
        try:
            return await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
            )
        except LLMCallError as e:
            logger.warning("turn judge failed: {}", e)
            return None

    async def judge_conversation_result(
        self, *, full_transcript: str
    ) -> JudgeCallResult[ConversationJudgeOutput]:
        """Judge a whole conversation and report either output or error.

        The transcript is truncated to
        ``eval_judge_conversation_transcript_max_chars()`` before it is
        placed in the prompt.

        Returns:
            A result whose ``output`` is the parsed judgement on success.
            Otherwise ``output`` is ``None`` and ``error`` describes the
            failure (missing judge model or a failed LLM call).
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
完整对话每轮以 `[Turn k]` 开头
{full_transcript[: eval_judge_conversation_transcript_max_chars()]}
"""
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("conversation judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a whole conversation, returning only the output (or ``None``)."""
        result = await self.judge_conversation_result(full_transcript=full_transcript)
        return result.output

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a Chinese-language comparison and suggestions (not JSON).

        With both judgements present, the prompt contains both transcripts
        (each capped by ``eval_judge_compare_transcript_each_max_chars()``)
        and both score JSONs. With only the replay judgement, it contains
        the replay transcript (capped by
        ``eval_judge_conversation_transcript_max_chars()``) and its score.

        Yields:
            Text pieces taken from each streamed chunk's ``content``. A
            single error line is yielded instead when no judge LLM is
            configured or the replay judgement is missing. If streaming
            raises, the failure is logged and an interruption marker is
            yielded as the last piece.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Keyeval_judge_api_key / zhipu_api_key"
            return
        if not replay_judge:
            # Neither prompt variant can be built without the replay scores.
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        r_json = replay_judge.model_dump_json(ensure_ascii=False)
        if baseline_judge:
            # Two transcripts share one prompt, so each gets the smaller cap.
            cap_each = eval_judge_compare_transcript_each_max_chars()
            b_tr = (baseline_transcript or "").strip()[:cap_each]
            r_tr = (replay_transcript or "").strip()[:cap_each]
            b_json = baseline_judge.model_dump_json(ensure_ascii=False)
            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):
A导出基准对话历史快照用户与当时导出的线上 AI多轮合并为一篇
{b_tr}
B本次回放/新测对话用户句与基准对齐AI 为当前后端重新生成
{r_tr}
A 的整体评分 JSON
{b_json}
B 的整体评分 JSON
{r_json}
请依次撰写
1) 两段对话在整体体验上的主要差异情绪承接信息挖掘人物建模访谈结构提问质量上下文与重复盘问等
2) B 相对 A 的优点与不足
3) B 在关键维度明显弱于 A给出可操作的改进方向系统提示访谈策略模型或温度等
笔调简洁偏执行清单"""
        else:
            # Only the replay side exists: use the single-transcript cap.
            cap_single = eval_judge_conversation_transcript_max_chars()
            r_one = (replay_transcript or "").strip()[:cap_single]
            prompt = f"""{COMPARE_CONV_STREAM_HINT}
回放/新测 transcript
{r_one}
整体评分 JSON
{r_json}
"""

        llm = self._llm
        if hasattr(llm, "bind"):
            # Cap the streamed completion length when the client supports it.
            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
        try:
            async for chunk in llm.astream(prompt):
                piece = getattr(chunk, "content", None)
                if piece:
                    yield piece
        except Exception as e:
            # Best-effort boundary: report the interruption to the consumer
            # in-band instead of breaking the stream with an exception.
            logger.warning("conversation compare stream failed: {}", e)
            yield f"\n\n[流式输出中断:{e}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir, returning only the output (or ``None`` on failure)."""
        result = await self.judge_memoir_result(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        return result.output

    async def judge_memoir_result(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> JudgeCallResult[MemoirJudgeOutput]:
        """Judge a memoir against its evidence and report output or error.

        Args:
            memoir_markdown: Memoir text under review.
            source_transcript: Original interview transcript used as evidence.
            reference_memoir_markdown: Optional baseline memoir.
            evidence_notes: Optional reviewer notes.

        Returns:
            A result whose ``output`` is the parsed judgement on success.
            Otherwise ``output`` is ``None`` and ``error`` describes the
            failure (missing judge model or a failed LLM call).
        """
        if not self._llm:
            return JudgeCallResult(output=None, error="评审模型未配置")
        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        try:
            out = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                max_tokens=_MEMOIR_JSON_MAX,
                agent="EvalJudgeService.judge_memoir",
            )
            return JudgeCallResult(output=out)
        except LLMCallError as e:
            error = _judge_error_message(e)
            logger.warning("memoir judge failed: {}", error)
            return JudgeCallResult(output=None, error=error)