Files
life-echo/api/app/features/evaluation/judge_service.py
Kevin 5972b0e721 feat(evaluation): 成稿 100 分 rubric、证据评审与评测台调整
- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total
- judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分
- 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断)
- 访谈打分仍为情绪强化版 15 细项、总分 100
- 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑
- 新增 judge schema 与 memoir prompt 组装的单元测试
2026-04-07 10:36:22 +08:00

215 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""GLM judge calls (structured JSON)."""
from __future__ import annotations
from collections.abc import AsyncIterator
from typing import Any
from app.core.llm_call import LLMCallError, allm_json_call
from app.core.logging import get_logger
from app.features.evaluation.judge_schemas import (
ConversationJudgeOutput,
MemoirJudgeOutput,
TurnJudgeOutput,
)
from app.features.evaluation.rubrics.conversation_v1 import (
COMPARE_CONV_STREAM_HINT,
CONV_JUDGE_INSTRUCTIONS,
TURN_JUDGE_INSTRUCTIONS,
)
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# max_tokens passed to the judge call in judge_turn (judge_memoir reuses it too).
_TURN_MAX = 768
# Character cap: transcripts are sliced to this length before being put in a prompt.
_CONV_MAX = 8192
# max_tokens passed to the judge call in judge_conversation.
_CONV_JUDGE_JSON_MAX = 2048
# Character cap: the memoir body is sliced to this length in the memoir prompt.
_MEMOIR_MAX = 12000
# max_tokens bound onto the model for the streamed comparison write-up.
_COMPARE_STREAM_MAX = 6144
# Character cap applied to the source transcript and to the reference memoir.
_MEMOIR_EVIDENCE_MAX = 12000
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    The prompt is a newline-joined sequence of sections, in this order:

    1. ``MEMOIR_JUDGE_INSTRUCTIONS`` followed by an empty line.
    2. Reviewer notes (only when non-blank), capped at 1200 characters.
    3. The source interview/evidence, capped at ``_MEMOIR_EVIDENCE_MAX``
       characters; when no evidence is available a fixed sentence is emitted
       instead, telling the judge to score the fidelity/accuracy related
       dimensions conservatively.
    4. The reference/baseline memoir (only when non-blank), capped at
       ``_MEMOIR_EVIDENCE_MAX`` characters.
    5. The memoir under review, capped at ``_MEMOIR_MAX`` characters.

    Args:
        memoir_markdown: Memoir text to be judged. ``None`` is tolerated and
            treated as an empty body.
        source_transcript: Original interview text used as evidence.
        reference_memoir_markdown: Optional baseline memoir for comparison.
        evidence_notes: Optional free-form notes for the judge.

    Returns:
        The complete prompt string.
    """
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()
    # Guard the memoir the same way as the optional inputs so that a None value
    # produces an empty section rather than a TypeError on slicing. It is
    # deliberately not stripped, so output for real strings is unchanged.
    memoir = memoir_markdown or ""

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])
    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # Without evidence the judge is told explicitly not to award high
        # scores on the evidence-dependent dimensions.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])
    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
class EvalJudgeService:
    """Judge interview turns, whole conversations and memoirs with an LLM.

    When no judge model is configured (``judge_llm`` is falsy) the JSON judges
    return ``None`` and the streaming comparison yields a single error line.
    """

    def __init__(self, judge_llm: Any | None) -> None:
        """Store the judge model handle; a falsy value disables judging."""
        self._llm = judge_llm

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
    ) -> TurnJudgeOutput | None:
        """Judge one user/assistant exchange in the context of earlier turns.

        The prior transcript is cut to its first ``_CONV_MAX`` characters and
        each side of the current turn to its first 4000 characters.

        Returns:
            The value produced by ``allm_json_call`` (requested with the
            ``TurnJudgeOutput`` schema), or ``None`` when no model is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        # NOTE(review): slicing keeps the head of the prior transcript, so the
        # turns closest to the one being judged are dropped first on overflow
        # -- confirm callers pre-trim if recency matters.
        context_excerpt = prior_transcript[:_CONV_MAX]
        user_excerpt = user_utterance[:4000]
        reply_excerpt = assistant_reply[:4000]
        prompt = (
            f"{TURN_JUDGE_INSTRUCTIONS}\n"
            "【截至上一轮的对话摘要/节选】\n"
            f"{context_excerpt}\n"
            "【本轮用户】\n"
            f"{user_excerpt}\n"
            "【本轮 AI】\n"
            f"{reply_excerpt}\n"
        )

        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
            )
        except LLMCallError as exc:
            logger.warning("turn judge failed: {}", exc)
            return None
        return verdict

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Judge a complete conversation transcript.

        The transcript is cut to its first ``_CONV_MAX`` characters.

        Returns:
            The value produced by ``allm_json_call`` (requested with the
            ``ConversationJudgeOutput`` schema), or ``None`` when no model is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        transcript_excerpt = full_transcript[:_CONV_MAX]
        prompt = (
            f"{CONV_JUDGE_INSTRUCTIONS}\n"
            "【完整对话】\n"
            f"{transcript_excerpt}\n"
        )

        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
        except LLMCallError as exc:
            logger.warning("conversation judge failed: {}", exc)
            return None
        return verdict

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a Chinese comparison and advice text (plain text, not JSON).

        With both judge results present, the model is asked to compare the
        baseline (A) and replay (B) conversations. With only the replay
        result, a single-conversation prompt is used. Without a replay result
        one error line is yielded and the stream ends.

        Yields:
            Each truthy ``content`` attribute of the chunks produced by the
            model's ``astream``. If streaming raises, the failure is logged
            and one final line describing the interruption is yielded.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Keyeval_judge_api_key / zhipu_api_key"
            return

        # Both transcripts and both score dumps are prepared up front,
        # before deciding which prompt (if any) will be used.
        baseline_text = (baseline_transcript or "").strip()[:_CONV_MAX]
        replay_text = (replay_transcript or "").strip()[:_CONV_MAX]

        if baseline_judge:
            baseline_scores = baseline_judge.model_dump_json(ensure_ascii=False)
        else:
            baseline_scores = "null"
        if replay_judge:
            replay_scores = replay_judge.model_dump_json(ensure_ascii=False)
        else:
            replay_scores = "null"

        if not replay_judge:
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        if baseline_judge:
            # Two-way comparison: A is the exported baseline, B the replay.
            prompt = (
                "你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分JSON。请用中文直接写正文不要用 JSON、不要用 Markdown 代码块):\n"
                "【A导出基准对话】历史快照用户与当时导出的线上 AI多轮合并为一篇\n"
                f"{baseline_text}\n"
                "【B本次回放/新测对话】用户句与基准对齐AI 为当前后端重新生成)\n"
                f"{replay_text}\n"
                "【A 的整体评分 JSON】\n"
                f"{baseline_scores}\n"
                "【B 的整体评分 JSON】\n"
                f"{replay_scores}\n"
                "请依次撰写:\n"
                "1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);\n"
                "2) B 相对 A 的优点与不足;\n"
                "3) 若 B 在关键维度明显弱于 A给出可操作的改进方向系统提示、访谈策略、模型或温度等\n"
                "笔调简洁、偏执行清单。"
            )
        else:
            # Only the replay was judged: advice on that conversation alone.
            prompt = (
                f"{COMPARE_CONV_STREAM_HINT}\n"
                "【回放/新测 transcript】\n"
                f"{replay_text}\n"
                "【整体评分 JSON】\n"
                f"{replay_scores}\n"
            )

        streamer = self._llm
        # Apply the output token budget only when the model object offers bind().
        if hasattr(streamer, "bind"):
            streamer = streamer.bind(max_tokens=_COMPARE_STREAM_MAX)

        try:
            async for chunk in streamer.astream(prompt):
                text = getattr(chunk, "content", None)
                if not text:
                    continue
                yield text
        except Exception as exc:
            # Best effort: surface the failure in the stream instead of raising.
            logger.warning("conversation compare stream failed: {}", exc)
            yield f"\n\n[流式输出中断:{exc}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Judge a memoir, optionally against interview evidence and a baseline.

        The prompt is assembled by ``_build_memoir_judge_prompt`` from the
        four text inputs, which are forwarded unchanged.

        Returns:
            The value produced by ``allm_json_call`` (requested with the
            ``MemoirJudgeOutput`` schema), or ``None`` when no model is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )

        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                # NOTE(review): this reuses the per-turn token budget; confirm
                # it is large enough for the memoir rubric's JSON output.
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_memoir",
            )
        except LLMCallError as exc:
            logger.warning("memoir judge failed: {}", exc)
            return None
        return verdict