- 回忆录细项上限收紧为合计 100 分,去掉 110 折算与 raw_dimension_total - judge_memoir 拼接原始访谈与可选导出基线;无证据时提示保守打真实性相关分 - 自动评测 run 与手动章节/故事评审统一带 transcript 证据(会话/用户聚合、截断) - 访谈打分仍为情绪强化版 15 细项、总分 100 - 评测台默认基准改为 zuckxu 导出 MD;移除逐轮用户句对齐表及相关逻辑 - 新增 judge schema 与 memoir prompt 组装的单元测试
215 lines
6.8 KiB
Python
215 lines
6.8 KiB
Python
"""GLM 评审调用(结构化 JSON)。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from collections.abc import AsyncIterator
|
||
from typing import Any
|
||
|
||
from app.core.llm_call import LLMCallError, allm_json_call
|
||
from app.core.logging import get_logger
|
||
from app.features.evaluation.judge_schemas import (
|
||
ConversationJudgeOutput,
|
||
MemoirJudgeOutput,
|
||
TurnJudgeOutput,
|
||
)
|
||
from app.features.evaluation.rubrics.conversation_v1 import (
|
||
COMPARE_CONV_STREAM_HINT,
|
||
CONV_JUDGE_INSTRUCTIONS,
|
||
TURN_JUDGE_INSTRUCTIONS,
|
||
)
|
||
from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
|
||
|
||
# Module-level logger; the warning calls in this file pass "{}"-style
# placeholders together with the exception object.
logger = get_logger(__name__)

# --- Size limits used by the judge calls in this module ---
# `max_tokens` for the per-turn judge call (judge_memoir passes it as well).
_TURN_MAX = 768
# Cap applied by slicing conversation transcripts before they enter a prompt.
_CONV_MAX = 8192
# `max_tokens` for the whole-conversation judge call.
_CONV_JUDGE_JSON_MAX = 2048
# Cap applied by slicing the memoir body placed in the memoir judge prompt.
_MEMOIR_MAX = 12000
# `max_tokens` bound onto the model for the streamed comparison text.
_COMPARE_STREAM_MAX = 6144
# Cap applied by slicing the source transcript and the reference memoir.
_MEMOIR_EVIDENCE_MAX = 12000
|
||
|
||
|
||
def _build_memoir_judge_prompt(
    *,
    memoir_markdown: str,
    source_transcript: str = "",
    reference_memoir_markdown: str = "",
    evidence_notes: str = "",
) -> str:
    """Assemble an evidence-aware memoir judging prompt.

    Sections are emitted in a fixed order and joined with newlines:
    the memoir rubric instructions, optional reviewer notes, the source
    interview evidence (or a fixed warning when there is none), an optional
    reference memoir, and finally the memoir text being judged.

    Args:
        memoir_markdown: Memoir text to be scored; sliced to ``_MEMOIR_MAX``.
            ``None`` is treated as an empty string.
        source_transcript: Raw interview evidence; stripped, then sliced to
            ``_MEMOIR_EVIDENCE_MAX``. When blank, a line telling the judge to
            score the evidence-dependent dimensions conservatively is
            inserted instead.
        reference_memoir_markdown: Optional baseline memoir; stripped, then
            sliced to ``_MEMOIR_EVIDENCE_MAX``. The section is omitted when
            blank.
        evidence_notes: Optional reviewer notes; stripped, then sliced to
            1200 characters. The section is omitted when blank.

    Returns:
        The complete prompt string.
    """
    # Normalize like the optional arguments so a None memoir produces an empty
    # body instead of a TypeError on slicing. The text is deliberately not
    # stripped, so output for ordinary strings is unchanged.
    memoir = memoir_markdown or ""
    source = (source_transcript or "").strip()
    reference = (reference_memoir_markdown or "").strip()
    notes = (evidence_notes or "").strip()

    sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
    if notes:
        sections.extend(["【评审说明】", notes[:1200], ""])
    if source:
        sections.extend(["【原始访谈/证据】", source[:_MEMOIR_EVIDENCE_MAX], ""])
    else:
        # Without evidence the judge is told explicitly not to award high
        # scores on the dimensions that depend on the source interview.
        sections.extend(
            [
                "【原始访谈/证据】",
                "无可用原始访谈证据。对于记忆忠实度、事实准确性、事实覆盖率、记忆可追溯性,必须保守打分,不得凭空高分。",
                "",
            ]
        )
    if reference:
        sections.extend(["【参考基线/导出成稿】", reference[:_MEMOIR_EVIDENCE_MAX], ""])
    sections.extend(["【当前回忆录正文】", memoir[:_MEMOIR_MAX]])
    return "\n".join(sections)
|
||
|
||
|
||
class EvalJudgeService:
    """Wrapper around the judge LLM used for evaluation scoring.

    The ``judge_*`` coroutines request a structured result through
    ``allm_json_call`` and return ``None`` when no model is configured or the
    call raises ``LLMCallError``. ``stream_conversation_compare`` yields
    free-form text chunks instead of a structured result.
    """

    def __init__(self, judge_llm: Any | None) -> None:
        """Store the judge model; a falsy value disables every method."""
        self._llm = judge_llm

    async def judge_turn(
        self,
        *,
        prior_transcript: str,
        user_utterance: str,
        assistant_reply: str,
    ) -> TurnJudgeOutput | None:
        """Score a single user/assistant exchange.

        Args:
            prior_transcript: Conversation so far; sliced to ``_CONV_MAX``.
            user_utterance: Current user message; sliced to 4000.
            assistant_reply: Current assistant message; sliced to 4000.

        Returns:
            The parsed ``TurnJudgeOutput``, or ``None`` when no model is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        # NOTE(review): slicing keeps the *start* of the prior transcript, so
        # for long sessions the most recent context is what gets dropped —
        # confirm that this is intended.
        history = prior_transcript[:_CONV_MAX]
        user_text = user_utterance[:4000]
        reply_text = assistant_reply[:4000]
        prompt = f"""{TURN_JUDGE_INSTRUCTIONS}

【截至上一轮的对话摘要/节选】
{history}

【本轮用户】
{user_text}

【本轮 AI】
{reply_text}
"""
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                TurnJudgeOutput,
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_turn",
            )
        except LLMCallError as exc:
            logger.warning("turn judge failed: {}", exc)
            return None
        return verdict

    async def judge_conversation(
        self, *, full_transcript: str
    ) -> ConversationJudgeOutput | None:
        """Score a whole conversation.

        Args:
            full_transcript: Complete transcript; sliced to ``_CONV_MAX``.

        Returns:
            The parsed ``ConversationJudgeOutput``, or ``None`` when no model
            is configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        transcript = full_transcript[:_CONV_MAX]
        prompt = f"""{CONV_JUDGE_INSTRUCTIONS}

【完整对话】
{transcript}
"""
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                ConversationJudgeOutput,
                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
        except LLMCallError as exc:
            logger.warning("conversation judge failed: {}", exc)
            return None
        return verdict

    async def stream_conversation_compare(
        self,
        *,
        baseline_transcript: str,
        replay_transcript: str,
        baseline_judge: ConversationJudgeOutput | None,
        replay_judge: ConversationJudgeOutput | None,
    ) -> AsyncIterator[str]:
        """Stream a free-form Chinese comparison with suggestions (not JSON).

        With both judge results present the prompt compares baseline (A) and
        replay (B); with only the replay result it asks for suggestions on the
        replay alone. Without a replay result, or without a configured model,
        a single error message is yielded and the stream ends.

        Yields:
            Non-empty ``content`` values of the streamed chunks. If streaming
            raises, a warning is logged and a final interruption notice
            containing the exception text is yielded.
        """
        if not self._llm:
            yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)"
            return

        baseline_text = (baseline_transcript or "").strip()[:_CONV_MAX]
        replay_text = (replay_transcript or "").strip()[:_CONV_MAX]

        # NOTE(review): presumably these are Pydantic models; `ensure_ascii`
        # may not be accepted by `model_dump_json` on every Pydantic release —
        # TODO confirm against the pinned version.
        baseline_scores = "null"
        if baseline_judge:
            baseline_scores = baseline_judge.model_dump_json(ensure_ascii=False)
        replay_scores = "null"
        if replay_judge:
            replay_scores = replay_judge.model_dump_json(ensure_ascii=False)

        # Nothing can be generated without the replay scores.
        if not replay_judge:
            yield "[错误] 缺少回放对话评分,无法生成建议"
            return

        if baseline_judge:
            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):

【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
{baseline_text}

【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
{replay_text}

【A 的整体评分 JSON】
{baseline_scores}

【B 的整体评分 JSON】
{replay_scores}

请依次撰写:
1) 两段对话在整体体验上的主要差异(情绪承接、信息挖掘、人物建模、访谈结构、提问质量、上下文与重复盘问等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。

笔调简洁、偏执行清单。"""
        else:
            prompt = f"""{COMPARE_CONV_STREAM_HINT}

【回放/新测 transcript】
{replay_text}

【整体评分 JSON】
{replay_scores}
"""

        # Apply the output cap only when the model object offers `bind`.
        model = self._llm
        streamer = (
            model.bind(max_tokens=_COMPARE_STREAM_MAX)
            if hasattr(model, "bind")
            else model
        )
        try:
            async for chunk in streamer.astream(prompt):
                text = getattr(chunk, "content", None)
                if not text:
                    continue
                yield text
        except Exception as exc:
            # Best effort: report the failure inside the stream rather than
            # raising into the consumer.
            logger.warning("conversation compare stream failed: {}", exc)
            yield f"\n\n[流式输出中断:{exc}]"

    async def judge_memoir(
        self,
        *,
        memoir_markdown: str,
        source_transcript: str = "",
        reference_memoir_markdown: str = "",
        evidence_notes: str = "",
    ) -> MemoirJudgeOutput | None:
        """Score a memoir, optionally against source evidence and a baseline.

        All four arguments are forwarded unchanged to
        ``_build_memoir_judge_prompt``.

        Returns:
            The parsed ``MemoirJudgeOutput``, or ``None`` when no model is
            configured or the call raises ``LLMCallError``.
        """
        if not self._llm:
            return None

        prompt = _build_memoir_judge_prompt(
            memoir_markdown=memoir_markdown,
            source_transcript=source_transcript,
            reference_memoir_markdown=reference_memoir_markdown,
            evidence_notes=evidence_notes,
        )
        try:
            verdict = await allm_json_call(
                self._llm,
                prompt,
                MemoirJudgeOutput,
                # NOTE(review): this reuses the per-turn output budget;
                # confirm it is large enough for the memoir result.
                max_tokens=_TURN_MAX,
                agent="EvalJudgeService.judge_memoir",
            )
        except LLMCallError as exc:
            logger.warning("memoir judge failed: {}", exc)
            return None
        return verdict
|