feat/ eval
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
|
||||
from app.core.llm_call import LLMCallError, allm_json_call
|
||||
@@ -12,6 +13,7 @@ from app.features.evaluation.judge_schemas import (
|
||||
TurnJudgeOutput,
|
||||
)
|
||||
from app.features.evaluation.rubrics.conversation_v1 import (
|
||||
COMPARE_CONV_STREAM_HINT,
|
||||
CONV_JUDGE_INSTRUCTIONS,
|
||||
TURN_JUDGE_INSTRUCTIONS,
|
||||
)
|
||||
@@ -21,7 +23,9 @@ logger = get_logger(__name__)
|
||||
|
||||
# Passed as max_tokens to an LLM JSON call elsewhere in this module;
# presumably the budget for per-turn judge output — TODO confirm.
_TURN_MAX = 768
|
||||
# Character cap applied to each transcript (after strip) before it is
# embedded in a prompt.
_CONV_MAX = 8192
|
||||
# max_tokens for the conversation-level judge JSON call
# (agent "EvalJudgeService.judge_conversation").
_CONV_JUDGE_JSON_MAX = 2048
|
||||
# NOTE(review): no use is visible in this chunk; presumably a cap on memoir
# markdown length — confirm against judge_memoir.
_MEMOIR_MAX = 12000
|
||||
# max_tokens bound onto the LLM for the streamed A/B comparison text.
_COMPARE_STREAM_MAX = 6144
|
||||
|
||||
|
||||
class EvalJudgeService:
|
||||
@@ -75,13 +79,81 @@ class EvalJudgeService:
|
||||
self._llm,
|
||||
prompt,
|
||||
ConversationJudgeOutput,
|
||||
max_tokens=_TURN_MAX,
|
||||
max_tokens=_CONV_JUDGE_JSON_MAX,
|
||||
agent="EvalJudgeService.judge_conversation",
|
||||
)
|
||||
except LLMCallError as e:
|
||||
logger.warning("conversation judge failed: {}", e)
|
||||
return None
|
||||
|
||||
async def stream_conversation_compare(
    self,
    *,
    baseline_transcript: str,
    replay_transcript: str,
    baseline_judge: ConversationJudgeOutput | None,
    replay_judge: ConversationJudgeOutput | None,
) -> AsyncIterator[str]:
    """Stream a Chinese plain-text comparison and advice (not JSON).

    Two prompt variants are used:

    * both judge results present -> an A/B comparison of the baseline and
      replay transcripts together with their overall scores;
    * only the replay judge present -> ``COMPARE_CONV_STREAM_HINT`` followed
      by the replay transcript and its score.

    Args:
        baseline_transcript: Exported baseline conversation; ``None``/empty
            is treated as an empty string.
        replay_transcript: Replayed / newly generated conversation; handled
            the same way.
        baseline_judge: Overall judge result for the baseline, if any.
        replay_judge: Overall judge result for the replay. Required.

    Yields:
        Non-empty ``content`` pieces from the model stream. Failures are
        reported in-band instead of being raised: a single ``[错误] ...``
        message when no LLM is configured or the replay score is missing,
        and a trailing ``[流式输出中断:...]`` marker if the stream breaks.
    """
    if not self._llm:
        yield "[错误] 未配置评审模型 API Key(eval_judge_api_key / zhipu_api_key)"
        return
    # Both prompt variants need the replay score, so reject its absence
    # before doing any prompt preparation.
    if not replay_judge:
        yield "[错误] 缺少回放对话评分,无法生成建议"
        return

    # Transcripts are stripped and then capped at _CONV_MAX characters.
    r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
    # No ``ensure_ascii`` argument: model_dump_json already emits non-ASCII
    # text unescaped, and the keyword is only accepted by newer Pydantic
    # releases (older ones raise TypeError).
    r_json = replay_judge.model_dump_json()

    if baseline_judge:
        b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
        b_json = baseline_judge.model_dump_json()
        prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分(JSON)。请用中文直接写正文(不要用 JSON、不要用 Markdown 代码块):

【A:导出基准对话】(历史快照:用户与当时导出的线上 AI,多轮合并为一篇)
{b_tr}

【B:本次回放/新测对话】(用户句与基准对齐,AI 为当前后端重新生成)
{r_tr}

【A 的整体评分 JSON】
{b_json}

【B 的整体评分 JSON】
{r_json}

请依次撰写:
1) 两段对话在整体体验上的主要差异(共情、追问、重复感、自然度等);
2) B 相对 A 的优点与不足;
3) 若 B 在关键维度明显弱于 A,给出可操作的改进方向(系统提示、访谈策略、模型或温度等)。

笔调简洁、偏执行清单。"""
    else:
        prompt = f"""{COMPARE_CONV_STREAM_HINT}

【回放/新测 transcript】
{r_tr}

【整体评分 JSON】
{r_json}
"""

    llm = self._llm
    # Only objects exposing ``bind`` get the output-token cap; anything else
    # is streamed with its own defaults.
    if hasattr(llm, "bind"):
        llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
    try:
        async for chunk in llm.astream(prompt):
            piece = getattr(chunk, "content", None)
            if piece:  # skip chunks without content and empty deltas
                yield piece
    except Exception as e:
        # Deliberately broad: this is the streaming boundary, and text may
        # already have been yielded, so the failure is logged and surfaced
        # as a trailing marker rather than raised mid-stream.
        logger.warning("conversation compare stream failed: {}", e)
        yield f"\n\n[流式输出中断:{e}]"
|
||||
|
||||
async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
|
||||
if not self._llm:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user