feat/ eval

2026-04-06 23:19:20 +08:00
parent 2fded6fbd9
commit 29dec8fe32
13 changed files with 2266 additions and 683 deletions
--- a/api/app/features/evaluation/judge_service.py
+++ b/api/app/features/evaluation/judge_service.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+from collections.abc import AsyncIterator
 from typing import Any

 from app.core.llm_call import LLMCallError, allm_json_call
@@ -12,6 +13,7 @@ from app.features.evaluation.judge_schemas import (
    TurnJudgeOutput,
 )
 from app.features.evaluation.rubrics.conversation_v1 import (
+    COMPARE_CONV_STREAM_HINT,
    CONV_JUDGE_INSTRUCTIONS,
    TURN_JUDGE_INSTRUCTIONS,
 )
@@ -21,7 +23,9 @@ logger = get_logger(__name__)

 _TURN_MAX = 768
 _CONV_MAX = 8192
+_CONV_JUDGE_JSON_MAX = 2048  # max_tokens passed to the conversation judge's JSON call
 _MEMOIR_MAX = 12000
+_COMPARE_STREAM_MAX = 6144  # max_tokens bound onto the LLM for the streamed comparison


 class EvalJudgeService:
@@ -75,13 +79,81 @@ class EvalJudgeService:
                self._llm,
                prompt,
                ConversationJudgeOutput,
-                max_tokens=_TURN_MAX,
+                max_tokens=_CONV_JUDGE_JSON_MAX,
                agent="EvalJudgeService.judge_conversation",
            )
        except LLMCallError as e:
            logger.warning("conversation judge failed: {}", e)
            return None

+    async def stream_conversation_compare(
+        self,
+        *,
+        baseline_transcript: str,
+        replay_transcript: str,
+        baseline_judge: ConversationJudgeOutput | None,
+        replay_judge: ConversationJudgeOutput | None,
+    ) -> AsyncIterator[str]:
+        """Stream a Chinese free-text comparison and advice (plain text, not JSON).
+
+        Two prompt shapes are supported:
+
+        * both judges present: an A/B prompt containing both transcripts and
+          both overall-score JSON documents;
+        * only ``replay_judge`` present: ``COMPARE_CONV_STREAM_HINT`` followed
+          by the replay transcript and its score JSON.
+
+        Args:
+            baseline_transcript: Text of the exported baseline conversation;
+                ``None``/empty is treated as an empty string.
+            replay_transcript: Text of the replayed conversation; same handling.
+            baseline_judge: Overall judge result for the baseline, or ``None``.
+            replay_judge: Overall judge result for the replay. Required.
+
+        Yields:
+            Each truthy ``content`` value of the chunks produced by
+            ``llm.astream(prompt)``. Failures are reported in-band: a single
+            ``[错误] ...`` line when the LLM is not configured or
+            ``replay_judge`` is missing, and a trailing interruption marker if
+            the stream raises part-way through.
+        """
+        if not self._llm:
+            yield "[错误] 未配置评审模型 API Key（eval_judge_api_key / zhipu_api_key）"
+            return
+        # Both prompt variants need the replay score, so reject its absence
+        # before doing any transcript or JSON preparation.
+        if not replay_judge:
+            yield "[错误] 缺少回放对话评分，无法生成建议"
+            return
+
+        # Transcripts are stripped and capped at _CONV_MAX characters.
+        r_tr = (replay_transcript or "").strip()[:_CONV_MAX]
+        # No ``ensure_ascii`` keyword: Pydantic v2 already emits non-ASCII text
+        # unescaped by default, and releases that predate the keyword reject it
+        # with a TypeError.
+        r_json = replay_judge.model_dump_json()
+
+        if baseline_judge:
+            b_tr = (baseline_transcript or "").strip()[:_CONV_MAX]
+            b_json = baseline_judge.model_dump_json()
+            prompt = f"""你是访谈对话评测专家。下面给出两份完整对话 transcript 及各自的整体打分（JSON）。请用中文直接写正文（不要用 JSON、不要用 Markdown 代码块）：
+
+【A：导出基准对话】（历史快照：用户与当时导出的线上 AI，多轮合并为一篇）
+{b_tr}
+
+【B：本次回放/新测对话】（用户句与基准对齐，AI 为当前后端重新生成）
+{r_tr}
+
+【A 的整体评分 JSON】
+{b_json}
+
+【B 的整体评分 JSON】
+{r_json}
+
+请依次撰写：
+1) 两段对话在整体体验上的主要差异（共情、追问、重复感、自然度等）；
+2) B 相对 A 的优点与不足；
+3) 若 B 在关键维度明显弱于 A，给出可操作的改进方向（系统提示、访谈策略、模型或温度等）。
+
+笔调简洁、偏执行清单。"""
+        else:
+            prompt = f"""{COMPARE_CONV_STREAM_HINT}
+
+【回放/新测 transcript】
+{r_tr}
+
+【整体评分 JSON】
+{r_json}
+"""
+
+        llm = self._llm
+        # Apply the output-token cap only when the client exposes ``bind``.
+        if hasattr(llm, "bind"):
+            llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
+        try:
+            async for chunk in llm.astream(prompt):
+                piece = getattr(chunk, "content", None)
+                if piece:
+                    yield piece
+        except Exception as e:
+            # Deliberately broad: part of the answer may already have been
+            # delivered, so the failure is logged and reported in-band rather
+            # than raised into the consumer.
+            logger.warning("conversation compare stream failed: {}", e)
+            yield f"\n\n[流式输出中断：{e}]"
+
+
    async def judge_memoir(self, *, memoir_markdown: str) -> MemoirJudgeOutput | None:
        if not self._llm:
            return None