feat(eval): internal-eval stack, judge fixes, and eval web overhaul
- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps it with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY is for attaching :8001 when :8000 is already up; documented in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest uses a nested transaction for embedding/enrichment rollback safety.
- Conversation WS: use logger.exception for pipeline errors (avoids a loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hashed user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
This commit is contained in:
@@ -3,7 +3,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncIterator
|
||||
from typing import Any
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Generic, TypeVar
|
||||
|
||||
from app.core.llm_call import LLMCallError, allm_json_call
|
||||
from app.core.logging import get_logger
|
||||
@@ -21,14 +22,35 @@ from app.features.evaluation.rubrics.memoir_v1 import MEMOIR_JUDGE_INSTRUCTIONS
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Type variable for the structured judge payloads. It is constrained (not
# bound) to the three judge output models, so a parameterized
# ``JudgeCallResult[...]`` always names exactly one of them.
TJudgeOutput = TypeVar(
    "TJudgeOutput", TurnJudgeOutput, ConversationJudgeOutput, MemoirJudgeOutput
)
|
||||
|
||||
# Size limits used by the judge calls in this module.

# NOTE(review): presumably the max_tokens budget for the per-turn judge reply;
# confirm at the call site.
_TURN_MAX = 768
# Slice length applied to a transcript before it is embedded in a judge prompt.
_CONV_MAX = 8192
# max_tokens passed to the conversation judge's JSON call.
_CONV_JUDGE_JSON_MAX = 2048
# NOTE(review): presumably the slice length for memoir text in prompts; confirm.
_MEMOIR_MAX = 12000
# max_tokens passed to the memoir judge's JSON call.
_MEMOIR_JSON_MAX = 1536
# NOTE(review): presumably a limit for the streamed conversation comparison;
# confirm at the call site.
_COMPARE_STREAM_MAX = 6144
# NOTE(review): presumably the slice length for memoir source evidence; confirm.
_MEMOIR_EVIDENCE_MAX = 12000
|
||||
|
||||
|
||||
@dataclass(slots=True)
class JudgeCallResult(Generic[TJudgeOutput]):
    """Outcome of a single judge call: a parsed output or an error message.

    Lets callers surface *why* a judge call produced no output instead of
    receiving a bare ``None``.
    """

    # Parsed judge output; ``None`` when the call did not produce one.
    output: TJudgeOutput | None
    # Human-readable failure reason; stays ``None`` unless one is supplied.
    error: str | None = None
|
||||
|
||||
|
||||
def _judge_error_message(e: LLMCallError) -> str:
    """Turn a failed LLM call into a short, user-facing error string.

    Args:
        e: The error raised by the judge's LLM call. Its ``kind`` attribute
            selects the prefix and ``str(e)`` supplies the detail.

    Returns:
        ``"<prefix>: <detail>"``, or the prefix alone when the error carries
        no non-whitespace message.
    """
    # Kinds not listed here fall back to the generic prefix.
    prefix = {
        "invoke": "模型调用失败",
        "decode": "JSON 解析失败",
        "validation": "结果校验失败",
    }.get(e.kind, "评审失败")
    detail = str(e).strip()
    return f"{prefix}: {detail}" if detail else prefix
|
||||
|
||||
|
||||
def _build_memoir_judge_prompt(
|
||||
*,
|
||||
memoir_markdown: str,
|
||||
@@ -40,7 +62,12 @@ def _build_memoir_judge_prompt(
|
||||
source = (source_transcript or "").strip()
|
||||
reference = (reference_memoir_markdown or "").strip()
|
||||
notes = (evidence_notes or "").strip()
|
||||
sections = [MEMOIR_JUDGE_INSTRUCTIONS, ""]
|
||||
sections = [
|
||||
MEMOIR_JUDGE_INSTRUCTIONS,
|
||||
"",
|
||||
"【证据与输入顺序】以下区块按优先级给出:评审说明(若有)→ 原始访谈证据 → 参考基线(若有)→ 待评成稿。**真实性相关细项必须以原始访谈证据为准。**",
|
||||
"",
|
||||
]
|
||||
if notes:
|
||||
sections.extend(["【评审说明】", notes[:1200], ""])
|
||||
if source:
|
||||
@@ -69,12 +96,16 @@ class EvalJudgeService:
|
||||
prior_transcript: str,
|
||||
user_utterance: str,
|
||||
assistant_reply: str,
|
||||
turn_index_0: int = 0,
|
||||
) -> TurnJudgeOutput | None:
|
||||
if not self._llm:
|
||||
return None
|
||||
t = max(0, int(turn_index_0))
|
||||
prompt = f"""{TURN_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【截至上一轮的对话摘要/节选】
|
||||
【本轮位置】完整对话中当前轮次为 Turn {t + 1}(与下方节选及全量 transcript 的 `[Turn ...]` 编号一致)。evidence_refs.turn_index 请使用该编号。
|
||||
|
||||
【截至上一轮的对话节选】(含 `[Turn k]` 标签)
|
||||
{prior_transcript[:_CONV_MAX]}
|
||||
|
||||
【本轮用户】
|
||||
@@ -95,27 +126,35 @@ class EvalJudgeService:
|
||||
logger.warning("turn judge failed: {}", e)
|
||||
return None
|
||||
|
||||
async def judge_conversation(
|
||||
async def judge_conversation_result(
|
||||
self, *, full_transcript: str
|
||||
) -> ConversationJudgeOutput | None:
|
||||
) -> JudgeCallResult[ConversationJudgeOutput]:
|
||||
if not self._llm:
|
||||
return None
|
||||
return JudgeCallResult(output=None, error="评审模型未配置")
|
||||
prompt = f"""{CONV_JUDGE_INSTRUCTIONS}
|
||||
|
||||
【完整对话】
|
||||
【完整对话】(每轮以 `[Turn k]` 开头)
|
||||
{full_transcript[:_CONV_MAX]}
|
||||
"""
|
||||
try:
|
||||
return await allm_json_call(
|
||||
out = await allm_json_call(
|
||||
self._llm,
|
||||
prompt,
|
||||
ConversationJudgeOutput,
|
||||
max_tokens=_CONV_JUDGE_JSON_MAX,
|
||||
agent="EvalJudgeService.judge_conversation",
|
||||
)
|
||||
return JudgeCallResult(output=out)
|
||||
except LLMCallError as e:
|
||||
logger.warning("conversation judge failed: {}", e)
|
||||
return None
|
||||
error = _judge_error_message(e)
|
||||
logger.warning("conversation judge failed: {}", error)
|
||||
return JudgeCallResult(output=None, error=error)
|
||||
|
||||
async def judge_conversation(
    self, *, full_transcript: str
) -> ConversationJudgeOutput | None:
    """Judge a whole conversation and return only the parsed output.

    Thin wrapper over ``judge_conversation_result`` that drops the error
    message and returns the result's ``output`` field.

    Args:
        full_transcript: The conversation transcript to judge; passed through
            unchanged.

    Returns:
        The ``output`` of the underlying result, which is ``None`` when the
        judge call did not produce one.
    """
    result = await self.judge_conversation_result(full_transcript=full_transcript)
    return result.output
|
||||
|
||||
async def stream_conversation_compare(
|
||||
self,
|
||||
@@ -193,8 +232,24 @@ class EvalJudgeService:
|
||||
reference_memoir_markdown: str = "",
|
||||
evidence_notes: str = "",
|
||||
) -> MemoirJudgeOutput | None:
|
||||
result = await self.judge_memoir_result(
|
||||
memoir_markdown=memoir_markdown,
|
||||
source_transcript=source_transcript,
|
||||
reference_memoir_markdown=reference_memoir_markdown,
|
||||
evidence_notes=evidence_notes,
|
||||
)
|
||||
return result.output
|
||||
|
||||
async def judge_memoir_result(
|
||||
self,
|
||||
*,
|
||||
memoir_markdown: str,
|
||||
source_transcript: str = "",
|
||||
reference_memoir_markdown: str = "",
|
||||
evidence_notes: str = "",
|
||||
) -> JudgeCallResult[MemoirJudgeOutput]:
|
||||
if not self._llm:
|
||||
return None
|
||||
return JudgeCallResult(output=None, error="评审模型未配置")
|
||||
prompt = _build_memoir_judge_prompt(
|
||||
memoir_markdown=memoir_markdown,
|
||||
source_transcript=source_transcript,
|
||||
@@ -202,13 +257,15 @@ class EvalJudgeService:
|
||||
evidence_notes=evidence_notes,
|
||||
)
|
||||
try:
|
||||
return await allm_json_call(
|
||||
out = await allm_json_call(
|
||||
self._llm,
|
||||
prompt,
|
||||
MemoirJudgeOutput,
|
||||
max_tokens=_TURN_MAX,
|
||||
max_tokens=_MEMOIR_JSON_MAX,
|
||||
agent="EvalJudgeService.judge_memoir",
|
||||
)
|
||||
return JudgeCallResult(output=out)
|
||||
except LLMCallError as e:
|
||||
logger.warning("memoir judge failed: {}", e)
|
||||
return None
|
||||
error = _judge_error_message(e)
|
||||
logger.warning("memoir judge failed: {}", error)
|
||||
return JudgeCallResult(output=None, error=error)
|
||||
|
||||
Reference in New Issue
Block a user