feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
Kevin
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions

View File

@@ -15,6 +15,13 @@ from typing import Any, Callable, Literal, TypeVar
from pydantic import BaseModel, ValidationError
try:
from openai import (
ContentFilterFinishReasonError as _OpenAIContentFilterFinishReasonError,
)
except ImportError: # 兼容性:旧版 SDK 无此类
_OpenAIContentFilterFinishReasonError = None
from app.core.agent_logging import agent_verbose_enabled, log_agent_payload
from app.core.json_utils import extract_json_payload
from app.core.langchain_llm import (
@@ -61,6 +68,57 @@ def _prompt_sha12(prompt: str) -> str:
return hashlib.sha256((prompt or "").encode("utf-8")).hexdigest()[:12]
def _iter_exception_chain(exc: BaseException):
"""包含自身与 ``__cause__`` / ``__context__`` 链,去重防环。"""
seen: set[int] = set()
cur: BaseException | None = exc
while cur is not None and id(cur) not in seen:
yield cur
seen.add(id(cur))
cur = cur.__cause__ or cur.__context__
def _is_content_filter_refusal(exc: BaseException) -> bool:
    """Return True when ``exc`` or anything in its chain is a content-filter block.

    Such a failure (OpenAI / Azure style content moderation) carries no model
    JSON to parse. It is an expected outcome, so callers should not log it as
    an ERROR with a stack trace.

    Args:
        exc: The exception raised by the LLM invocation.

    Returns:
        True if any exception in the chain is an instance of the OpenAI
        content-filter error class (when the SDK provides it) or has a
        message containing "content filter" (case-insensitive).
    """
    filter_error_cls = _OpenAIContentFilterFinishReasonError
    for link in _iter_exception_chain(exc):
        if filter_error_cls is not None and isinstance(link, filter_error_cls):
            return True
        # The previous version also required "reject" / "blocked" / "filter"
        # in the message, but "filter" is always present once
        # "content filter" matched, so that clause never rejected anything.
        # Only the check that actually decides the result is kept.
        if "content filter" in str(link).lower():
            return True
    return False
# Fixed, user-facing (Chinese) message used in place of the raw exception text
# when an LLM call was blocked by the provider's content filter.
# Rough translation: "The model output was blocked by the provider's
# content-safety policy (content filter), usually because the prompt or
# context triggered a compliance scan. Try switching models, shortening the
# body / evidence excerpts sent to the model, or adjusting the content-filter
# policy in the provider console."
_LLM_MSG_CONTENT_FILTER = (
"模型输出被服务商内容安全策略拦截content filter通常与提示或上下文中触发了合规扫描有关"
"可尝试更换模型、缩短送入模型的正文/证据节选,或在服务商控制台调整内容过滤策略。"
)
def _format_llm_invoke_error_message(exc: BaseException) -> str:
    """Build the message text reported for a failed LLM invocation.

    Args:
        exc: The exception raised by the invocation.

    Returns:
        The fixed content-filter explanation when ``exc`` is recognised as a
        content-filter refusal, otherwise ``str(exc)``.
    """
    return _LLM_MSG_CONTENT_FILTER if _is_content_filter_refusal(exc) else str(exc)
def _log_invoke_failure(*, agent: str, exc: BaseException, sync: bool) -> None:
    """Log a failed LLM invocation at a level matching how expected it is.

    Content-filter refusals are logged as a single INFO event line with the
    exception text truncated to 500 characters. Every other failure goes
    through ``logger.exception`` with the agent bound to the record.

    Args:
        agent: Name of the calling agent, included in the log output.
        exc: The exception raised by the invocation.
        sync: True for the synchronous entry point, False for the async one;
            this selects the tag used in the error message.
    """
    if not _is_content_filter_refusal(exc):
        call_name = "llm_json_call" if sync else "allm_json_call"
        logger.bind(agent=agent).exception("{} invoke error: {}", call_name, exc)
        return

    detail = str(exc)[:500]
    logger.info(
        "event=llm_content_filter_blocked agent={} sync={} detail={}",
        agent,
        sync,
        detail,
    )
def _invoke_raw_sync(
llm: Any,
prompt: str,
@@ -272,7 +330,7 @@ def llm_json_call(
return fallback_factory()
raise
except Exception as e:
logger.bind(agent=agent).exception("llm_json_call invoke error: {}", e)
_log_invoke_failure(agent=agent, exc=e, sync=True)
used_fb = fallback_factory is not None
_emit_meta(
agent=agent,
@@ -295,7 +353,7 @@ def llm_json_call(
return fallback_factory()
raise LLMCallError(
"invoke",
str(e),
_format_llm_invoke_error_message(e),
raw_content=raw[:4096] if raw else None,
) from e
@@ -366,7 +424,7 @@ async def allm_json_call(
return fallback_factory()
raise
except Exception as e:
logger.bind(agent=agent).exception("allm_json_call invoke error: {}", e)
_log_invoke_failure(agent=agent, exc=e, sync=False)
used_fb = fallback_factory is not None
_emit_meta(
agent=agent,
@@ -389,7 +447,7 @@ async def allm_json_call(
return fallback_factory()
raise LLMCallError(
"invoke",
str(e),
_format_llm_invoke_error_message(e),
raw_content=raw[:4096] if raw else None,
) from e