feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions
--- a/api/tests/test_judge_service.py
+++ b/api/tests/test_judge_service.py
@@ -1,3 +1,71 @@
+"""评审服务：保留真实失败原因，便于 internal eval 页面排障。"""
+
+import pytest
+
+from app.core.llm_call import LLMCallError
+from app.features.evaluation.judge_schemas import ConversationJudgeOutput
+from app.features.evaluation.judge_service import EvalJudgeService
+
+
+def _conversation_payload() -> dict:  # Fixture dict that the wrapper test validates into ConversationJudgeOutput.
+    return {
+        "emotion_carry": 10,
+        "empathy_depth": 8,
+        "emotion_safety": 6,
+        "emotion_guidance": 6,
+        "fact_mining": 8,
+        "info_completeness_guide": 8,
+        "info_depth_mining": 9,
+        "persona_understanding": 7,
+        "persona_consistency_verify": 4,
+        "persona_expression_guide": 4,
+        "interview_structure": 6,
+        "context_memory": 5,
+        "rhythm_control": 4,
+        "question_quality": 7,
+        "follow_up_depth": 5,
+        "non_leading": 3,
+        "total_score": 100.0,  # Equals the sum of the sixteen integer scores above.
+        "rationale": "整体表现稳定。",
+    }
+
+
+@pytest.mark.asyncio
+async def test_judge_conversation_result_preserves_validation_error(  # An LLMCallError raised by the LLM call must surface in result.error.
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    async def _boom(*args, **kwargs):  # Stand-in for allm_json_call that always raises.
+        raise LLMCallError(
+            "validation",
+            "pydantic validation failed: total_score mismatch",
+        )
+
+    monkeypatch.setattr("app.features.evaluation.judge_service.allm_json_call", _boom)  # Replace the name as looked up inside judge_service.
+
+    svc = EvalJudgeService(object())  # Placeholder dependency; presumably unused once the LLM call is patched — TODO confirm.
+    result = await svc.judge_conversation_result(full_transcript="[Turn 1]\n用户: hi\nAI: hello")
+
+    assert result.output is None  # No parsed output is expected on failure.
+    assert result.error is not None
+    assert "结果校验失败" in result.error  # The error text must carry this localized label.
+    assert "total_score mismatch" in result.error  # The original failure detail must be kept.
+
+
+@pytest.mark.asyncio
+async def test_judge_conversation_wrapper_keeps_legacy_shape(  # judge_conversation must return the model produced by the LLM call.
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    expected = ConversationJudgeOutput.model_validate(_conversation_payload())  # Validated model built from the fixture payload.
+
+    async def _ok(*args, **kwargs):  # Stand-in for allm_json_call that returns the prepared model.
+        return expected
+
+    monkeypatch.setattr("app.features.evaluation.judge_service.allm_json_call", _ok)  # Replace the name as looked up inside judge_service.
+
+    svc = EvalJudgeService(object())  # Placeholder dependency; presumably unused once the LLM call is patched — TODO confirm.
+    out = await svc.judge_conversation(full_transcript="[Turn 1]\n用户: hi\nAI: hello")
+
+    assert out == expected  # The returned value equals the model from the patched call.
 """Evaluation judge prompt assembly tests."""

 from app.features.evaluation.judge_service import _build_memoir_judge_prompt