feat(eval): internal-eval stack, judge fixes, and eval web overhaul

- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh
  wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching :8001
  when :8000 is already up; document in api/docs/internal-eval.md.
- Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks,
  execution_service and router updates; tests for judge and composite eval.
- Memory: ingest nested transaction for embedding/enrichment rollback safety.
- Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError).
- app-eval-web: Playground saved replays, dialogue turns helper, hash user_id
  for Memoir; Memoir chapter baseline↔DB row compare with title heuristics;
  Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI;
  react-markdown; development proxy and fixture updates.
This commit is contained in:
Kevin
2026-04-07 17:15:01 +08:00
parent a50b72e7b5
commit 99543d04c6
47 changed files with 4968 additions and 1279 deletions

View File

@@ -1,3 +1,71 @@
"""评审服务:保留真实失败原因,便于 internal eval 页面排障。"""
import pytest
from app.core.llm_call import LLMCallError
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import EvalJudgeService
def _conversation_payload() -> dict:
return {
"emotion_carry": 10,
"empathy_depth": 8,
"emotion_safety": 6,
"emotion_guidance": 6,
"fact_mining": 8,
"info_completeness_guide": 8,
"info_depth_mining": 9,
"persona_understanding": 7,
"persona_consistency_verify": 4,
"persona_expression_guide": 4,
"interview_structure": 6,
"context_memory": 5,
"rhythm_control": 4,
"question_quality": 7,
"follow_up_depth": 5,
"non_leading": 3,
"total_score": 100.0,
"rationale": "整体表现稳定。",
}
@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that a failing LLM call surfaces its detail in the result.

    ``allm_json_call`` in the judge service module is replaced with a stub
    that raises ``LLMCallError("validation", ...)``. The returned result must
    carry no output and an error string that contains both the
    validation-failure label and the original detail message.
    """
    patch_target = "app.features.evaluation.judge_service.allm_json_call"
    transcript = "[Turn 1]\n用户: hi\nAI: hello"

    async def _raise_validation_error(*args, **kwargs):
        # Stand-in for the LLM call: always fails with a validation error.
        raise LLMCallError(
            "validation",
            "pydantic validation failed: total_score mismatch",
        )

    monkeypatch.setattr(patch_target, _raise_validation_error)

    judge = EvalJudgeService(object())
    verdict = await judge.judge_conversation_result(full_transcript=transcript)

    assert verdict.output is None
    assert verdict.error is not None
    # Both the label and the underlying detail must be present.
    for fragment in ("结果校验失败", "total_score mismatch"):
        assert fragment in verdict.error
@pytest.mark.asyncio
async def test_judge_conversation_wrapper_keeps_legacy_shape(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check that ``judge_conversation`` returns the stubbed judge output.

    ``allm_json_call`` in the judge service module is replaced with a stub
    returning a pre-validated ``ConversationJudgeOutput``; the wrapper's
    return value must compare equal to that object.
    """
    canned_output = ConversationJudgeOutput.model_validate(_conversation_payload())

    async def _return_canned(*args, **kwargs):
        # Stand-in for the LLM call: always succeeds with the canned output.
        return canned_output

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _return_canned,
    )

    judge = EvalJudgeService(object())
    transcript = "[Turn 1]\n用户: hi\nAI: hello"
    returned = await judge.judge_conversation(full_transcript=transcript)

    assert returned == canned_output
"""Evaluation judge prompt assembly tests."""
from app.features.evaluation.judge_service import _build_memoir_judge_prompt