- Merge internal-eval into development.sh (single Celery/infra); internal-eval.sh wraps with LIFE_ECHO_WITH_INTERNAL_EVAL; EVAL_ATTACH_ONLY for attaching 8001 when :8000 is already up; document in api/docs/internal-eval.md. - Evaluation: transcript_for_judge, judge error surfacing, rubric/schema tweaks, execution_service and router updates; tests for judge and composite eval. - Memory: ingest nested transaction for embedding/enrichment rollback safety. - Conversation WS: logger.exception for pipeline errors (avoid loguru KeyError). - app-eval-web: Playground saved replays, dialogue turns helper, hash user_id for Memoir; Memoir chapter baseline↔DB row compare with title heuristics; Stories page (#memoir-stories); Markdown + copy buttons; toolbar/panel UI; react-markdown; development proxy and fixture updates.
96 lines
3.1 KiB
Python
96 lines
3.1 KiB
Python
"""评审服务:保留真实失败原因,便于 internal eval 页面排障。"""
|
|
|
|
import pytest
|
|
|
|
from app.core.llm_call import LLMCallError
|
|
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
|
|
from app.features.evaluation.judge_service import EvalJudgeService
|
|
|
|
|
|
def _conversation_payload() -> dict:
|
|
return {
|
|
"emotion_carry": 10,
|
|
"empathy_depth": 8,
|
|
"emotion_safety": 6,
|
|
"emotion_guidance": 6,
|
|
"fact_mining": 8,
|
|
"info_completeness_guide": 8,
|
|
"info_depth_mining": 9,
|
|
"persona_understanding": 7,
|
|
"persona_consistency_verify": 4,
|
|
"persona_expression_guide": 4,
|
|
"interview_structure": 6,
|
|
"context_memory": 5,
|
|
"rhythm_control": 4,
|
|
"question_quality": 7,
|
|
"follow_up_depth": 5,
|
|
"non_leading": 3,
|
|
"total_score": 100.0,
|
|
"rationale": "整体表现稳定。",
|
|
}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A validation failure from the LLM call must surface in ``result.error``.

    ``allm_json_call`` inside the judge service module is replaced with a stub
    that always raises ``LLMCallError``. The result returned by
    ``judge_conversation_result`` is then expected to carry no output and an
    error string containing both the "结果校验失败" marker and the original
    failure detail.
    """

    async def _raise_validation_failure(*_args, **_kwargs):
        # Stand-in for the real LLM call: fail regardless of the arguments.
        raise LLMCallError(
            "validation",
            "pydantic validation failed: total_score mismatch",
        )

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _raise_validation_failure,
    )

    # A bare object() is passed as the constructor argument; presumably the
    # dependency is not touched on this code path. TODO confirm.
    judge = EvalJudgeService(object())
    verdict = await judge.judge_conversation_result(
        full_transcript="[Turn 1]\n用户: hi\nAI: hello"
    )

    assert verdict.output is None
    assert verdict.error is not None
    assert "结果校验失败" in verdict.error
    assert "total_score mismatch" in verdict.error
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_judge_conversation_wrapper_keeps_legacy_shape(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``judge_conversation`` must hand back the judge output object itself.

    ``allm_json_call`` inside the judge service module is replaced with a stub
    that returns a pre-validated ``ConversationJudgeOutput``. The value
    returned by ``judge_conversation`` is then compared for equality with
    that same object.
    """
    canned_output = ConversationJudgeOutput.model_validate(_conversation_payload())

    async def _return_canned(*_args, **_kwargs):
        # Stand-in for the real LLM call: always succeed with the canned model.
        return canned_output

    monkeypatch.setattr(
        "app.features.evaluation.judge_service.allm_json_call",
        _return_canned,
    )

    # A bare object() is passed as the constructor argument; presumably the
    # dependency is not touched on this code path. TODO confirm.
    judge = EvalJudgeService(object())
    returned = await judge.judge_conversation(
        full_transcript="[Turn 1]\n用户: hi\nAI: hello"
    )

    assert returned == canned_output
|
|
"""Evaluation judge prompt assembly tests."""
|
|
|
|
from app.features.evaluation.judge_service import _build_memoir_judge_prompt
|
|
|
|
|
|
def test_build_memoir_prompt_includes_source_and_reference_evidence() -> None:
    """The memoir prompt must include every evidence section when supplied.

    The builder is called with the memoir text, a source transcript, a
    reference memoir and evidence notes. The resulting prompt is expected to
    contain the four section headers and the transcript text verbatim.
    """
    transcript = "用户: 我后来去了深圳工作。"

    prompt = _build_memoir_judge_prompt(
        memoir_markdown="# 当前正文\n他后来去了南方。",
        source_transcript=transcript,
        reference_memoir_markdown="# 导出基线\n他去了深圳。",
        evidence_notes="必须严格核对真实性。",
    )

    # Checked in the same order as the individual assertions they replace.
    expected_fragments = (
        "【评审说明】",
        "【原始访谈/证据】",
        transcript,
        "【参考基线/导出成稿】",
        "【当前回忆录正文】",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
|
|
|
|
|
|
def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() -> None:
    """With only the memoir text given, the prompt must demand cautious scoring.

    The builder is called with ``memoir_markdown`` alone. The resulting prompt
    is expected to contain both the "无可用原始访谈证据" notice and the
    "必须保守打分" instruction.
    """
    memoir_only = "# 当前正文\n他后来去了南方。"

    prompt = _build_memoir_judge_prompt(memoir_markdown=memoir_only)

    for fragment in ("无可用原始访谈证据", "必须保守打分"):
        assert fragment in prompt
|