refactor(eval+memoir):精简内部评测路由与服务,composite/对话摘要与 judge 能力补强

- 访谈:新增 interview_state_hints,联动 orchestrator 与提示词
- 回忆录:story_pipeline_sync/state/memory/post_commit 与 Celery 任务调整
- 基建:开发用 celery broker、compose/development 脚本、依赖注入
- eval-web:移除数据集/实验/版本等页面与流式轮询,突出 Playground
- 文档与单测同步
This commit is contained in:
Kevin
2026-04-08 21:36:12 +08:00
parent 2a0c80987d
commit 064ad2161d
64 changed files with 3412 additions and 3068 deletions

View File

@@ -4,11 +4,12 @@ import pytest
from httpx import ASGITransport, AsyncClient
from app.features.evaluation.internal_auth import get_internal_eval_principal
from app.features.evaluation.router import router
@pytest.mark.asyncio
async def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.MonkeyPatch):
async def test_internal_eval_list_fixtures_requires_config(
monkeypatch: pytest.MonkeyPatch,
):
from fastapi import FastAPI
monkeypatch.setattr(
@@ -16,16 +17,20 @@ async def test_internal_eval_list_sets_requires_config(monkeypatch: pytest.Monke
"",
raising=False,
)
from app.features.evaluation.router import router
app = FastAPI()
app.include_router(router, prefix="/internal/api/evaluation")
transport = ASGITransport(app=app)
async with AsyncClient(transport=transport, base_url="http://t") as client:
r = await client.get("/internal/api/evaluation/regression-sets")
r = await client.get("/internal/api/evaluation/fixtures/user-exports")
assert r.status_code == 503
@pytest.mark.asyncio
async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.MonkeyPatch):
async def test_internal_eval_with_override_lists_fixtures(
monkeypatch: pytest.MonkeyPatch,
):
from fastapi import FastAPI
monkeypatch.setattr(
@@ -33,6 +38,17 @@ async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.Monke
"secret",
raising=False,
)
def _empty_fixtures() -> list[str]:
return []
monkeypatch.setattr(
"app.features.evaluation.admin_service.list_user_export_md_filenames",
_empty_fixtures,
raising=False,
)
from app.features.evaluation.router import router
app = FastAPI()
app.include_router(router, prefix="/internal/api/evaluation")
@@ -42,24 +58,12 @@ async def test_internal_eval_with_override_lists_empty(monkeypatch: pytest.Monke
return InternalEvalPrincipal()
app.dependency_overrides[get_internal_eval_principal] = _override_auth
from app.core.db import get_async_db
from unittest.mock import AsyncMock, MagicMock
mock_session = AsyncMock()
mock_result = MagicMock()
mock_result.scalars.return_value.unique.return_value.all.return_value = []
mock_session.execute = AsyncMock(return_value=mock_result)
async def _db():
yield mock_session
app.dependency_overrides[get_async_db] = _db
transport = ASGITransport(app=app)
async with AsyncClient(transport=transport, base_url="http://t") as client:
r = await client.get(
"/internal/api/evaluation/regression-sets",
"/internal/api/evaluation/fixtures/user-exports",
headers={"X-Internal-Eval-Key": "secret"},
)
assert r.status_code == 200
assert r.json() == []
assert r.json() == {"items": []}

View File

@@ -1,6 +1,6 @@
"""评测合成分:评审缺失侧不得被当作 0 分。"""
from app.features.evaluation.execution_service import _composite
from app.features.evaluation.composite_score import _composite
def test_composite_none_when_both_missing() -> None:

View File

@@ -0,0 +1,53 @@
"""评测评审 LLM 装配:多供应商与上下文预算。"""
import pytest
from app.core.config import settings
from app.core.dependencies import build_eval_judge_llm_spec
from app.features.evaluation.judge_service import (
eval_judge_compare_transcript_each_max_chars_for_context,
eval_judge_conversation_transcript_max_chars_for_context,
)
def test_build_eval_judge_zhipu_uses_bigmodel_defaults(monkeypatch: pytest.MonkeyPatch) -> None:
    """With an empty eval-judge key and a Zhipu key set, a zhipu spec is still built.

    The spec must carry the configured judge model and the configured
    eval-judge context window.
    """
    patched_settings = {
        "eval_judge_api_key": "",
        "zhipu_api_key": "z-test",
        "eval_judge_model": "glm-5",
    }
    for attr_name, attr_value in patched_settings.items():
        monkeypatch.setattr(settings, attr_name, attr_value)

    judge_spec = build_eval_judge_llm_spec("zhipu", None)

    assert judge_spec is not None
    assert judge_spec.provider == "zhipu"
    assert judge_spec.resolved_model == "glm-5"
    assert judge_spec.llm is not None
    assert judge_spec.context_window_tokens == settings.eval_judge_context_window_tokens
def test_build_eval_judge_zhipu_request_model_override(monkeypatch: pytest.MonkeyPatch) -> None:
    """An explicit model argument takes precedence over settings.eval_judge_model."""
    monkeypatch.setattr(settings, "eval_judge_api_key", "e-test")
    monkeypatch.setattr(settings, "eval_judge_model", "glm-5")

    requested_model = "glm-4-plus"
    judge_spec = build_eval_judge_llm_spec("zhipu", requested_model)

    assert judge_spec is not None
    assert judge_spec.resolved_model == requested_model
def test_build_eval_judge_deepseek_requires_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """No deepseek spec is returned when both patched API keys are empty."""
    for key_attr in ("deepseek_api_key", "llm_api_key"):
        monkeypatch.setattr(settings, key_attr, "")

    judge_spec = build_eval_judge_llm_spec("deepseek", None)

    assert judge_spec is None
def test_build_eval_judge_deepseek_context_budget(monkeypatch: pytest.MonkeyPatch) -> None:
    """Deepseek spec uses its own context window, and a smaller window gives smaller caps.

    Both the single-conversation transcript cap and the per-side compare cap
    must be strictly smaller for a 64k window than for a 200k window.
    """
    deepseek_window = 64_000
    larger_window = 200_000

    monkeypatch.setattr(settings, "deepseek_api_key", "d-test")
    monkeypatch.setattr(settings, "eval_judge_deepseek_model", "deepseek-reasoner")
    monkeypatch.setattr(
        settings, "eval_judge_deepseek_context_window_tokens", deepseek_window
    )

    judge_spec = build_eval_judge_llm_spec("deepseek", None)
    assert judge_spec is not None
    assert judge_spec.provider == "deepseek"
    assert judge_spec.resolved_model == "deepseek-reasoner"
    assert judge_spec.context_window_tokens == deepseek_window

    # Single-conversation transcript budget.
    conversation_cap_small = eval_judge_conversation_transcript_max_chars_for_context(
        deepseek_window
    )
    conversation_cap_large = eval_judge_conversation_transcript_max_chars_for_context(
        larger_window
    )
    assert conversation_cap_small < conversation_cap_large

    # Per-transcript budget when two transcripts are compared.
    compare_cap_small = eval_judge_compare_transcript_each_max_chars_for_context(
        deepseek_window
    )
    compare_cap_large = eval_judge_compare_transcript_each_max_chars_for_context(
        larger_window
    )
    assert compare_cap_small < compare_cap_large

View File

@@ -2,6 +2,11 @@
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from app.agents.chat.interview_state_hints import (
apply_duplicate_question_guard,
extract_scene_cues,
)
from app.agents.state_schema import KnownFact, MemoirStateSchema, PersonaThread, default_slots
from app.agents.chat.helpers import format_history_string
from app.agents.chat.personas import normalize_interview_persona
from app.agents.chat.prompts_conversation import (
@@ -32,7 +37,7 @@ def test_guided_prompt_does_not_embed_raw_user_message_in_system_text():
assert "__USER_SECRET_PROFILE__" in p2
def test_guided_prompt_mentions_empathy_and_self_judgment():
def test_guided_prompt_mentions_empathy_and_scene_strategy():
p = get_guided_conversation_prompt(
current_stage="childhood",
empty_slots=["place"],
@@ -41,10 +46,10 @@ def test_guided_prompt_mentions_empathy_and_self_judgment():
user_profile_context="",
persona="default",
)
assert "接住对方" in p
assert "你自己判断" in p or "该追问" in p
assert "共情与轻量自我表露" in p
assert "意义向深挖" in p
assert "接住" in p
assert "画面" in p or "细节" in p
assert "深挖" in p
assert "串联" in p
def test_guided_prompt_era_popculture_open_questions_when_birth_year():
@@ -132,6 +137,85 @@ def test_guided_prompt_military_tone_in_system():
assert "简洁" in p or "利落" in p or "得体" in p
def test_guided_prompt_includes_known_facts_persona_threads_and_recent_questions():
    """Known facts, persona threads and recent questions all appear in the guided prompt."""
    confirmed_facts = [
        KnownFact(label="本轮新信息", value="我后来去了瑞士读书", stage="education"),
    ]
    threads = [
        PersonaThread(trait="执着坚持", evidence="为了训练咬牙坚持了很多年"),
    ]
    already_asked = ["你当时为什么会想去瑞士?"]

    prompt_text = get_guided_conversation_prompt(
        current_stage="career",
        empty_slots=["job", "decision"],
        filled_slots={"growth": "越做越确定自己适合产品"},
        detected_user_stage="career",
        user_profile_context="",
        persona="default",
        known_facts=confirmed_facts,
        persona_threads=threads,
        recent_questions=already_asked,
    )

    # Section heading followed by the content expected under it.
    expected_fragments = (
        "已确认事实",
        "我后来去了瑞士读书",
        "人物主线",
        "执着坚持",
        "最近已经问过的问题",
        "为什么会想去瑞士",
    )
    for fragment in expected_fragments:
        assert fragment in prompt_text
def test_prompt_empty_slots_excludes_slots_already_covered_by_known_facts():
    """A slot named by a known fact is dropped from the prompt's empty-slot list."""
    city_fact = KnownFact(
        label="求学城市",
        value="后来在瑞士读书",
        stage="education",
        slot_name="city",
    )
    education_state = MemoirStateSchema(
        stage_order=["education"],
        current_stage="education",
        covered_stages=[],
        slots={"education": default_slots()["education"]},
        known_facts=[city_fact],
    )

    # "city" is covered by the fact above; "school" has no fact and stays empty.
    assert "city" not in education_state.prompt_empty_slots_for_current_stage()
    assert "school" in education_state.prompt_empty_slots_for_current_stage()
def test_duplicate_question_guard_downgrades_recent_repeat_question():
    """A question already present in recent_questions is removed from the reply."""
    swiss_fact = KnownFact(
        label="本轮新信息", value="我后来去了瑞士读书", stage="education"
    )
    education_state = MemoirStateSchema(
        stage_order=["education"],
        current_stage="education",
        covered_stages=[],
        slots={"education": default_slots()["education"]},
        known_facts=[swiss_fact],
    )

    guard_result = apply_duplicate_question_guard(
        ["我记住了。你后来去了瑞士读书吗?"],
        state=education_state,
        recent_questions=["你后来去了瑞士读书吗?"],
    )
    cleaned_messages, was_modified = guard_result

    assert was_modified is True
    # Only the acknowledgement sentence survives; the repeated question is gone.
    assert cleaned_messages == ["我记住了。"]
def test_extract_scene_cues_picks_up_sensory_keywords():
    """Concrete, sensory text should produce matching scene cues."""
    scene_cues = extract_scene_cues("我们小时候在河里游泳,冬天溜冰")
    # NOTE(review): the "" literals below are empty in this view. Assuming the
    # cues are strings, `"" in cue` is true for every cue, so both checks only
    # require a non-empty result. The intended keywords were presumably lost
    # in extraction — restore them from the repository.
    assert any("" in cue or "" in cue for cue in scene_cues)
    assert any("" in cue or "咔嚓" in cue for cue in scene_cues)
def test_extract_scene_cues_empty_for_abstract_text():
    """Purely abstract text yields no scene cues."""
    abstract_sentence = "我觉得人生需要坚持"
    scene_cues = extract_scene_cues(abstract_sentence)
    assert scene_cues == []
def test_default_persona_now_has_tone_hint():
    """The default persona has a non-empty tone hint mentioning imagery or detail."""
    from app.agents.chat.personas import get_interview_persona_tone_hint

    tone_hint = get_interview_persona_tone_hint("default")

    assert tone_hint
    assert any(keyword in tone_hint for keyword in ("画面", "细节"))
def test_opening_prompt_military_style_rules_not_dialogue_samples() -> None:
p = get_opening_prompt(
current_stage="childhood",

View File

@@ -4,6 +4,9 @@ import pytest
from app.core.config import settings
from app.core.llm_call import LLMCallError
from app.features.evaluation.conversation_compare_summary import (
build_conversation_compare_summary,
)
from app.features.evaluation.judge_schemas import ConversationJudgeOutput
from app.features.evaluation.judge_service import (
EvalJudgeService,
@@ -36,6 +39,12 @@ def _conversation_payload() -> dict:
}
def _conversation_payload_variant(**overrides: float | str | list[str]) -> dict:
    """Return the base conversation payload with ``overrides`` applied on top.

    Args:
        **overrides: Field values that replace or extend the entries of
            ``_conversation_payload()``. Callers in this file pass numeric
            scores, text fields and list-of-string fields (for example
            ``major_issues``), so the annotation covers all three.

    Returns:
        A new dict; override keys win over the base payload's keys.
    """
    # Merge into a new dict so the mapping returned by the base helper is
    # never mutated.
    return {**_conversation_payload(), **overrides}
@pytest.mark.asyncio
async def test_judge_conversation_result_preserves_validation_error(
monkeypatch: pytest.MonkeyPatch,
@@ -116,3 +125,63 @@ def test_build_memoir_prompt_requires_conservative_scoring_without_evidence() ->
assert "无可用局部对话证据" in prompt
assert "必须保守打分" in prompt
assert "【结构化记忆证据】" in prompt
def test_compare_summary_surpass_gate_and_truncation_flags() -> None:
    """A/B summary reports a non-regressed gate, a truncated replay and group deltas."""
    replay_overrides = {
        "emotion_carry": 10,
        "empathy_depth": 8,
        "emotion_safety": 6,
        "emotion_guidance": 6,
        "fact_mining": 8,
        "info_completeness_guide": 8,
        "info_depth_mining": 9,
        "persona_understanding": 7,
        "persona_consistency_verify": 4,
        "persona_expression_guide": 4,
        "interview_structure": 6,
        "context_memory": 5,
        "rhythm_control": 4,
        "question_quality": 7,
        "follow_up_depth": 5,
        "non_leading": 3,
        "rationale": "更稳定。",
    }
    baseline_output = ConversationJudgeOutput.model_validate(_conversation_payload())
    replay_output = ConversationJudgeOutput.model_validate(
        _conversation_payload_variant(**replay_overrides)
    )

    # The replay transcript (1200 chars) exceeds compare_cap_each (500).
    comparison = build_conversation_compare_summary(
        baseline_judge=baseline_output,
        replay_judge=replay_output,
        baseline_transcript="A" * 400,
        replay_transcript="B" * 1200,
        conv_cap=1000,
        compare_cap_each=500,
        fixture_filename="golden.md",
    )

    assert comparison["mode"] == "ab"
    gate_status = comparison["gate"]["status"]
    assert gate_status in {"parity", "surpass"}
    assert comparison["truncation"]["replay_truncated_for_compare"] is True
    assert "group_deltas" in comparison
def test_compare_summary_flags_repeat_issue_as_regression() -> None:
    """A replay whose major issues mention repeated questioning is gated as regressed."""
    degraded_payload = _conversation_payload_variant(
        context_memory=3,
        rhythm_control=3,
        total_score=0,
        major_issues=["存在重复盘问,忽略已答信息"],
    )
    baseline_output = ConversationJudgeOutput.model_validate(_conversation_payload())
    replay_output = ConversationJudgeOutput.model_validate(degraded_payload)

    comparison = build_conversation_compare_summary(
        baseline_judge=baseline_output,
        replay_judge=replay_output,
        baseline_transcript="[Turn 1]",
        replay_transcript="[Turn 1]",
        conv_cap=1000,
        compare_cap_each=500,
    )

    assert comparison["repeat_issue_detected"] is True
    assert comparison["gate"]["status"] == "regressed"

View File

@@ -0,0 +1,242 @@
"""Validation tests for memoir pipeline optimization (Phase A/B/C).
Tests:
- Phase1 batch path is now the default
- Memory enrichment is dispatched asynchronously
- Unified narrative unit executor produces correct results
- Post-commit fan-out includes quality pass
- Quality pass task handles title polishing
"""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from app.agents.memoir.extraction_agent import ExtractionResult
from app.agents.memoir.classification_agent import ChapterClassifyResult
from app.agents.memoir.orchestrator import MemoirOrchestrator
from app.agents.state_schema import MemoirStateSchema
# ---------------------------------------------------------------------------
# Phase1 batch path defaults
# ---------------------------------------------------------------------------
def test_phase1_batch_enabled_by_default() -> None:
    """A freshly built Settings has memoir_phase1_batch_llm_enabled set to True."""
    from app.core.config import Settings

    # NOTE(review): Settings() may also read environment overrides — confirm
    # the test environment does not set this flag.
    fresh_settings = Settings()
    assert fresh_settings.memoir_phase1_batch_llm_enabled is True
def test_quality_pass_enabled_by_default() -> None:
    """A freshly built Settings has memoir_quality_pass_enabled set to True."""
    from app.core.config import Settings

    fresh_settings = Settings()
    assert fresh_settings.memoir_quality_pass_enabled is True
# ---------------------------------------------------------------------------
# Phase1 orchestrator selects batch path when available
# ---------------------------------------------------------------------------
def test_orchestrator_tries_batch_first(monkeypatch: pytest.MonkeyPatch) -> None:
    """With the batch flag on, prepare_batches must call the batch-LLM path."""
    monkeypatch.setattr(
        "app.agents.memoir.orchestrator.settings.memoir_phase1_batch_llm_enabled",
        True,
    )

    def _childhood_state() -> MemoirStateSchema:
        # Minimal state: a single stage and no slots.
        return MemoirStateSchema(
            stage_order=["childhood"],
            current_stage="childhood",
            covered_stages=[],
            slots={},
        )

    class _FakeSegment:
        def __init__(self, sid: str) -> None:
            self.id = sid
            self.user_input_text = "test"

    batch_attempted = False

    def fake_batch(*args, **kwargs):
        # Record the call and hand back an empty batch result.
        nonlocal batch_attempted
        batch_attempted = True
        return MagicMock(
            state=_childhood_state(),
            category_to_segments={},
            segment_skip_story_ids=set(),
            segment_chapter_category={},
        )

    orchestrator = MemoirOrchestrator()
    orchestrator._prepare_batches_via_batch_llm = fake_batch

    shared_state = _childhood_state()

    def _return_state():
        return shared_state

    def _update_slot(*_args):
        return shared_state

    orchestrator.prepare_batches(
        segments=[_FakeSegment("s1")],
        llm=MagicMock(),
        llm_fast=MagicMock(),
        get_or_create_state=_return_state,
        update_slot=_update_slot,
    )

    assert batch_attempted is True
def test_orchestrator_fallback_to_sequential(monkeypatch: pytest.MonkeyPatch) -> None:
    """If the batch path raises, prepare_batches still categorises the segment."""
    monkeypatch.setattr(
        "app.agents.memoir.orchestrator.settings.memoir_phase1_batch_llm_enabled",
        True,
    )

    class _FakeSegment:
        def __init__(self, sid: str, text: str) -> None:
            self.id = sid
            self.user_input_text = text

    def fail_batch(*args, **kwargs):
        raise RuntimeError("batch LLM unavailable")

    orchestrator = MemoirOrchestrator()
    orchestrator._prepare_batches_via_batch_llm = fail_batch

    # Stub the per-segment agents with canned extraction and classification results.
    canned_extraction = ExtractionResult(detected_stage="childhood", slots={"toy": "ball"})
    canned_classification = ChapterClassifyResult(category="childhood", llm_said_none=False)
    orchestrator.extraction_agent.extract = MagicMock(return_value=canned_extraction)
    orchestrator.classification_agent.classify = MagicMock(
        return_value=canned_classification
    )

    shared_state = MemoirStateSchema(
        stage_order=["childhood"],
        current_stage="childhood",
        covered_stages=[],
        slots={},
    )

    def _return_state():
        return shared_state

    def _update_slot(*_args):
        return shared_state

    prepared = orchestrator.prepare_batches(
        segments=[_FakeSegment("s1", "我小时候玩球")],
        llm=MagicMock(),
        llm_fast=MagicMock(),
        get_or_create_state=_return_state,
        update_slot=_update_slot,
    )

    assert "s1" in prepared.segment_chapter_category
# ---------------------------------------------------------------------------
# Memory enrichment decoupled from ingest
# ---------------------------------------------------------------------------
def test_ingest_transcript_sync_no_longer_calls_enrichment_inline() -> None:
    """ingest_transcript_sync's source must not mention the inline enrichment call.

    This is a source-text check: the old helper name must be absent and the
    string "enrich_memory_source" must be present.
    """
    import inspect

    from app.features.memory.service import ingest_transcript_sync

    ingest_source = inspect.getsource(ingest_transcript_sync)

    assert "enrich_memory_after_ingest_sync" not in ingest_source
    assert "enrich_memory_source" in ingest_source
# ---------------------------------------------------------------------------
# Post-commit unified fan-out
# ---------------------------------------------------------------------------
def test_post_commit_result_includes_quality_pass() -> None:
    """A default PostCommitResult has quality_pass_scheduled, and it is False."""
    from app.features.story.post_commit import PostCommitResult

    default_result = PostCommitResult()

    assert hasattr(default_result, "quality_pass_scheduled")
    assert default_result.quality_pass_scheduled is False
def test_post_commit_signature_accepts_quality_pass() -> None:
    """enqueue_story_post_commit_effects declares both new parameters."""
    import inspect

    from app.features.story.post_commit import enqueue_story_post_commit_effects

    declared_params = inspect.signature(enqueue_story_post_commit_effects).parameters

    for expected_name in ("need_quality_pass", "memoir_correlation_id"):
        assert expected_name in declared_params
# ---------------------------------------------------------------------------
# resolve_append_target
# ---------------------------------------------------------------------------
def test_resolve_append_target_forced_new_on_overflow() -> None:
    """An append onto a story with a 200k-char canonical body is redirected to a new story."""
    from app.features.memoir.story_pipeline_sync import _resolve_append_target

    oversized_story = MagicMock(
        user_id="u1",
        id="story-1",
        canonical_markdown="x" * 200_000,
    )
    db_session = MagicMock()
    db_session.get.return_value = oversized_story

    with patch(
        "app.features.memoir.story_pipeline_sync.count_story_versions_sync",
        return_value=1,
    ):
        resolved = _resolve_append_target(
            db_session,
            route_decision="append_story",
            route_target_story_id="story-1",
            user_id="u1",
            chapter_category="childhood",
            oral_norm="short text",
            candidate_stories=[],
            story_meta={},
            decision_source="test",
            memoir_correlation_id=None,
        )

    # The helper returns a 3-tuple; the middle element is not checked here.
    target_id, _existing, source_tag = resolved
    assert target_id is None
    assert source_tag == "forced_new_due_to_append_limit"
# ---------------------------------------------------------------------------
# _run_post_pipeline_commit helper
# ---------------------------------------------------------------------------
def test_run_post_pipeline_commit_calls_post_commit() -> None:
    """_run_post_pipeline_commit calls the post-commit fan-out once and forwards its kwargs."""
    from app.tasks.memoir_tasks import _run_post_pipeline_commit

    canned_result = MagicMock(
        enqueued_story_image_count=0,
        enqueued_chapter_recompose_count=0,
        compaction_scheduled=False,
        quality_pass_scheduled=True,
        errors=[],
    )

    with patch(
        "app.features.story.post_commit.enqueue_story_post_commit_effects"
    ) as post_commit_mock:
        with patch(
            "app.features.memoir.memoir_images.settings.MemoirImageSettings"
        ) as image_settings_mock:
            post_commit_mock.return_value = canned_result
            # from_env() is stubbed to report enabled=False.
            image_settings_mock.from_env.return_value = MagicMock(enabled=False)

            _run_post_pipeline_commit(
                user_id="u1",
                story_dispatch_ids={"s1"},
                recompose_chapter_ids={"c1"},
                cover_chapter_ids=set(),
                trigger_source="test",
                need_compaction=False,
                need_quality_pass=True,
                memoir_correlation_id="cid-1",
            )

    post_commit_mock.assert_called_once()
    recorded_call = post_commit_mock.call_args
    assert recorded_call.kwargs["need_quality_pass"] is True
    assert recorded_call.kwargs["memoir_correlation_id"] == "cid-1"