feat(eval): memoir A/B chapter judging and eval-web parity with dialogue

- Judge baseline excerpt and library chapter separately; build_memoir_compare_summary for gate, nine-dim and leaf deltas.

- Memoir SSE chapter payload: baseline_judge, compare_summary, baseline_judge_error.

- MemoirJudgeOutput: loose score coercion and post-validate clamp; memoir judge prompt caps from settings.

- app-eval-web: two-column MemoirScoreCard layout, MemoirCompareSummary, chapter blocks and CSS.

- Add memoir_compare_summary, log_events, celery_log_context, memoir_pipeline_progress; tests and migration 0014.

- Misc: memory/evidence and enrichment paths, task/orchestrator updates, internal-eval docs, env examples.
This commit is contained in:
Kevin
2026-04-10 10:23:43 +08:00
parent b0251e5b26
commit ac49bc7f23
59 changed files with 4773 additions and 696 deletions

View File

@@ -0,0 +1,101 @@
"""GET /users/{user_id}/memoir-pipeline-run: snapshot read."""
import pytest
from httpx import ASGITransport, AsyncClient
from app.features.evaluation.internal_auth import get_internal_eval_principal
@pytest.mark.asyncio
async def test_memoir_pipeline_run_ok_by_phase1_task(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Querying by ``phase1_task_id`` returns the stubbed snapshot with HTTP 200."""
    from fastapi import FastAPI

    monkeypatch.setattr(
        "app.core.config.settings.internal_eval_api_key",
        "secret",
        raising=False,
    )
    # The router is imported only after the API key setting has been patched.
    from app.features.evaluation.router import router

    def _fake_eval(user_id: str, **kwargs: object):
        # Stand-in for get_pipeline_run_for_eval: checks the arguments the
        # route forwards and hands back a fixed snapshot document.
        assert user_id == "u1"
        assert kwargs.get("phase1_task_id") == "tid-z"
        phase1_state = {"task_id": "tid-z", "status": "running", "step": "started"}
        fanout_state = {
            "story_images": [],
            "recompose_chapters": [],
            "memory_enrichment": [],
            "quality_pass": None,
            "compaction": None,
        }
        return {
            "memoir_correlation_id": "cid-z",
            "user_id": "u1",
            "started_at_utc": "2026-04-09T00:00:00Z",
            "phase1": phase1_state,
            "phase2": [],
            "fanout": fanout_state,
        }

    monkeypatch.setattr(
        "app.features.evaluation.router.get_pipeline_run_for_eval",
        _fake_eval,
    )

    api = FastAPI()
    api.include_router(router, prefix="/internal/api/evaluation")

    async def _override_auth():
        from app.features.evaluation.internal_auth import InternalEvalPrincipal

        return InternalEvalPrincipal()

    # Replace the auth dependency with one that always yields a principal.
    api.dependency_overrides[get_internal_eval_principal] = _override_auth

    async with AsyncClient(
        transport=ASGITransport(app=api),
        base_url="http://t",
    ) as http:
        response = await http.get(
            "/internal/api/evaluation/users/u1/memoir-pipeline-run",
            headers={"X-Internal-Eval-Key": "secret"},
            params={"phase1_task_id": "tid-z"},
        )

    assert response.status_code == 200
    payload = response.json()
    assert payload["memoir_correlation_id"] == "cid-z"
    assert payload["phase1"]["task_id"] == "tid-z"
@pytest.mark.asyncio
async def test_memoir_pipeline_run_400_both_ids(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Passing both ``phase1_task_id`` and ``memoir_correlation_id`` yields HTTP 400."""
    from fastapi import FastAPI

    monkeypatch.setattr(
        "app.core.config.settings.internal_eval_api_key",
        "secret",
        raising=False,
    )
    # The router is imported only after the API key setting has been patched.
    from app.features.evaluation.router import router

    api = FastAPI()
    api.include_router(router, prefix="/internal/api/evaluation")

    async def _override_auth():
        from app.features.evaluation.internal_auth import InternalEvalPrincipal

        return InternalEvalPrincipal()

    # Replace the auth dependency with one that always yields a principal.
    api.dependency_overrides[get_internal_eval_principal] = _override_auth

    conflicting_query = {
        "phase1_task_id": "a",
        "memoir_correlation_id": "b",
    }
    async with AsyncClient(
        transport=ASGITransport(app=api),
        base_url="http://t",
    ) as http:
        response = await http.get(
            "/internal/api/evaluation/users/u1/memoir-pipeline-run",
            headers={"X-Internal-Eval-Key": "secret"},
            params=conflicting_query,
        )

    assert response.status_code == 400

View File

@@ -0,0 +1,73 @@
"""agent_logging: DEBUG 下载荷、hash_only、去重。"""
from __future__ import annotations
import app.core.agent_logging as agent_logging
class _StubLogger:
def __init__(self) -> None:
self.debug_calls: list[tuple[str, tuple[object, ...]]] = []
def debug(self, msg: str, *args: object, **kwargs: object) -> None:
self.debug_calls.append((msg, args))
def _clear_dedup() -> None:
    """Empty agent_logging's per-label prompt-hash map while holding its lock."""
    dedup_lock = agent_logging._dedup_lock
    with dedup_lock:
        agent_logging._last_prompt_sha256_by_label.clear()
def test_log_agent_payload_skips_when_not_verbose(monkeypatch: object) -> None:
    """With ``log_level`` set to INFO, no debug record is emitted."""
    monkeypatch.setattr("app.core.config.settings.log_level", "INFO")
    recorder = _StubLogger()

    agent_logging.log_agent_payload(recorder, "x.prompt", "hello")

    assert not recorder.debug_calls
def test_log_agent_payload_preview_includes_sha12(monkeypatch: object) -> None:
    """Preview mode emits one record whose message carries ``sha12=`` and whose args include the text."""
    setting_overrides = (
        ("app.core.config.settings.log_level", "DEBUG"),
        ("app.core.config.settings.agent_log_prompt_mode", "preview"),
        ("app.core.config.settings.agent_log_prompt_dedup", False),
        ("app.core.config.settings.agent_log_max_chars", 100),
    )
    for target, value in setting_overrides:
        monkeypatch.setattr(target, value)
    _clear_dedup()
    recorder = _StubLogger()

    agent_logging.log_agent_payload(recorder, "Unit.prompt", "hello world")

    assert len(recorder.debug_calls) == 1
    template, params = recorder.debug_calls[0]
    for marker in ("agent_payload", "sha12="):
        assert marker in template
    assert params[0] == "Unit.prompt"
    assert params[4] == "hello world"
def test_log_agent_payload_hash_only_no_preview(monkeypatch: object) -> None:
    """hash_only mode emits one record with the label, the text length and a 12-char string."""
    setting_overrides = (
        ("app.core.config.settings.log_level", "DEBUG"),
        ("app.core.config.settings.agent_log_prompt_mode", "hash_only"),
        ("app.core.config.settings.agent_log_prompt_dedup", False),
    )
    for target, value in setting_overrides:
        monkeypatch.setattr(target, value)
    _clear_dedup()
    recorder = _StubLogger()
    long_text = "x" * 500

    agent_logging.log_agent_payload(recorder, "Unit.prompt", long_text)

    assert len(recorder.debug_calls) == 1
    template, params = recorder.debug_calls[0]
    assert "mode=hash_only" in template
    assert params[0] == "Unit.prompt"
    assert params[1] == 500
    digest = params[2]
    assert isinstance(digest, str)
    assert len(digest) == 12
def test_log_agent_payload_dedup_second_call_skipped(monkeypatch: object) -> None:
    """With dedup on, repeating the same label and text logs an ``agent_payload_skipped`` record."""
    setting_overrides = (
        ("app.core.config.settings.log_level", "DEBUG"),
        ("app.core.config.settings.agent_log_prompt_mode", "preview"),
        ("app.core.config.settings.agent_log_prompt_dedup", True),
        ("app.core.config.settings.agent_log_max_chars", 200),
    )
    for target, value in setting_overrides:
        monkeypatch.setattr(target, value)
    _clear_dedup()
    recorder = _StubLogger()

    # Two identical submissions: the second one should be reported as skipped.
    for _ in range(2):
        agent_logging.log_agent_payload(recorder, "DedupLabel.prompt", "same text")

    assert len(recorder.debug_calls) == 2
    skipped_template, skipped_params = recorder.debug_calls[1]
    assert "agent_payload_skipped" in skipped_template
    assert skipped_params[0] == "DedupLabel.prompt"
    assert skipped_params[2] == len("same text")

View File

@@ -138,6 +138,24 @@ def test_memoir_judge_coerces_string_lists_from_llm() -> None:
assert m.insufficient_evidence == []
def test_memoir_judge_clamps_leaf_scores_over_max_from_llm() -> None:
    """A leaf score slightly over its maximum (e.g. rich_diversity=2.5) is clamped to the rubric cap.

    This keeps validation of the whole payload from failing on one overshoot.
    """
    leaves = _full_memoir_leaves_max()
    leaves["rich_diversity"] = 2.5

    # Leaf scores first, then the non-leaf fields layered on top.
    judge_payload = dict(leaves)
    judge_payload.update(
        {
            "total_score": 100.0,
            "rationale": "",
            "major_strengths": [],
            "major_issues": [],
            "insufficient_evidence": [],
            "evidence_refs": [],
        }
    )

    judged = MemoirJudgeOutput.model_validate(judge_payload)

    assert judged.rich_diversity == 2.0
def test_conversation_judge_meta_fields_default() -> None:
leaves = {
"emotion_carry": 10,

View File

@@ -0,0 +1,62 @@
"""log_events: format_log_event and celery_prerun_extras."""
from __future__ import annotations
from app.core.log_events import (
celery_prerun_extras,
correlation_bind_kwargs,
format_log_event,
)
def test_format_log_event_msg_last() -> None:
    """The line starts with ``event=<name>``, ends with the ``msg`` field and contains the other fields."""
    line = format_log_event(
        "demo",
        z_last=1,
        a_first="x",
        msg="你好 世界",
    )

    assert line.startswith("event=demo ")
    assert line.endswith(" msg=你好 世界")
    for fragment in ("a_first=x", "z_last=1"):
        assert fragment in line
def test_format_log_event_skips_empty() -> None:
    """Fields whose value is ``""`` or ``None`` are left out; other fields are kept."""
    line = format_log_event("x", empty="", none_val=None, ok=5)

    for dropped in ("empty=", "none_val="):
        assert dropped not in line
    assert "ok=5" in line
def test_format_log_event_float() -> None:
    """A float field value of 12.3456 shows up as text beginning ``duration_ms=12.3``."""
    rendered = format_log_event("t", duration_ms=12.3456)
    expected_fragment = "duration_ms=12.3"
    assert expected_fragment in rendered
def test_correlation_bind_kwargs() -> None:
    """``memoir_correlation_id`` comes back under the ``correlation_id`` key next to ``user_id``."""
    bound = correlation_bind_kwargs(
        user_id="u1",
        memoir_correlation_id="c1",
    )
    expected = {"user_id": "u1", "correlation_id": "c1"}
    assert bound == expected
def test_celery_prerun_extras_from_kwargs() -> None:
    """For enrich_memory_source, positional args and the correlation kwarg all appear in the extras."""
    task_name = "app.tasks.memory_enrichment_tasks.enrich_memory_source"
    positional = ("uid", "sid")
    keyword = {"memoir_correlation_id": "mc"}

    extras = celery_prerun_extras(task_name, positional, keyword)

    assert extras["user_id"] == "uid"
    assert extras["source_id"] == "sid"
    assert extras["correlation_id"] == "mc"
def test_celery_prerun_extras_positional_only() -> None:
    """For recompose_chapter with one positional arg, the extras hold only ``chapter_id``."""
    task_name = "app.tasks.chapter_compose_tasks.recompose_chapter"

    extras = celery_prerun_extras(task_name, ("chap-1",), {})

    assert extras == {"chapter_id": "chap-1"}

View File

@@ -144,7 +144,7 @@ def test_ingest_transcript_sync_no_longer_calls_enrichment_inline() -> None:
source = inspect.getsource(ingest_transcript_sync)
assert "enrich_memory_after_ingest_sync" not in source
assert "enrich_memory_source" in source
assert "schedule_memory_enrichment" in source
# ---------------------------------------------------------------------------

View File

@@ -0,0 +1,105 @@
"""memoir_pipeline_progress: merge and read logic (fake Redis client)."""
import json
import pytest
import app.core.memoir_pipeline_progress as mpp
class _FakeRedis:
def __init__(self) -> None:
self.store: dict[str, str] = {}
def get(self, key: str) -> str | None:
return self.store.get(key)
def setex(self, key: str, _ttl: int, value: str) -> None:
self.store[key] = value
@pytest.fixture
def fake_redis(monkeypatch: pytest.MonkeyPatch) -> _FakeRedis:
    """Patch ``mpp._client`` with a fresh ``_FakeRedis`` and return that instance."""
    stub_client = _FakeRedis()
    monkeypatch.setattr(mpp, "_client", stub_client)
    return stub_client
def test_merge_pipeline_run_creates_doc(fake_redis: _FakeRedis) -> None:
    """The first merge for a correlation id writes a JSON document under its key."""
    mpp.merge_pipeline_run("cid-1", {"phase1": {"step": "memory_ingest"}})

    serialized = fake_redis.store.get("memoir_pipeline_run:cid-1")
    assert serialized
    stored = json.loads(serialized)
    assert stored["memoir_correlation_id"] == "cid-1"
    assert stored["phase1"]["step"] == "memory_ingest"
def test_merge_phase2_merges_by_task_id(fake_redis: _FakeRedis) -> None:
    """A second phase2 patch with the same ``task_id`` updates the entry rather than adding one."""
    enqueued_patch = {
        "phase2": [
            {"chapter_category": "a", "task_id": "t1", "status": "enqueued"},
        ],
    }
    running_patch = {"phase2": [{"task_id": "t1", "status": "running"}]}
    for patch_doc in (enqueued_patch, running_patch):
        mpp.merge_pipeline_run("cid-2", patch_doc)

    stored = json.loads(fake_redis.store["memoir_pipeline_run:cid-2"])
    phase2_entries = stored["phase2"]
    assert len(phase2_entries) == 1
    merged_entry = phase2_entries[0]
    assert merged_entry["task_id"] == "t1"
    assert merged_entry["status"] == "running"
    # A field present only in the first patch is still there after the merge.
    assert merged_entry["chapter_category"] == "a"
def test_merge_fanout_lists_merge_by_id(fake_redis: _FakeRedis) -> None:
    """A second story_images patch with the same ``story_id`` updates the entry in place."""
    enqueued_patch = {
        "fanout": {
            "story_images": [
                {"story_id": "s1", "task_id": "img1", "status": "enqueued"},
            ],
        },
    }
    success_patch = {
        "fanout": {
            "story_images": [
                {"story_id": "s1", "status": "success"},
            ],
        },
    }
    for patch_doc in (enqueued_patch, success_patch):
        mpp.merge_pipeline_run("cid-3", patch_doc)

    stored = json.loads(fake_redis.store["memoir_pipeline_run:cid-3"])
    image_entries = stored["fanout"]["story_images"]
    assert len(image_entries) == 1
    merged_entry = image_entries[0]
    # task_id came only from the first patch, status from the second.
    assert merged_entry["task_id"] == "img1"
    assert merged_entry["status"] == "success"
def test_init_and_index_resolve(fake_redis: _FakeRedis) -> None:
    """After init, the phase1 task id resolves to its correlation id and the snapshot is readable."""
    mpp.init_pipeline_run_from_phase1(
        "user-a", "cid-4", "p1tid", segment_count=3
    )

    resolved_cid = mpp.resolve_correlation_id_for_phase1_task("p1tid")
    assert resolved_cid == "cid-4"

    snapshot = mpp.get_pipeline_run_for_eval(
        "user-a", phase1_task_id="p1tid"
    )
    assert snapshot is not None
    assert snapshot["user_id"] == "user-a"
    assert snapshot["phase1"]["task_id"] == "p1tid"
def test_get_pipeline_run_for_eval_user_mismatch(fake_redis: _FakeRedis) -> None:
    """Reading a run with a different user id than the one it was created for returns ``None``."""
    mpp.init_pipeline_run_from_phase1(
        "user-a", "cid-5", "p1b", segment_count=1
    )

    snapshot = mpp.get_pipeline_run_for_eval("other", phase1_task_id="p1b")

    assert snapshot is None

View File

@@ -0,0 +1,116 @@
"""Baseline memory enrichment: single LLM call → session summary + facts."""
from __future__ import annotations
from types import SimpleNamespace
import pytest
from app.features.memory.enrichment import enrich_memory_after_ingest_sync
from app.features.memory.llm_schemas import EnrichmentPayload, parse_json_payload
from app.features.memory.models import MemorySource
from app.features.user.models import User
def test_enrichment_payload_roundtrip() -> None:
    """A JSON string with a summary and one fact parses into an ``EnrichmentPayload``."""
    json_pieces = [
        '{"summary":"要点摘要",',
        '"facts":[{"fact_type":"event","subject":"王伟","predicate":"",',
        '"object_json":{"value":"北京","approximate_era":"1990年代"},',
        '"confidence":0.85,"source_chunk_id":"ch-1"}]}',
    ]
    raw_json = "".join(json_pieces)

    parsed = parse_json_payload(raw_json, EnrichmentPayload)

    assert parsed is not None
    assert parsed.summary == "要点摘要"
    assert len(parsed.facts) == 1
    only_fact = parsed.facts[0]
    assert only_fact.subject == "王伟"
def test_enrich_memory_after_ingest_sync_single_llm_call(monkeypatch: pytest.MonkeyPatch) -> None:
    """One enrichment run calls the LLM once and records one session summary and one fact."""
    from app.features.memory import enrichment as mod

    monkeypatch.setattr("app.core.config.settings.memory_enrichment_enabled", True)

    seen_agents: list[str] = []
    recorded_summaries: list[dict] = []
    recorded_facts: list[dict] = []

    def fake_invoke(llm, prompt, max_tokens, agent):
        # Count the call first, then check which agent label was used.
        seen_agents.append(agent)
        assert agent == "memory.enrichment_sync"
        return "".join(
            [
                '{"summary":"本轮要点",',
                '"facts":[{"fact_type":"event","subject":"王伟","predicate":"",',
                '"object_json":{"value":"上海"},"confidence":0.8,"source_chunk_id":"ch1"}]}',
            ]
        )

    def fake_list_chunks(s, sid):
        return [SimpleNamespace(id="ch1", content="王伟住在上海。")]

    def capture_summary(session, **kwargs):
        recorded_summaries.append(kwargs)

    def capture_fact(session, **kwargs):
        recorded_facts.append(kwargs)

    monkeypatch.setattr(mod, "invoke_json_object", fake_invoke)
    monkeypatch.setattr(mod, "list_chunks_for_source_sync", fake_list_chunks)
    monkeypatch.setattr(mod, "create_memory_summary_sync", capture_summary)
    monkeypatch.setattr(mod, "create_memory_fact_sync", capture_fact)

    class FakeSession:
        """Session double that answers ``get`` for one user row and one source row."""

        def get(self, model, key):
            if model is User and key == "u1":
                return SimpleNamespace(nickname="老王")
            if model is MemorySource and key == "src-1":
                return SimpleNamespace(lineage_json=None)
            return None

    enrich_memory_after_ingest_sync(FakeSession(), "u1", "src-1", llm=object())

    assert len(seen_agents) == 1

    assert len(recorded_summaries) == 1
    summary_kwargs = recorded_summaries[0]
    assert summary_kwargs["summary_type"] == "session"
    assert summary_kwargs["content"] == "本轮要点"
    assert summary_kwargs["source_chunk_ids"] == ["ch1"]

    assert len(recorded_facts) == 1
    fact_kwargs = recorded_facts[0]
    assert fact_kwargs["predicate"] == ""
    assert fact_kwargs["status"] == "confirmed"
def test_enrich_memory_skips_when_parse_returns_none(monkeypatch: pytest.MonkeyPatch) -> None:
    """When the LLM reply is not valid JSON, neither a summary nor a fact is created."""
    from app.features.memory import enrichment as mod

    monkeypatch.setattr("app.core.config.settings.memory_enrichment_enabled", True)

    called = {"summary": False, "fact": False}

    def malformed_reply(*_args, **_kwargs):
        return "{not json"

    def single_chunk(s, sid):
        return [SimpleNamespace(id="c1", content="x")]

    def mark_summary(*_args, **_kwargs):
        called["summary"] = True

    def mark_fact(*_args, **_kwargs):
        called["fact"] = True

    monkeypatch.setattr(mod, "invoke_json_object", malformed_reply)
    monkeypatch.setattr(mod, "list_chunks_for_source_sync", single_chunk)
    monkeypatch.setattr(mod, "create_memory_summary_sync", mark_summary)
    monkeypatch.setattr(mod, "create_memory_fact_sync", mark_fact)

    class FakeSession:
        """Session double: the source row exists, every other lookup returns ``None``."""

        def get(self, model, key):
            # The User lookup for "u" deliberately resolves to None, like any
            # other unmatched lookup; only the MemorySource "s" row is present.
            if model is MemorySource and key == "s":
                return SimpleNamespace(lineage_json=None)
            return None

    enrich_memory_after_ingest_sync(FakeSession(), "u", "s", llm=object())

    assert called == {"summary": False, "fact": False}

View File

@@ -90,13 +90,18 @@ def test_single_segment_decide_receives_only_combined_text_not_evidence() -> Non
patch(
"app.features.memoir.story_pipeline_sync.MemoirImageSettings",
) as mis,
patch(
"app.tasks.story_title_tasks.generate_story_title_after_create.delay",
),
patch(
"app.features.memoir.story_pipeline_sync.refresh_chapter_evidence_snapshot_with_retry_sync",
),
):
route_agent_mock.plan_batch.return_value = None
route_agent_mock.decide.side_effect = decide_capture
na = MagicMock()
nac.return_value = na
na.generate_title.return_value = "章节标题"
na.generate_narrative.return_value = '{"paragraphs": [{"content": "叙事正文段落足够长用于测试合并逻辑避免触发过短回退"}]}'
mock_story = MagicMock()
@@ -200,13 +205,18 @@ def test_decide_receives_only_same_stage_story_candidates() -> None:
patch(
"app.features.memoir.story_pipeline_sync.MemoirImageSettings",
) as mis,
patch(
"app.tasks.story_title_tasks.generate_story_title_after_create.delay",
),
patch(
"app.features.memoir.story_pipeline_sync.refresh_chapter_evidence_snapshot_with_retry_sync",
),
):
route_agent_mock.plan_batch.return_value = None
route_agent_mock.decide.side_effect = decide_capture
na = MagicMock()
nac.return_value = na
na.generate_title.return_value = "章节标题"
na.generate_narrative.return_value = '{"paragraphs": [{"content": "叙事正文段落足够长用于测试合并逻辑避免触发过短回退"}]}'
mock_story = MagicMock()