feat(eval): server-side replay/phase1 timing + memoir phase1 batch chunking

- Replay and memoir-submit responses include started/finished UTC and elapsed_ms; Phase1 poll exposes Redis-backed submit time and elapsed_ms_since_submit.
- Phase1 batch LLM splits segments by memoir_phase1_batch_llm_chunk_size with bisect fallback per chunk; Playground shows server timings.

Made-with: Cursor
2026-04-09 13:38:53 +08:00
parent 064ad2161d
commit b0251e5b26
14 changed files with 544 additions and 14 deletions
--- a/api/tests/evaluation/test_memoir_readiness_router.py
+++ b/api/tests/evaluation/test_memoir_readiness_router.py
@@ -1,5 +1,7 @@
 """memoir-phase1-ready internal 路由（依赖注入替身）。"""

+from datetime import datetime, timezone
+
 import pytest
 from httpx import ASGITransport, AsyncClient

@@ -54,6 +56,60 @@ async def test_memoir_phase1_ready_returns_bundle(monkeypatch: pytest.MonkeyPatc
    assert body["pending_segment_ids"] == []


+@pytest.mark.asyncio
+async def test_memoir_phase1_ready_includes_server_elapsed_fields(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from fastapi import FastAPI
+
+    monkeypatch.setattr(
+        "app.core.config.settings.internal_eval_api_key",
+        "secret",
+        raising=False,
+    )
+    from app.features.evaluation.deps import get_memoir_readiness_service
+    from app.features.evaluation.router import router
+
+    class _Fake:
+        async def memoir_phase1_ready_for_segments(
+            self, *, conversation_id: str, segment_ids: list[str]
+        ) -> MemoirPhase1ReadyOut:
+            return MemoirPhase1ReadyOut(
+                ready=False,
+                checked_segment_ids=list(segment_ids),
+                pending_segment_ids=["pending-1"],
+                job_submitted_at_utc=datetime(
+                    2026, 4, 9, 8, 0, 0, tzinfo=timezone.utc
+                ),
+                elapsed_ms_since_submit=12_000,
+                durations_ms={"since_playground_submit": 12_000},
+            )
+
+    app = FastAPI()
+    app.include_router(router, prefix="/internal/api/evaluation")
+
+    async def _override_auth():
+        from app.features.evaluation.internal_auth import InternalEvalPrincipal
+
+        return InternalEvalPrincipal()
+
+    app.dependency_overrides[get_internal_eval_principal] = _override_auth
+    app.dependency_overrides[get_memoir_readiness_service] = lambda: _Fake()
+
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://t") as client:
+        r = await client.get(
+            "/internal/api/evaluation/sessions/cid-a/memoir-phase1-ready",
+            headers={"X-Internal-Eval-Key": "secret"},
+            params=[("segment_ids", "s1")],
+        )
+    assert r.status_code == 200
+    body = r.json()
+    assert body["elapsed_ms_since_submit"] == 12_000
+    assert body["durations_ms"]["since_playground_submit"] == 12_000
+    assert body["job_submitted_at_utc"] is not None
+
+
@pytest.mark.asyncio
 async def test_memoir_phase1_ready_404_propagates(monkeypatch: pytest.MonkeyPatch) -> None:
    from fastapi import FastAPI
--- a/api/tests/evaluation/test_replay_timing_response.py
+++ b/api/tests/evaluation/test_replay_timing_response.py
@@ -0,0 +1,71 @@
+"""replay/conversation 响应携带服务端 elapsed 字段。"""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import pytest
+from httpx import ASGITransport, AsyncClient
+
+from app.features.evaluation.internal_auth import get_internal_eval_principal
+from app.features.evaluation.replay_service import ReplayServerTiming
+
+
+@pytest.mark.asyncio
+async def test_replay_conversation_includes_server_elapsed_ms(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from fastapi import FastAPI
+
+    monkeypatch.setattr(
+        "app.core.config.settings.internal_eval_api_key",
+        "secret",
+        raising=False,
+    )
+    from app.features.evaluation.deps import get_replay_conversation_service
+    from app.features.evaluation.router import router
+
+    t0 = datetime(2026, 4, 9, 10, 0, 0, tzinfo=timezone.utc)
+    t1 = datetime(2026, 4, 9, 10, 0, 1, tzinfo=timezone.utc)
+
+    class _FakeReplay:
+        async def replay_utterances(self, **kwargs):
+            return (
+                1,
+                ["seg-a"],
+                ReplayServerTiming(
+                    started_at_utc=t0,
+                    finished_at_utc=t1,
+                    elapsed_ms=150,
+                ),
+            )
+
+    app = FastAPI()
+    app.include_router(router, prefix="/internal/api/evaluation")
+
+    async def _override_auth():
+        from app.features.evaluation.internal_auth import InternalEvalPrincipal
+
+        return InternalEvalPrincipal()
+
+    app.dependency_overrides[get_internal_eval_principal] = _override_auth
+    app.dependency_overrides[get_replay_conversation_service] = lambda: _FakeReplay()
+
+    transport = ASGITransport(app=app)
+    async with AsyncClient(transport=transport, base_url="http://t") as client:
+        r = await client.post(
+            "/internal/api/evaluation/replay/conversation",
+            headers={"X-Internal-Eval-Key": "secret"},
+            json={
+                "conversation_id": "00000000-0000-0000-0000-000000000099",
+                "user_utterances": ["hi"],
+                "flush_memoir_after": False,
+                "skip_memoir": True,
+                "skip_tts": True,
+            },
+        )
+    assert r.status_code == 200
+    body = r.json()
+    assert body["elapsed_ms"] == 150
+    assert body["started_at_utc"] is not None
+    assert body["finished_at_utc"] is not None
--- a/api/tests/test_batch_phase1_chunked.py
+++ b/api/tests/test_batch_phase1_chunked.py
@@ -0,0 +1,99 @@
+"""Phase1 批处理 LLM 分块：大量 segment 时拆多次请求并合并 by_id。"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+
+from app.agents.memoir.batch_phase1_prep import (
+    BatchPhase1SegmentRow,
+    run_batch_phase1_prep_chunked,
+)
+from app.agents.state_schema import MemoirStateSchema
+
+
+def _state() -> MemoirStateSchema:
+    return MemoirStateSchema(
+        stage_order=["childhood"],
+        current_stage="childhood",
+        covered_stages=[],
+        slots={},
+    )
+
+
+def test_run_batch_phase1_prep_chunked_splits_95_into_four_calls(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    chunk_lengths: list[int] = []
+
+    def fake_prep(
+        segments: list,
+        state: MemoirStateSchema,
+        llm: object,
+    ) -> dict[str, BatchPhase1SegmentRow]:
+        chunk_lengths.append(len(segments))
+        return {
+            str(s.id): BatchPhase1SegmentRow(
+                detected_stage="childhood",
+                slots={},
+                chapter_category_raw="summary",
+            )
+            for s in segments
+        }
+
+    monkeypatch.setattr(
+        "app.agents.memoir.batch_phase1_prep.run_batch_phase1_prep",
+        fake_prep,
+    )
+    segments = [
+        SimpleNamespace(id=f"s{i}", user_input_text="hello") for i in range(95)
+    ]
+    by_id = run_batch_phase1_prep_chunked(
+        segments,
+        _state(),
+        MagicMock(),
+        chunk_size=24,
+    )
+    assert len(by_id) == 95
+    assert chunk_lengths == [24, 24, 24, 23]
+
+
+def test_chunked_bisect_on_value_error(monkeypatch: pytest.MonkeyPatch) -> None:
+    """块内失败时二分重试，仍能拼回全量 id。"""
+    chunk_lengths: list[int] = []
+
+    def fake_prep(
+        segments: list,
+        state: MemoirStateSchema,
+        llm: object,
+    ) -> dict[str, BatchPhase1SegmentRow]:
+        chunk_lengths.append(len(segments))
+        if len(segments) == 4:
+            raise ValueError("simulate length limit")
+        return {
+            str(s.id): BatchPhase1SegmentRow(
+                detected_stage="childhood",
+                slots={},
+                chapter_category_raw="summary",
+            )
+            for s in segments
+        }
+
+    monkeypatch.setattr(
+        "app.agents.memoir.batch_phase1_prep.run_batch_phase1_prep",
+        fake_prep,
+    )
+    segments = [
+        SimpleNamespace(id=f"b{i}", user_input_text="x") for i in range(4)
+    ]
+    by_id = run_batch_phase1_prep_chunked(
+        segments,
+        _state(),
+        MagicMock(),
+        chunk_size=100,
+    )
+    assert len(by_id) == 4
+    assert chunk_lengths[0] == 4
+    assert 2 in chunk_lengths
--- a/api/tests/test_memoir_pipeline_optimization.py
+++ b/api/tests/test_memoir_pipeline_optimization.py
@@ -30,6 +30,7 @@ def test_phase1_batch_enabled_by_default() -> None:

    s = Settings()
    assert s.memoir_phase1_batch_llm_enabled is True
+    assert s.memoir_phase1_batch_llm_chunk_size >= 1


 def test_quality_pass_enabled_by_default() -> None:
--- a/api/tests/test_memoir_skip_story.py
+++ b/api/tests/test_memoir_skip_story.py
@@ -131,6 +131,8 @@ def test_prepare_batches_batch_llm_path_matches_per_segment_skip_logic(
        segments: list,
        state: MemoirStateSchema,
        llm: object,
+        *,
+        chunk_size: int = 24,
    ) -> dict:
        return {
            "mix-1": BatchPhase1SegmentRow(
@@ -146,7 +148,7 @@ def test_prepare_batches_batch_llm_path_matches_per_segment_skip_logic(
        }

    monkeypatch.setattr(
-        "app.agents.memoir.orchestrator.run_batch_phase1_prep",
+        "app.agents.memoir.orchestrator.run_batch_phase1_prep_chunked",
        fake_batch,
    )
    orch = MemoirOrchestrator()