Files
life-echo/api/tests/test_memory_compaction.py
Kevin 41518bda11 聊天和回忆录证据检索都走 pgvector,去掉 Postgres FTS/content_tsv,新迁移删掉 content_tsv 列(部署要先 alembic upgrade)。
Embedding 端口增加 is_available(),聊天和回忆录日志用统一方式表示向量是否真能调用。

记忆整理(compaction)支持 Beat 定期扫用户;

事实抽取提示与 subject 归一化,减少同一人多种称呼;
2026-04-03 11:43:16 +08:00

498 lines
15 KiB
Python

"""Memory compaction 回归测试。"""
from __future__ import annotations
from contextlib import contextmanager
from datetime import datetime, timezone
from types import SimpleNamespace
import pytest
from app.core.config import settings
from app.core import memory_compaction_schedule as schedule
from app.features.memory.compaction_service import (
canonical_score,
count_duplicate_layers,
embedding_layer_match,
metadata_layer_match,
run_memory_compaction_sync,
text_layer_match,
)
from app.features.memory.service import ingest_transcript_sync
from app.tasks.memory_compaction_tasks import (
memory_compaction_run,
memory_compaction_sweep,
)
class FakeRedis:
    """In-memory stand-in for the Redis client used by the scheduler tests.

    Only ``get``, ``set`` (with ``nx``) and ``delete`` are modelled, backed by
    a plain dict. Stored values never expire.
    """

    def __init__(self) -> None:
        self._store: dict[str, str] = {}

    def get(self, key: str) -> str | None:
        """Return the value stored under *key*, or ``None`` when it is absent."""
        try:
            return self._store[key]
        except KeyError:
            return None

    def set(
        self,
        key: str,
        value: str,
        *,
        nx: bool = False,
        ex: int | None = None,
    ) -> bool:
        """Store *value* under *key* and report whether the write happened.

        With ``nx=True`` an already-present key is left untouched and ``False``
        is returned. ``ex`` is accepted but ignored.
        """
        already_present = key in self._store
        if already_present and nx:
            return False
        self._store[key] = value
        return True

    def delete(self, key: str) -> int:
        """Remove *key*; return ``1`` if it was present, otherwise ``0``."""
        absent = object()
        removed = self._store.pop(key, absent)
        return 0 if removed is absent else 1
def test_embedding_layer_uses_cosine_distance() -> None:
    """Distance 0.05 must match at ``similarity_threshold=0.92``; 0.2 must not."""
    cases = ((0.05, True), (0.2, False))
    for distance, expected in cases:
        assert embedding_layer_match(distance, similarity_threshold=0.92) is expected
def test_text_layer_jaccard() -> None:
    """Two identical texts must pass the text layer at ``jaccard_min=0.55``."""
    sentence = "我在乡下度过童年 夏天很热"
    matched = text_layer_match(sentence, sentence, jaccard_min=0.55)
    assert matched is True
def test_metadata_layer_same_source() -> None:
    """The metadata layer matches an equal ``source_id`` and rejects a different one.

    ``event_year`` is ``None`` on both sides, so only ``source_id`` varies
    between the two cases.
    """
    from unittest.mock import MagicMock

    chunk = MagicMock(source_id="s1", event_year=None)

    for neighbor_source, expected in (("s1", True), ("s2", False)):
        neighbor = {"source_id": neighbor_source, "event_year": None}
        assert metadata_layer_match(chunk, neighbor, event_year_window=1) is expected
def test_count_duplicate_layers_requires_min() -> None:
    """Identical content at distance 0.02 must count as at least two layers.

    The chunk and its neighbor differ in ``source_id`` ("a" vs "b") and have
    event years 1990 and 1991, checked with ``event_year_window=1``.
    """
    from unittest.mock import MagicMock

    text = "hello world test duplicate"
    candidate = MagicMock(source_id="a", event_year=1990, content=text)
    neighbor_row = {
        "content": "hello world test duplicate",
        "source_id": "b",
        "event_year": 1991,
    }

    matched_layers = count_duplicate_layers(
        chunk=candidate,
        neighbor=neighbor_row,
        distance=0.02,
        similarity_threshold=0.9,
        jaccard_min=0.55,
        event_year_window=1,
    )

    assert matched_layers >= 2
def test_canonical_score_prefers_longer_and_draft() -> None:
    """With equal content and empty metadata, ``draft`` must outscore ``transcript``.

    NOTE(review): despite the name, only the source-type preference is
    asserted here; content length is held constant.
    """

    def score_for(source_type: str):
        # A fresh metadata dict per call, so the two calls share no state.
        return canonical_score(
            content="short", metadata_json={}, source_type=source_type
        )

    transcript_score = score_for("transcript")
    draft_score = score_for("draft")
    assert draft_score > transcript_score
def test_schedule_merges_subsequent_triggers(monkeypatch) -> None:
    """Two triggers for one user inside the debounce window enqueue a single task.

    The clock is frozen at 100.0 with a 30-second debounce. Afterwards the
    scheduler key must hold "1" and the debounce key "130.0".
    """
    redis_stub = FakeRedis()
    enqueued: list[tuple[str, dict, int]] = []

    def record_enqueue(user_id, context, *, countdown):
        enqueued.append((user_id, context or {}, countdown))

    for option, value in (
        ("memory_compaction_enabled", True),
        ("memory_compaction_debounce_seconds", 30),
    ):
        monkeypatch.setattr(settings, option, value)
    monkeypatch.setattr(schedule, "_get_redis", lambda: redis_stub)
    monkeypatch.setattr(schedule.time, "time", lambda: 100.0)
    monkeypatch.setattr(schedule, "_enqueue_memory_compaction_task", record_enqueue)

    first_context = {"trigger_source": "memoir_segments"}
    second_context = {"trigger_source": "chapter_recompose"}
    schedule.schedule_memory_compaction_run("u1", first_context)
    schedule.schedule_memory_compaction_run("u1", second_context)

    assert len(enqueued) == 1
    gate_value = redis_stub.get(schedule.scheduler_key("u1"))
    assert gate_value == "1"
    deadline_value = redis_stub.get(schedule.debounce_key("u1"))
    assert deadline_value == "130.0"
def test_finalize_reschedules_when_deadline_extended(monkeypatch) -> None:
    """Finalize must enqueue a follow-up when the stored deadline moved forward.

    Redis holds deadline "175.0" while the run observed 130.0, with the clock
    frozen at 140.0. One task with countdown 35 is expected, and both Redis
    keys must keep their values.
    """
    redis_stub = FakeRedis()
    enqueued: list[tuple[str, dict, int]] = []

    def record_enqueue(user_id, context, *, countdown):
        enqueued.append((user_id, context or {}, countdown))

    monkeypatch.setattr(settings, "memory_compaction_debounce_seconds", 30)
    monkeypatch.setattr(schedule, "_get_redis", lambda: redis_stub)
    monkeypatch.setattr(schedule.time, "time", lambda: 140.0)
    monkeypatch.setattr(schedule, "_enqueue_memory_compaction_task", record_enqueue)

    # Seed Redis as if a later trigger had already pushed the deadline out.
    redis_stub.set(schedule.debounce_key("u1"), "175.0")
    redis_stub.set(schedule.scheduler_key("u1"), "1")

    schedule.finalize_memory_compaction_run(
        "u1",
        observed_deadline_ts=130.0,
        context={"trigger_source": "memoir_segments"},
    )

    assert len(enqueued) == 1
    _user, _context, countdown = enqueued[0]
    assert countdown == 35
    assert redis_stub.get(schedule.scheduler_key("u1")) == "1"
    assert redis_stub.get(schedule.debounce_key("u1")) == "175.0"
def test_finalize_clears_stale_deadline_when_not_extended(monkeypatch) -> None:
    """Finalize must drop both Redis keys when the deadline was not extended.

    The stored deadline ("130.0") equals the observed deadline (130.0).
    """
    redis_stub = FakeRedis()
    monkeypatch.setattr(settings, "memory_compaction_debounce_seconds", 30)
    monkeypatch.setattr(schedule, "_get_redis", lambda: redis_stub)

    seeded = (
        (schedule.debounce_key("u1"), "130.0"),
        (schedule.scheduler_key("u1"), "1"),
    )
    for key, value in seeded:
        redis_stub.set(key, value)

    schedule.finalize_memory_compaction_run(
        "u1",
        observed_deadline_ts=130.0,
        context={"trigger_source": "memoir_segments"},
    )

    assert redis_stub.get(schedule.scheduler_key("u1")) is None
    assert redis_stub.get(schedule.debounce_key("u1")) is None
def test_ingest_transcript_sync_populates_embeddings(monkeypatch) -> None:
    """Ingesting a transcript must store an embedding for every created chunk.

    The chunker, repository functions and embedding provider are all stubbed.
    The test checks the returned source id, the order of chunk ids passed to
    ``update_chunk_embedding_sync``, and that the session was committed
    exactly once.
    """

    class CountingSession:
        """Session double that only counts ``flush`` and ``commit`` calls."""

        def __init__(self) -> None:
            self.flush_calls = 0
            self.commit_calls = 0

        def flush(self) -> None:
            self.flush_calls += 1

        def commit(self) -> None:
            self.commit_calls += 1

    class StubEmbeddingProvider:
        """Always-available provider returning one-element vectors [1.0], [2.0], ..."""

        def is_available(self) -> bool:
            return True

        def embed_texts_sync(self, texts: list[str]) -> list[list[float]]:
            return [[float(position)] for position, _text in enumerate(texts, 1)]

        async def embed_texts(self, texts: list[str]) -> list[list[float]]:
            return [[float(position)] for position, _text in enumerate(texts, 1)]

    session = CountingSession()
    stored_embeddings: list[tuple[str, list[float]]] = []

    def fake_chunk_transcript(transcript):
        return ["第一段", "第二段"]

    def fake_create_source(*args, **kwargs):
        return SimpleNamespace(id="src-1")

    def fake_create_chunk(*args, **kwargs):
        return SimpleNamespace(id=f"chunk-{kwargs['chunk_index']}")

    def fake_update_embedding(session, chunk_id, embedding):
        stored_embeddings.append((chunk_id, embedding))

    monkeypatch.setattr(settings, "memory_enrichment_enabled", False)
    patches = (
        # Patching in the class itself: calling it yields a fresh provider.
        ("app.core.dependencies.get_embedding_provider", StubEmbeddingProvider),
        ("app.features.memory.chunker.chunk_transcript", fake_chunk_transcript),
        ("app.features.memory.repo.create_source_sync", fake_create_source),
        ("app.features.memory.repo.create_chunk_sync", fake_create_chunk),
        ("app.features.memory.repo.update_chunk_embedding_sync", fake_update_embedding),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    returned_source_id = ingest_transcript_sync(
        session,
        user_id="u1",
        conversation_id="conv-1",
        transcript="hello",
    )

    assert returned_source_id == "src-1"
    embedded_ids = [chunk_id for chunk_id, _vector in stored_embeddings]
    assert embedded_ids == ["chunk-0", "chunk-1"]
    assert session.commit_calls == 1
def test_run_memory_compaction_stops_before_missing_embedding(monkeypatch) -> None:
    """Compaction must stop at a chunk whose stored row has no embedding.

    The result is expected to report ``awaiting_embeddings``, name the pending
    chunk, and leave ``new_cursor_id`` as ``None``.
    """
    created = datetime(2024, 1, 1, tzinfo=timezone.utc)
    listed_chunk = SimpleNamespace(id="chunk-1", created_at=created)
    stored_row = SimpleNamespace(
        id="chunk-1",
        created_at=created,
        is_excluded=False,
        embedding=None,
    )

    def fake_list_incremental(*args, **kwargs):
        return [listed_chunk]

    def fake_get_chunk(*args, **kwargs):
        return stored_row

    monkeypatch.setattr(
        "app.features.memory.compaction_service.list_incremental_chunks_for_compaction_sync",
        fake_list_incremental,
    )
    monkeypatch.setattr(
        "app.features.memory.compaction_service.get_memory_chunk_sync",
        fake_get_chunk,
    )

    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    cursor_override = (epoch, "00000000-0000-0000-0000-000000000000")
    result = run_memory_compaction_sync(
        session=object(),
        user_id="u1",
        context={"_cursor_pair_override": cursor_override},
    )

    assert result["skipped_reason"] == "awaiting_embeddings"
    assert result["pending_chunk_id"] == "chunk-1"
    assert result["new_cursor_id"] is None
def test_curation_action_details_include_trigger_context(monkeypatch) -> None:
    """Trigger context given to compaction must show up in curation action details.

    One scanned chunk and one near-identical neighbor (distance 0.01) are fed
    through stubbed repository functions. The test expects exactly one
    exclusion, and expects the ``details`` passed to
    ``create_curation_action_sync`` to carry ``trigger_time``,
    ``candidate_source_ids`` and ``chapters_to_enqueue`` from the context.
    """
    stamp = datetime(2024, 1, 1, tzinfo=timezone.utc)
    scanned = SimpleNamespace(
        id="chunk-1",
        created_at=stamp,
        content="hello world duplicate",
        metadata_json={"a": 1},
        source_id="s1",
        event_year=1990,
        embedding=[0.1, 0.2],
        is_excluded=False,
    )
    loser = SimpleNamespace(id="chunk-2", is_excluded=False)
    chunks_by_id = {"chunk-1": scanned, "chunk-2": loser}
    recorded_details: list[dict] = []

    def fake_list_incremental(*args, **kwargs):
        return [scanned]

    def fake_get_memory_chunk_sync(_session, chunk_id: str, _user_id: str):
        # Unknown ids resolve to None.
        return chunks_by_id.get(chunk_id)

    def fake_source_type(*args, **kwargs):
        return "draft"

    def fake_nearest(*args, **kwargs):
        neighbor = {
            "id": "chunk-2",
            "content": "hello world duplicate",
            "source_id": "s1",
            "event_year": 1990,
            "metadata_json": {"b": 2},
            "source_type": "transcript",
            "created_at": stamp,
            "distance": 0.01,
        }
        return [neighbor]

    def fake_set_excluded(*args, **kwargs):
        return True

    def fake_create_action(_session, **kwargs):
        recorded_details.append(kwargs["details"])

    def fake_mark_stale(*_a, **_k):
        return 0

    patches = (
        (
            "app.features.memory.compaction_service.list_incremental_chunks_for_compaction_sync",
            fake_list_incremental,
        ),
        (
            "app.features.memory.compaction_service.get_memory_chunk_sync",
            fake_get_memory_chunk_sync,
        ),
        (
            "app.features.memory.compaction_service._source_type_for_chunk",
            fake_source_type,
        ),
        (
            "app.features.memory.compaction_service.search_nearest_chunks_for_compaction_sync",
            fake_nearest,
        ),
        (
            "app.features.memory.compaction_service.set_chunk_excluded_sync",
            fake_set_excluded,
        ),
        (
            "app.features.memory.compaction_service.create_curation_action_sync",
            fake_create_action,
        ),
        (
            "app.features.memory.compaction_service.mark_facts_stale_for_excluded_chunk_sync",
            fake_mark_stale,
        ),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    result = run_memory_compaction_sync(
        session=object(),
        user_id="u1",
        context={
            "_cursor_pair_override": (
                epoch,
                "00000000-0000-0000-0000-000000000000",
            ),
            "trigger_source": "memoir_segments",
            "trigger_time": "2026-03-30T00:00:00+00:00",
            "pipeline_run_id": "run-1",
            "request_id": "req-1",
            "story_dispatch_ids": ["story-1"],
            "candidate_source_ids": ["s1"],
            "chapters_to_enqueue": ["chapter-1"],
        },
    )

    assert result["chunks_excluded"] == 1
    details = recorded_details[0]
    expected_fields = {
        "trigger_time": "2026-03-30T00:00:00+00:00",
        "candidate_source_ids": ["s1"],
        "chapters_to_enqueue": ["chapter-1"],
    }
    for field, expected in expected_fields.items():
        assert details[field] == expected
@pytest.mark.parametrize("failure_stage", ["run", "commit"])
def test_memory_compaction_run_releases_gate_and_retries_on_failure(
    monkeypatch, failure_stage: str
) -> None:
    """A failure in the run or the commit must release the gate, then retry.

    ``failure_stage`` selects which stub raises ``RuntimeError``. Every
    collaborator appends a marker to ``timeline``. The test expects the gate
    release, the retry and the lock release to all be recorded, with the gate
    released before the retry.
    """
    timeline: list[str] = []

    class RetryTriggered(RuntimeError):
        """Raised by the fake ``retry`` so the test can see it was called."""

    class FailingSession:
        def commit(self) -> None:
            timeline.append("commit")
            if failure_stage == "commit":
                raise RuntimeError("db hiccup")

    @contextmanager
    def fake_get_sync_db():
        yield FailingSession()

    def fake_run_memory_compaction_sync(session, user_id: str, context: dict | None):
        timeline.append("run")
        if failure_stage == "run":
            raise RuntimeError("compaction failed")
        summary = {
            "chunks_scanned": 0,
            "chunks_excluded": 0,
            "candidates_considered": 0,
            "new_cursor_ts": None,
            "new_cursor_id": None,
            "duration_ms": 0.0,
            "skipped_reason": None,
        }
        return summary

    def fake_retry(*, exc):
        timeline.append(f"retry:{type(exc).__name__}")
        raise RetryTriggered("retried")

    def fake_read_deadline(user_id):
        return 100.0

    def fake_acquire_lock(*args, **kwargs):
        return SimpleNamespace(key="lock", token=b"t")

    def fake_release_lock(lock):
        timeline.append("release_lock")

    def fake_release_gate(user_id):
        timeline.append("release_gate")

    monkeypatch.setattr(settings, "memory_compaction_enabled", True)
    patches = (
        ("app.tasks.memory_compaction_tasks.read_debounce_deadline_ts", fake_read_deadline),
        ("app.tasks.memory_compaction_tasks.acquire_redis_lock", fake_acquire_lock),
        ("app.tasks.memory_compaction_tasks.release_redis_lock", fake_release_lock),
        ("app.tasks.memory_compaction_tasks.release_scheduler_gate", fake_release_gate),
        ("app.tasks.memory_compaction_tasks.get_sync_db", fake_get_sync_db),
        (
            "app.tasks.memory_compaction_tasks.run_memory_compaction_sync",
            fake_run_memory_compaction_sync,
        ),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)
    monkeypatch.setattr(memory_compaction_run, "retry", fake_retry)

    with pytest.raises(RetryTriggered):
        memory_compaction_run.run("u1", {"trigger_source": "memoir_segments"})

    for marker in ("release_gate", "retry:RuntimeError", "release_lock"):
        assert marker in timeline
    gate_position = timeline.index("release_gate")
    retry_position = timeline.index("retry:RuntimeError")
    assert gate_position < retry_position
def test_memory_compaction_sweep_skipped_when_disabled(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With compaction disabled, the sweep returns exactly the skip payload."""
    monkeypatch.setattr(settings, "memory_compaction_enabled", False)
    expected_payload = {"skipped": True, "reason": "disabled"}
    assert memory_compaction_sweep() == expected_payload
def test_memory_compaction_sweep_schedules_recent_users(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The sweep must schedule a run for each user returned by the recent-chunk query.

    The window is configured as 24 hours, and the stubbed query asserts it
    receives that value. Each scheduled context is expected to carry
    ``trigger_source == "beat"`` and ``sweep_hours == 24``.
    """
    for option, value in (
        ("memory_compaction_enabled", True),
        ("memory_compaction_sweep_recent_hours", 24),
    ):
        monkeypatch.setattr(settings, option, value)

    scheduled_runs: list[tuple[str, dict]] = []

    @contextmanager
    def fake_get_sync_db():
        # The session is never inspected by the stubs, so any object will do.
        yield object()

    def fake_list(session, *, hours):
        assert hours == 24
        return ["user-a", "user-b"]

    def record_schedule(uid, ctx):
        # Copy the context so later changes by the caller cannot alter the record.
        scheduled_runs.append((uid, dict(ctx)))

    monkeypatch.setattr(
        "app.tasks.memory_compaction_tasks.get_sync_db",
        fake_get_sync_db,
    )
    monkeypatch.setattr(
        "app.tasks.memory_compaction_tasks.list_users_with_recent_chunks_sync",
        fake_list,
    )
    monkeypatch.setattr(
        "app.tasks.memory_compaction_tasks.schedule_memory_compaction_run",
        record_schedule,
    )

    summary = memory_compaction_sweep()

    expected_users = {"user-a", "user-b"}
    assert summary["scheduled"] == 2
    assert set(summary["user_ids"]) == expected_users
    assert {uid for uid, _ctx in scheduled_runs} == expected_users
    for _uid, run_context in scheduled_runs:
        assert run_context.get("trigger_source") == "beat"
        assert run_context.get("sweep_hours") == 24