Files
life-echo/api/app/features/memory/evidence.py
Kevin 309a051038 feat: 回忆录证据血缘与内部评测可追溯,顺带对齐本地评测台与 CI
数据库与模型:新增多版迁移(章节证据快照、对话血缘、记忆事实/时间线 lineage 等),把「成稿 ↔ 对话/记忆」的溯源信息落到表结构里。
业务链路:会话与 WS、回忆录/故事流水线、记忆写入与 enrichment 等跟着接上线索与快照;新增章节证据快照与评测侧 EvalTraceService 等模块,方便组评审用的证据包。
内部评测:自动化 run 与手工 memoir 评审共用可追溯证据;rubric/ judge 相关脚本与文档有配套调整。
app-eval-web:Memoir/实验详情里能展开看证据摘要与 evidence_trace(含对话轮次 id);Vite 代理与 development.sh 注入的 API 端口与当前默认内部评测端口一致,避免改端口后页面连错服务。
工程杂项:GitHub Actions / 仓库说明有更新;各适配器与支付/配额/plan 等多处为小改动或跟随主改动的收尾;新增/扩充了?
2026-04-08 15:37:09 +08:00

288 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
证据包组装:跨 memory + story 的检索结果合并(业务层,非纯 repo
权威层级(可靠性 hardening
- **Chunk 原文**(未 excluded为首要证据rolling 摘要/故事摘录为便利视图,不得压过冲突的 chunk。
- **MemoryFact**`confirmed` 为检索默认集;`candidate` 可被上游提升;`stale` 由 compaction 等标出,检索时应排除。
- 事实 ILIKE 无命中时是否退回「最近事实」由 `memory_fact_search_use_recent_fallback` 控制(默认可避免串台)。
Celery 使用 sync + 向量 chunks`HybridRetriever` 使用 async + 向量 chunks。
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.logging import get_logger
from app.features.memory.repo import (
list_summaries_for_evidence_async,
list_summaries_for_evidence_sync,
search_chunks_vector_sync,
search_facts_for_user_async,
search_facts_for_user_sync,
search_timeline_events_for_user_async,
search_timeline_events_for_user_sync,
)
from app.features.story.repo import (
list_recent_stories_for_evidence,
list_recent_stories_for_evidence_sync,
)
if TYPE_CHECKING:
from app.ports.embedding import EmbeddingProvider
logger = get_logger(__name__)
EMPTY_EVIDENCE_BUNDLE: dict = {
"relevant_chunks": [],
"relevant_summaries": [],
"relevant_facts": [],
"timeline_hints": [],
"relevant_stories": [],
}
def _facts_to_dicts(facts) -> list[dict]:
return [
{
"id": f.id,
"fact_type": f.fact_type,
"subject": f.subject,
"predicate": f.predicate,
"object_json": f.object_json,
}
for f in facts
]
def _timeline_to_dicts(events) -> list[dict]:
return [
{
"id": e.id,
"event_year": e.event_year,
"event_date": e.event_date,
"title": e.title,
"description": e.description,
}
for e in events
]
def _stories_to_dicts(story_rows) -> list[dict]:
return [
{
"id": s.id,
"title": s.title,
"summary": s.summary,
"stage": s.stage,
"story_type": s.story_type,
}
for s in story_rows
]
def fetch_evidence_metadata_sync(
session: Session, user_id: str, q: str, top_k: int
) -> dict:
"""非 chunk 证据摘要、事实、时间线、故事sync"""
facts = search_facts_for_user_sync(session, user_id, q, top_k)
events = search_timeline_events_for_user_sync(session, user_id, q, top_k)
relevant_summaries = list_summaries_for_evidence_sync(
session, user_id=user_id, q=q, limit=top_k
)
story_rows = list_recent_stories_for_evidence_sync(
session, user_id, query=q, limit=top_k
)
return {
"relevant_facts": _facts_to_dicts(facts),
"timeline_hints": _timeline_to_dicts(events),
"relevant_summaries": relevant_summaries,
"relevant_stories": _stories_to_dicts(story_rows),
}
async def fetch_evidence_metadata_async(
db: AsyncSession, user_id: str, q: str, top_k: int
) -> dict:
"""非 chunk 证据async"""
facts = await search_facts_for_user_async(db, user_id, q, top_k)
events = await search_timeline_events_for_user_async(db, user_id, q, top_k)
relevant_summaries = await list_summaries_for_evidence_async(
db, user_id=user_id, q=q, limit=top_k
)
story_rows = await list_recent_stories_for_evidence(
db, user_id=user_id, query=q, limit=top_k
)
return {
"relevant_facts": _facts_to_dicts(facts),
"timeline_hints": _timeline_to_dicts(events),
"relevant_summaries": relevant_summaries,
"relevant_stories": _stories_to_dicts(story_rows),
}
def _empty_query_bundle_sync(session: Session, user_id: str, top_k: int) -> dict:
"""空 query 时的「浏览」降级rolling 摘要 + 事实/时间线 fallback。"""
from sqlalchemy import select
from app.features.memory.models import MemorySummary
from app.features.memory.repo import (
get_facts_for_user_sync,
get_timeline_events_for_user_sync,
)
rolling = (
session.execute(
select(MemorySummary)
.where(
MemorySummary.user_id == user_id,
MemorySummary.summary_type == "rolling",
)
.order_by(MemorySummary.updated_at.desc())
.limit(1)
)
.unique()
.scalar_one_or_none()
)
summaries = []
if rolling:
summaries = [
{
"id": rolling.id,
"summary_type": rolling.summary_type,
"content": rolling.content,
"source_chunk_ids": rolling.source_chunk_ids,
}
]
facts = get_facts_for_user_sync(session, user_id, top_k)
events = get_timeline_events_for_user_sync(session, user_id, top_k)
return {
"relevant_chunks": [],
"relevant_summaries": summaries,
"relevant_facts": _facts_to_dicts(facts),
"timeline_hints": _timeline_to_dicts(events),
"relevant_stories": [],
}
async def _empty_query_bundle_async(db: AsyncSession, user_id: str, top_k: int) -> dict:
from sqlalchemy import select
from app.features.memory.models import MemorySummary
from app.features.memory.repo import (
get_facts_for_user,
get_timeline_events_for_user,
)
roll_stmt = (
select(MemorySummary)
.where(
MemorySummary.user_id == user_id,
MemorySummary.summary_type == "rolling",
)
.order_by(MemorySummary.updated_at.desc())
.limit(1)
)
r_result = await db.execute(roll_stmt)
rolling = r_result.unique().scalar_one_or_none()
summaries = []
if rolling:
summaries = [
{
"id": rolling.id,
"summary_type": rolling.summary_type,
"content": rolling.content,
"source_chunk_ids": rolling.source_chunk_ids,
}
]
facts = await get_facts_for_user(db, user_id=user_id, limit=top_k)
events = await get_timeline_events_for_user(db, user_id=user_id, limit=top_k)
return {
"relevant_chunks": [],
"relevant_summaries": summaries,
"relevant_facts": _facts_to_dicts(facts),
"timeline_hints": _timeline_to_dicts(events),
"relevant_stories": [],
}
def retrieve_evidence_bundle_sync(
session: Session,
user_id: str,
query: str,
*,
top_k: int = 10,
embedding_provider: "EmbeddingProvider | None" = None,
) -> dict:
"""Celery / 叙事流水线:向量 chunks + 元数据(需 embedding_provider"""
if not query or not query.strip():
if settings.memory_evidence_empty_query_include_rolling:
return _empty_query_bundle_sync(session, user_id, top_k)
return dict(EMPTY_EVIDENCE_BUNDLE)
q = query.strip()
relevant_chunks: list[dict] = []
if embedding_provider is not None:
try:
q_emb = embedding_provider.embed_text_sync(q)
except Exception as exc:
logger.warning(
"retrieve_evidence_bundle_sync embed failed user_id={} err={}",
user_id,
exc,
)
q_emb = []
if q_emb:
chunk_rows = search_chunks_vector_sync(session, user_id, q_emb, top_k)
relevant_chunks = [
{
"id": r["id"],
"content": r["content"],
"chunk_index": r["chunk_index"],
}
for r in chunk_rows
]
else:
logger.warning(
"retrieve_evidence_bundle_sync empty_query_embedding user_id={}",
user_id,
)
else:
logger.warning(
"retrieve_evidence_bundle_sync no_embedding_provider user_id={}",
user_id,
)
meta = fetch_evidence_metadata_sync(session, user_id, q, top_k)
return {
"relevant_chunks": relevant_chunks,
**meta,
}
async def retrieve_evidence_bundle_async(
db: AsyncSession,
user_id: str,
query: str,
*,
top_k: int = 10,
merged_chunk_dicts: list[dict],
) -> dict:
"""
异步路径chunk 已由调用方(如 HybridRetriever向量检索填入此处只拼元数据。
merged_chunk_dicts: [{"id","content","chunk_index"}, ...]
"""
if not query or not query.strip():
if settings.memory_evidence_empty_query_include_rolling:
return await _empty_query_bundle_async(db, user_id, top_k)
return dict(EMPTY_EVIDENCE_BUNDLE)
q = query.strip()
meta = await fetch_evidence_metadata_async(db, user_id, q, top_k)
return {
"relevant_chunks": merged_chunk_dicts,
**meta,
}