Files
life-echo/api/app/features/memory/extractor.py
Kevin 41518bda11 聊天和回忆录证据检索都走 pgvector,去掉 Postgres FTS/content_tsv,新迁移删掉 content_tsv 列(部署要先 alembic upgrade)。
Embedding 端口增加 is_available(),聊天和回忆录日志用统一方式表示向量是否真能调用。

记忆整理(compaction)支持 Beat 定期扫用户;

事实抽取提示与 subject 归一化,减少同一人多种称呼;
2026-04-03 11:43:16 +08:00

125 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""从 transcript 块中抽取结构化事实LLM + JSON"""
from __future__ import annotations
from typing import Any
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
from app.core.logging import get_logger
from app.features.memory.llm_schemas import (
FactsExtractionPayload,
facts_payload_to_dicts,
parse_json_payload,
)
logger = get_logger(__name__)
def _max_transcript_chars() -> int:
    """Return the configured character cap for transcript text.

    The value is read from ``settings.memory_enrichment_max_chars``. The
    settings object is imported inside the function, so the import runs at
    call time rather than when this module is loaded.
    """
    from app.core.config import settings as app_settings

    char_limit = app_settings.memory_enrichment_max_chars
    return char_limit
def _facts_extraction_instructions(narrator_label: str) -> str:
    """Build the instruction text that precedes the transcript in the prompt.

    Args:
        narrator_label: Name inserted into the rule that tells the model
            which ``subject`` value to use for the narrator.

    Returns:
        The full instruction string. It ends with a blank line, so the
        transcript text can be concatenated directly after it.
    """
    # Role description plus the "everything is in the past" framing.
    role_and_tense = (
        "你是回忆录事实抽取助手。用户正在口述人生回忆,所有内容默认是**过去发生的事**"
        "而非当前或未来计划(除非原文明确说「现在」「打算」「准备将要」等)。\n\n"
    )

    # Rule 1 (subject naming) is the only segment that depends on the argument.
    subject_rule = (
        "## 抽取规则\n"
        "1. subject 必须用明确的人名或固定称谓:\n"
        f" - 叙述者本人统一用「{narrator_label}\n"
        " - 其他人用全名或稳定专名(如「王伟」),禁止用「他」「她」「我」「我们大伙」等代词作 subject"
        "若代词在上下文中可唯一解析为某人,则 subject 写该人姓名/专名\n"
    )

    # Rules 2-3: tense interpretation and the approximate_era hint.
    tense_and_era_rules = (
        "2. 事件、职务变动、地点迁移等一律按**过去回忆**理解travel/调动/命令类表述勿写成「即将要做」"
        "除非原文明确为未来时态\n"
        "3. 若可推断大约年代或人生阶段,将 approximate_era 写入 object_json与 value 等字段并存),"
        '例如 "1990年代""2001年""退休后""30岁前后"\n'
    )

    # Rules 4-7: per-field requirements of each fact.
    field_rules = (
        "4. fact_type: person|event|relation|place|milestone\n"
        "5. predicate简短中文谓语如「出生地」「担任职务」「调往」\n"
        "6. object_json字符串或对象可含 value、approximate_era 等\n"
        "7. confidence 0..1source_chunk_id 必须等于某段 [chunk_id=...] 中的 id\n\n"
    )

    # Output contract; a plain (non-f) string, so the braces are literal.
    output_contract = '只输出 JSON{"facts":[...]},无事实则 {"facts":[]}。\n\n'

    return "".join(
        (
            role_and_tense,
            subject_rule,
            tense_and_era_rules,
            field_rules,
            output_contract,
        )
    )
def extract_facts_from_transcript_sync(
    llm: Any,
    numbered_blocks: str,
    *,
    narrator_name: str | None = None,
) -> list[dict]:
    """Synchronously convert chunk_id-tagged transcript text into facts.

    Args:
        llm: Model handle passed through to ``invoke_json_object``. A falsy
            value short-circuits to an empty result.
        numbered_blocks: Transcript text carrying ``[chunk_id=...]`` markers.
            Blank or ``None`` input short-circuits to an empty result.
        narrator_name: Optional name used for the narrator in the prompt;
            when missing or blank, the default label ``叙述者`` is used.

    Returns:
        The list produced by ``facts_payload_to_dicts``, or ``[]`` when the
        input is unusable, the payload cannot be parsed, or a
        ``TypeError`` / ``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    # Cap the transcript length before it is appended to the instructions.
    transcript = stripped[: _max_transcript_chars()]

    label = (narrator_name or "").strip()
    if not label:
        label = "叙述者"

    full_prompt = _facts_extraction_instructions(label) + transcript

    try:
        payload = parse_json_payload(
            invoke_json_object(
                llm,
                full_prompt,
                max_tokens=4096,
                agent="memory.extract_facts_sync",
            ),
            FactsExtractionPayload,
        )
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as err:
        logger.warning("extract_facts_from_transcript_sync 解析失败: {}", err)
        return []
async def extract_facts_from_transcript_async(
    llm: Any,
    numbered_blocks: str,
    *,
    narrator_name: str | None = None,
) -> list[dict]:
    """Asynchronously convert chunk_id-tagged transcript text into facts.

    Args:
        llm: Model handle passed through to ``ainvoke_json_object``. A falsy
            value short-circuits to an empty result.
        numbered_blocks: Transcript text carrying ``[chunk_id=...]`` markers.
            Blank or ``None`` input short-circuits to an empty result.
        narrator_name: Optional name used for the narrator in the prompt;
            when missing or blank, the default label ``叙述者`` is used.

    Returns:
        The list produced by ``facts_payload_to_dicts``, or ``[]`` when the
        input is unusable, the payload cannot be parsed, or a
        ``TypeError`` / ``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    # Cap the transcript length before it is appended to the instructions.
    transcript = stripped[: _max_transcript_chars()]

    label = (narrator_name or "").strip()
    if not label:
        label = "叙述者"

    full_prompt = _facts_extraction_instructions(label) + transcript

    try:
        llm_output = await ainvoke_json_object(
            llm,
            full_prompt,
            max_tokens=4096,
            agent="memory.extract_facts_async",
        )
        payload = parse_json_payload(llm_output, FactsExtractionPayload)
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as err:
        logger.warning("extract_facts_from_transcript_async 解析失败: {}", err)
        return []
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
    """Legacy entry point: extract facts from a single, un-numbered text chunk.

    The text is wrapped in a ``[chunk_id=null]`` block and handed to
    ``extract_facts_from_transcript_async``. The user's nickname, when it can
    be loaded, is used as the narrator name in the prompt.

    Args:
        chunk_text: Raw text of one chunk. Blank or ``None`` input returns
            ``[]`` without touching the database or the LLM.
        user_id: Primary key used to look up the ``User`` row for the nickname.

    Returns:
        The extracted facts. Any ``source_chunk_id`` that is ``None``,
        ``"null"`` or ``""`` is normalized to ``None``, since this interface
        has no real chunk id to report.
    """
    # Nothing can be extracted from blank text; skip the DB lookup and LLM call.
    if not (chunk_text or "").strip():
        return []

    # Imported at call time rather than at module import time.
    from app.core.db import AsyncSessionLocal
    from app.core.dependencies import get_llm_provider_fast
    from app.features.user.models import User

    llm = get_llm_provider_fast().langchain_llm

    narrator_name: str | None = None
    try:
        async with AsyncSessionLocal() as db:
            user = await db.get(User, user_id)
            nickname = ((user.nickname if user else None) or "").strip()
            narrator_name = nickname or None
    except Exception as exc:
        # The nickname is optional: extraction still works with the default
        # narrator label, so record the failure instead of hiding it or
        # aborting the whole call.
        logger.warning("extract_facts 获取叙述者昵称失败: {}", exc)

    blocks = f"[chunk_id=null]\n{chunk_text}"
    facts = await extract_facts_from_transcript_async(
        llm, blocks, narrator_name=narrator_name
    )

    # The placeholder id may come back as the string "null" or empty; map
    # every such variant to a real None.
    for fact in facts:
        if fact.get("source_chunk_id") in (None, "null", ""):
            fact["source_chunk_id"] = None
    return facts