数据库 - 新增迁移 0003:timeline_events.memory_source_id 外键 → memory_sources,便于按 ingest 源做时间线幂等 后端 - 记忆 - 新增 ingest 后 LLM 富化(摘要/事实/时间线),可配置开关与最大字符数 - 新增证据包组装:合并 chunk、摘要、事实、时间线、故事等检索结果;支持空 query 时是否仍带 rolling 等开关 - repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展;文档 memory-retrieval.md 更新 后端 - 对话 WS - 增加 PING/PONG;分段 ASR 日志与空音频处理;转写失败与「无助手回复」错误提示更明确 - 助手多段回复持久化使用统一分隔符,与分段逻辑一致 后端 - Agent - reply_limits:按 [SPLIT] 与段落拆段,并保证非空 fallback,供 WS 与 TTS 多段下发 后端 - 回忆录任务 - transcript ingest 记录 source_id;任务成功结?
93 lines
3.3 KiB
Python
93 lines
3.3 KiB
Python
"""从 transcript 块中抽取结构化事实(LLM + JSON)。"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from typing import Any
|
||
|
||
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
|
||
from app.core.logging import get_logger
|
||
from app.features.memory.llm_schemas import (
|
||
FactsExtractionPayload,
|
||
facts_payload_to_dicts,
|
||
parse_json_payload,
|
||
)
|
||
|
||
logger = get_logger(__name__)
|
||
|
||
|
||
def _max_transcript_chars() -> int:
    """Return the configured maximum number of transcript characters.

    The value is read from ``settings.memory_enrichment_max_chars`` on every
    call, so the settings object is looked up at call time rather than when
    this module is imported.
    """
    from app.core.config import settings

    limit = settings.memory_enrichment_max_chars
    return limit
|
||
|
||
|
||
def extract_facts_from_transcript_sync(llm: Any, numbered_blocks: str) -> list[dict]:
    """Synchronously extract a list of fact dicts from chunk-tagged text.

    Args:
        llm: LLM handle forwarded to ``invoke_json_object``. A falsy value
            short-circuits to an empty result.
        numbered_blocks: Text made of blocks tagged with ``[chunk_id=...]``.
            It is stripped and truncated to ``_max_transcript_chars()``
            characters before being embedded in the prompt.

    Returns:
        The output of ``facts_payload_to_dicts`` for the parsed payload, or
        an empty list when there is no LLM, the text is blank, the payload
        cannot be parsed (``parse_json_payload`` returns ``None``), or a
        ``TypeError`` / ``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    text = stripped[: _max_transcript_chars()]
    prompt = "".join(
        [
            "你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n",
            "每个事实含 fact_type: person|event|relation|place|milestone;subject;predicate;",
            "object_json(可为字符串或对象);confidence 0..1;source_chunk_id 必须等于某段的 chunk id。\n",
            '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n',
            text,
        ]
    )

    # The whole pipeline stays inside the try so that TypeError/ValueError
    # from any step (invoke, parse, convert) degrades to an empty result.
    try:
        raw = invoke_json_object(
            llm,
            prompt,
            max_tokens=4096,
            agent="memory.extract_facts_sync",
        )
        payload = parse_json_payload(raw, FactsExtractionPayload)
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as exc:
        logger.warning("extract_facts_from_transcript_sync 解析失败: {}", exc)
        return []
|
||
|
||
|
||
async def extract_facts_from_transcript_async(
    llm: Any, numbered_blocks: str
) -> list[dict]:
    """Asynchronously extract a list of fact dicts from chunk-tagged text.

    Args:
        llm: LLM handle forwarded to ``ainvoke_json_object``. A falsy value
            short-circuits to an empty result.
        numbered_blocks: Text made of blocks tagged with ``[chunk_id=...]``.
            It is stripped and truncated to ``_max_transcript_chars()``
            characters before being embedded in the prompt.

    Returns:
        The output of ``facts_payload_to_dicts`` for the parsed payload, or
        an empty list when there is no LLM, the text is blank, the payload
        cannot be parsed (``parse_json_payload`` returns ``None``), or a
        ``TypeError`` / ``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    text = stripped[: _max_transcript_chars()]
    # NOTE(review): the sync sibling's prompt adds a parenthetical hint that
    # object_json may be a string or an object; this prompt omits it.
    # Confirm whether that drift is intentional before unifying them.
    prompt = "".join(
        [
            "你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n",
            "每个事实含 fact_type: person|event|relation|place|milestone;subject;predicate;",
            "object_json;confidence 0..1;source_chunk_id 必须等于某段的 chunk id。\n",
            '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n',
            text,
        ]
    )

    # The whole pipeline stays inside the try so that TypeError/ValueError
    # from any step (invoke, parse, convert) degrades to an empty result.
    try:
        raw = await ainvoke_json_object(
            llm,
            prompt,
            max_tokens=4096,
            agent="memory.extract_facts_async",
        )
        payload = parse_json_payload(raw, FactsExtractionPayload)
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as exc:
        logger.warning("extract_facts_from_transcript_async 解析失败: {}", exc)
        return []
|
||
|
||
|
||
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
    """Legacy entry point: extract facts from a single untagged text block.

    The text is wrapped in a ``[chunk_id=null]`` marker and handed to
    ``extract_facts_from_transcript_async`` using the LLM obtained from
    ``get_llm_provider().langchain_llm``. Afterwards every fact whose
    ``source_chunk_id`` is ``None``, ``"null"`` or ``""`` is normalized to
    ``None``.

    Args:
        chunk_text: Raw text to extract facts from. Empty, whitespace-only
            or ``None`` input yields ``[]`` without touching the LLM.
        user_id: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The list of fact dicts with ``source_chunk_id`` normalized.
    """
    # Without this guard a blank input would still produce the non-blank
    # block "[chunk_id=null]\n", slip past the downstream blank check, and
    # trigger a provider lookup plus an LLM round-trip with nothing to read.
    if not (chunk_text or "").strip():
        return []

    from app.core.dependencies import get_llm_provider

    llm = get_llm_provider().langchain_llm
    blocks = f"[chunk_id=null]\n{chunk_text}"
    facts = await extract_facts_from_transcript_async(llm, blocks)
    for fact in facts:
        # The model may echo the placeholder id as the string "null" or as
        # an empty string; collapse all of those to a real None.
        if fact.get("source_chunk_id") in (None, "null", ""):
            fact["source_chunk_id"] = None
    return facts
|