Files
life-echo/api/app/features/memory/extractor.py

93 lines
3.3 KiB
Python
Raw Normal View History

"""从 transcript 块中抽取结构化事实LLM + JSON"""
from __future__ import annotations
from typing import Any
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
from app.core.logging import get_logger
from app.features.memory.llm_schemas import (
FactsExtractionPayload,
facts_payload_to_dicts,
parse_json_payload,
)
logger = get_logger(__name__)
def _max_transcript_chars() -> int:
from app.core.config import settings
return settings.memory_enrichment_max_chars
def extract_facts_from_transcript_sync(llm: Any, numbered_blocks: str) -> list[dict]:
"""同步:带 chunk_id 标记的文本 → 事实列表。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
prompt = (
"你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n"
"每个事实含 fact_type: person|event|relation|place|milestonesubjectpredicate"
"object_json可为字符串或对象confidence 0..1source_chunk_id 必须等于某段的 chunk id。\n"
'只输出 JSON{"facts":[...]},无事实则 {"facts":[]}。\n\n'
f"{text}"
)
try:
raw = invoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_sync",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_sync 解析失败: {}", e)
return []
async def extract_facts_from_transcript_async(
llm: Any, numbered_blocks: str
) -> list[dict]:
"""异步版。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
prompt = (
"你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n"
"每个事实含 fact_type: person|event|relation|place|milestonesubjectpredicate"
"object_jsonconfidence 0..1source_chunk_id 必须等于某段的 chunk id。\n"
'只输出 JSON{"facts":[...]},无事实则 {"facts":[]}。\n\n'
f"{text}"
)
try:
raw = await ainvoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_async",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_async 解析失败: {}", e)
return []
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
"""兼容旧接口:单块文本(无 chunk id 时传空 source_chunk_id"""
from app.core.dependencies import get_llm_provider
llm = get_llm_provider().langchain_llm
blocks = f"[chunk_id=null]\n{chunk_text}"
facts = await extract_facts_from_transcript_async(llm, blocks)
for f in facts:
if f.get("source_chunk_id") in (None, "null", ""):
f["source_chunk_id"] = None
return facts