Files
life-echo/api/app/features/memory/extractor.py

127 lines
4.8 KiB
Python
Raw Normal View History

"""从 transcript 块中抽取结构化事实LLM + JSON"""
from __future__ import annotations
from typing import Any
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
from app.core.llm_gateway import LlmGateway, LlmUseCase
from app.core.logging import get_logger
from app.features.memory.llm_schemas import (
FactsExtractionPayload,
facts_payload_to_dicts,
parse_json_payload,
)
# Module-level logger, named after this module via get_logger(__name__).
logger = get_logger(__name__)
def _max_transcript_chars() -> int:
    """Return the configured character cap applied to transcript text.

    The settings object is imported inside the function rather than at module
    import time, and the value is read again on every call.
    """
    from app.core.config import settings as app_settings

    char_limit = app_settings.memory_enrichment_max_chars
    return char_limit
def _facts_extraction_instructions(narrator_label: str) -> str:
    """Build the instruction preamble of the fact-extraction prompt.

    The returned text is written in Chinese. It tells the model to read the
    transcript as recollections of the past, how to name each fact's subject,
    which fields a fact carries, and to reply with a JSON object of the form
    ``{"facts": [...]}``. It ends with a blank line so the transcript text can
    be appended directly.

    Args:
        narrator_label: Name or fixed title to use as ``subject`` for the
            narrator; interpolated verbatim into rule 1.

    Returns:
        The instruction text, terminated by two newlines.
    """
    return (
        "你是回忆录事实抽取助手。用户正在口述人生回忆,所有内容默认是**过去发生的事**"
        "而非当前或未来计划(除非原文明确说「现在」「打算」「准备将要」等)。\n\n"
        "## 抽取规则\n"
        "1. subject 必须用明确的人名或固定称谓:\n"
        # The closing bracket keeps the narrator label delimited like every
        # other quoted term in the prompt.
        f" - 叙述者本人统一用「{narrator_label}」\n"
        # Separator added so the pronoun ban and the pronoun-resolution rule
        # read as two clauses instead of one fused sentence.
        " - 其他人用全名或稳定专名(如「王伟」),禁止用「他」「她」「我」「我们大伙」等代词作 subject;"
        "若代词在上下文中可唯一解析为某人,则 subject 写该人姓名/专名\n"
        "2. 事件、职务变动、地点迁移等一律按**过去回忆**理解;travel/调动/命令类表述勿写成「即将要做」"
        "除非原文明确为未来时态\n"
        # Opening parenthesis restored to balance the existing closing one.
        "3. 若可推断大约年代或人生阶段,将 approximate_era 写入 object_json(与 value 等字段并存),"
        '例如 "1990年代"、"2001年"、"退休后"、"30岁前后"\n'
        "4. fact_type: person|event|relation|place|milestone\n"
        # Field names are separated from their descriptions so the model does
        # not read them as a single token.
        "5. predicate:简短中文谓语如「出生地」「担任职务」「调往」\n"
        "6. object_json:字符串或对象可含 value、approximate_era 等\n"
        "7. confidence 0..1;source_chunk_id 必须等于某段 [chunk_id=...] 中的 id\n\n"
        '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n'
    )
def extract_facts_from_transcript_sync(
llm: Any,
numbered_blocks: str,
*,
narrator_name: str | None = None,
) -> list[dict]:
"""同步:带 chunk_id 标记的文本 → 事实列表。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
narrator_label = (narrator_name or "").strip() or "叙述者"
prompt = _facts_extraction_instructions(narrator_label) + text
try:
raw = invoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_sync",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_sync 解析失败: {}", e)
return []
async def extract_facts_from_transcript_async(
llm: Any,
numbered_blocks: str,
*,
narrator_name: str | None = None,
) -> list[dict]:
"""异步版。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
narrator_label = (narrator_name or "").strip() or "叙述者"
prompt = _facts_extraction_instructions(narrator_label) + text
try:
raw = await ainvoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_async",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_async 解析失败: {}", e)
return []
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
    """Legacy single-chunk entry point: extract facts from one block of text.

    The text is wrapped as a single ``[chunk_id=null]`` block and handed to
    ``extract_facts_from_transcript_async``. The user's nickname, when it can
    be loaded, is used as the narrator label.

    Args:
        chunk_text: Text of one chunk that has no chunk id.
        user_id: Key passed to ``db.get(User, user_id)`` to look up the
            nickname.

    Returns:
        The extracted fact dicts. A ``source_chunk_id`` of ``None``,
        ``"null"`` or ``""`` is normalised to ``None``. Blank ``chunk_text``
        yields ``[]`` without touching the database or the model.
    """
    # The "[chunk_id=null]" header added below makes the block non-empty, which
    # would defeat the empty-input guard in the transcript extractor, so blank
    # input is rejected here before any DB or LLM work is done.
    if not (chunk_text or "").strip():
        return []

    from app.core.db import AsyncSessionLocal
    from app.features.user.models import User

    llm = LlmGateway().langchain_llm_for(
        LlmUseCase("memory.extract_facts.compat", fast=True)
    )

    narrator_name: str | None = None
    try:
        async with AsyncSessionLocal() as db:
            user = await db.get(User, user_id)
            nickname = (user.nickname or "").strip() if user else ""
            if nickname:
                narrator_name = nickname
    except Exception as exc:
        # Best effort: a failed nickname lookup must not block extraction, but
        # it is logged so the failure is visible instead of silently dropped.
        logger.warning("extract_facts 读取叙述者昵称失败: {}", exc)

    blocks = f"[chunk_id=null]\n{chunk_text}"
    facts = await extract_facts_from_transcript_async(
        llm, blocks, narrator_name=narrator_name
    )
    # The model may echo the placeholder id as the string "null" or as an
    # empty string; collapse every variant to a real None.
    for fact in facts:
        if fact.get("source_chunk_id") in (None, "null", ""):
            fact["source_chunk_id"] = None
    return facts