2026-03-27 16:01:28 +08:00
|
|
|
|
"""从 transcript 块中抽取结构化事实(LLM + JSON)。"""
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
|
|
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
|
|
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
|
|
from app.features.memory.llm_schemas import (
|
|
|
|
|
|
FactsExtractionPayload,
|
|
|
|
|
|
facts_payload_to_dicts,
|
|
|
|
|
|
parse_json_payload,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _max_transcript_chars() -> int:
    """Return the configured maximum number of transcript characters.

    The settings object is imported inside the function, so the lookup
    happens at call time rather than when this module is imported.
    """
    from app.core.config import settings

    char_limit = settings.memory_enrichment_max_chars
    return char_limit
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-04-03 11:43:16 +08:00
|
|
|
|
def _facts_extraction_instructions(narrator_label: str) -> str:
|
|
|
|
|
|
return (
|
|
|
|
|
|
"你是回忆录事实抽取助手。用户正在口述人生回忆,所有内容默认是**过去发生的事**,"
|
|
|
|
|
|
"而非当前或未来计划(除非原文明确说「现在」「打算」「准备将要」等)。\n\n"
|
|
|
|
|
|
"## 抽取规则\n"
|
|
|
|
|
|
"1. subject 必须用明确的人名或固定称谓:\n"
|
|
|
|
|
|
f" - 叙述者本人统一用「{narrator_label}」\n"
|
|
|
|
|
|
" - 其他人用全名或稳定专名(如「王伟」),禁止用「他」「她」「我」「我们大伙」等代词作 subject;"
|
|
|
|
|
|
"若代词在上下文中可唯一解析为某人,则 subject 写该人姓名/专名\n"
|
|
|
|
|
|
"2. 事件、职务变动、地点迁移等一律按**过去回忆**理解;travel/调动/命令类表述勿写成「即将要做」"
|
|
|
|
|
|
"除非原文明确为未来时态\n"
|
|
|
|
|
|
"3. 若可推断大约年代或人生阶段,将 approximate_era 写入 object_json(与 value 等字段并存),"
|
|
|
|
|
|
'例如 "1990年代"、"2001年"、"退休后"、"30岁前后"\n'
|
|
|
|
|
|
"4. fact_type: person|event|relation|place|milestone\n"
|
|
|
|
|
|
"5. predicate:简短中文谓语(如「出生地」「担任职务」「调往」)\n"
|
|
|
|
|
|
"6. object_json:字符串或对象;可含 value、approximate_era 等\n"
|
|
|
|
|
|
"7. confidence 0..1;source_chunk_id 必须等于某段 [chunk_id=...] 中的 id\n\n"
|
|
|
|
|
|
'只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n'
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_facts_from_transcript_sync(
    llm: Any,
    numbered_blocks: str,
    *,
    narrator_name: str | None = None,
) -> list[dict]:
    """Synchronously turn chunk-id-tagged transcript text into a list of facts.

    Args:
        llm: Model handle passed through to ``invoke_json_object``. A falsy
            value short-circuits to an empty result.
        numbered_blocks: Transcript text whose segments carry
            ``[chunk_id=...]`` markers. It is stripped and truncated to
            ``_max_transcript_chars()`` characters before being sent.
        narrator_name: Name used for the narrator in the prompt; blank or
            missing values fall back to "叙述者".

    Returns:
        The result of ``facts_payload_to_dicts`` for the parsed payload, or
        an empty list when there is no model, no text, the payload cannot be
        parsed, or a ``TypeError``/``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    transcript = stripped[: _max_transcript_chars()]
    label = (narrator_name or "").strip() or "叙述者"
    prompt = _facts_extraction_instructions(label) + transcript

    try:
        response = invoke_json_object(
            llm,
            prompt,
            max_tokens=4096,
            agent="memory.extract_facts_sync",
        )
        payload = parse_json_payload(response, FactsExtractionPayload)
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as exc:
        # Best-effort extraction: malformed model output yields no facts.
        logger.warning("extract_facts_from_transcript_sync 解析失败: {}", exc)
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def extract_facts_from_transcript_async(
    llm: Any,
    numbered_blocks: str,
    *,
    narrator_name: str | None = None,
) -> list[dict]:
    """Asynchronously turn chunk-id-tagged transcript text into a list of facts.

    Args:
        llm: Model handle passed through to ``ainvoke_json_object``. A falsy
            value short-circuits to an empty result.
        numbered_blocks: Transcript text whose segments carry
            ``[chunk_id=...]`` markers. It is stripped and truncated to
            ``_max_transcript_chars()`` characters before being sent.
        narrator_name: Name used for the narrator in the prompt; blank or
            missing values fall back to "叙述者".

    Returns:
        The result of ``facts_payload_to_dicts`` for the parsed payload, or
        an empty list when there is no model, no text, the payload cannot be
        parsed, or a ``TypeError``/``ValueError`` is raised along the way.
    """
    if not llm:
        return []
    stripped = (numbered_blocks or "").strip()
    if not stripped:
        return []

    transcript = stripped[: _max_transcript_chars()]
    label = (narrator_name or "").strip() or "叙述者"
    prompt = _facts_extraction_instructions(label) + transcript

    try:
        response = await ainvoke_json_object(
            llm,
            prompt,
            max_tokens=4096,
            agent="memory.extract_facts_async",
        )
        payload = parse_json_payload(response, FactsExtractionPayload)
        return [] if payload is None else facts_payload_to_dicts(payload)
    except (TypeError, ValueError) as exc:
        # Best-effort extraction: malformed model output yields no facts.
        logger.warning("extract_facts_from_transcript_async 解析失败: {}", exc)
        return []
|
2026-03-18 17:18:23 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
    """Legacy entry point: extract facts from a single chunk that has no chunk id.

    The text is wrapped in a ``[chunk_id=null]`` block and handed to
    ``extract_facts_from_transcript_async``. The user's nickname, when it can
    be loaded, is used as the narrator name in the prompt.

    Args:
        chunk_text: Raw text of one transcript chunk.
        user_id: Primary key used to load the ``User`` row for the nickname.

    Returns:
        The extracted fact dicts. Any ``source_chunk_id`` that is ``None``,
        ``"null"`` or ``""`` is normalised to ``None``. Empty or
        whitespace-only input returns ``[]`` without touching the database
        or the model.
    """
    # Without this guard the "[chunk_id=null]" header alone would make the
    # prompt non-blank, so the model would be called with nothing to extract.
    if not (chunk_text or "").strip():
        return []

    from app.core.db import AsyncSessionLocal
    from app.core.dependencies import get_llm_provider_fast
    from app.features.user.models import User

    llm = get_llm_provider_fast().langchain_llm

    narrator_name: str | None = None
    try:
        async with AsyncSessionLocal() as db:
            user = await db.get(User, user_id)
            nickname = ((user.nickname if user else None) or "").strip()
            if nickname:
                narrator_name = nickname
    except Exception as exc:
        # The nickname is optional: extraction continues with the default
        # narrator label, but the failure is logged rather than hidden.
        logger.warning("extract_facts 读取用户昵称失败,使用默认叙述者称谓: {}", exc)

    blocks = f"[chunk_id=null]\n{chunk_text}"
    facts = await extract_facts_from_transcript_async(
        llm, blocks, narrator_name=narrator_name
    )

    # The placeholder id may come back as "null" or ""; callers expect None.
    for fact in facts:
        if fact.get("source_chunk_id") in (None, "null", ""):
            fact["source_chunk_id"] = None
    return facts
|