"""从 transcript 块中抽取结构化事实(LLM + JSON)。""" from __future__ import annotations from typing import Any from app.core.langchain_llm import ainvoke_json_object, invoke_json_object from app.core.logging import get_logger from app.features.memory.llm_schemas import ( FactsExtractionPayload, facts_payload_to_dicts, parse_json_payload, ) logger = get_logger(__name__) def _max_transcript_chars() -> int: from app.core.config import settings return settings.memory_enrichment_max_chars def extract_facts_from_transcript_sync(llm: Any, numbered_blocks: str) -> list[dict]: """同步:带 chunk_id 标记的文本 → 事实列表。""" if not llm or not (numbered_blocks or "").strip(): return [] text = numbered_blocks.strip()[: _max_transcript_chars()] prompt = ( "你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n" "每个事实含 fact_type: person|event|relation|place|milestone;subject;predicate;" "object_json(可为字符串或对象);confidence 0..1;source_chunk_id 必须等于某段的 chunk id。\n" '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n' f"{text}" ) try: raw = invoke_json_object( llm, prompt, max_tokens=4096, agent="memory.extract_facts_sync", ) parsed = parse_json_payload(raw, FactsExtractionPayload) if parsed is None: return [] return facts_payload_to_dicts(parsed) except (TypeError, ValueError) as e: logger.warning("extract_facts_from_transcript_sync 解析失败: {}", e) return [] async def extract_facts_from_transcript_async( llm: Any, numbered_blocks: str ) -> list[dict]: """异步版。""" if not llm or not (numbered_blocks or "").strip(): return [] text = numbered_blocks.strip()[: _max_transcript_chars()] prompt = ( "你是回忆录记忆抽取助手。阅读下列带 [chunk_id=...] 的文本块,抽取可核查的事实。\n" "每个事实含 fact_type: person|event|relation|place|milestone;subject;predicate;" "object_json;confidence 0..1;source_chunk_id 必须等于某段的 chunk id。\n" '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n' f"{text}" ) try: raw = await ainvoke_json_object( llm, prompt, max_tokens=4096, agent="memory.extract_facts_async", ) parsed = parse_json_payload(raw, FactsExtractionPayload) if parsed is None: return [] return facts_payload_to_dicts(parsed) except (TypeError, ValueError) as e: logger.warning("extract_facts_from_transcript_async 解析失败: {}", e) return [] async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]: """兼容旧接口:单块文本(无 chunk id 时传空 source_chunk_id)。""" from app.core.dependencies import get_llm_provider_fast llm = get_llm_provider_fast().langchain_llm blocks = f"[chunk_id=null]\n{chunk_text}" facts = await extract_facts_from_transcript_async(llm, blocks) for f in facts: if f.get("source_chunk_id") in (None, "null", ""): f["source_chunk_id"] = None return facts