Files
life-echo/api/app/features/memory/extractor.py
Kevin ac436b87a2 feat(api): 收敛对话与记忆流程边界,引入 LLM 网关与专用服务
- MemoryService 异步路径委托 MemoryIngestService / MemoryRetrievalService;富化派发经 MemoryEnrichmentScheduler
- WebSocket pipeline 经 ChatTurnService 与显式 DTO 编排单轮对话;回忆录片段入队由 MemoirIngestScheduler 封装
- 新增 LlmGateway(LlmUseCase),各 agent、任务与适配器对齐 ports
- 补充 memory 提示适配、runtime 类型、memory-retrieval 文档、ai-touchpoints 说明与扫描脚本及配套测试

Made-with: Cursor
2026-04-30 09:17:01 +08:00

127 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""从 transcript 块中抽取结构化事实LLM + JSON"""
from __future__ import annotations
from typing import Any
from app.core.langchain_llm import ainvoke_json_object, invoke_json_object
from app.core.llm_gateway import LlmGateway, LlmUseCase
from app.core.logging import get_logger
from app.features.memory.llm_schemas import (
FactsExtractionPayload,
facts_payload_to_dicts,
parse_json_payload,
)
logger = get_logger(__name__)
def _max_transcript_chars() -> int:
    """Return the configured character cap for transcript text.

    The value is read from ``settings.memory_enrichment_max_chars`` on every
    call.
    """
    # Imported at call time instead of module import time
    # (presumably to avoid an import cycle or early settings load; confirm).
    from app.core.config import settings

    char_limit = settings.memory_enrichment_max_chars
    return char_limit
def _facts_extraction_instructions(narrator_label: str) -> str:
    """Build the instruction preamble of the fact-extraction prompt.

    The returned text (written in Chinese) tells the model to read the
    transcript as recollection of past events, to use explicit names instead
    of pronouns as ``subject``, and to answer with one JSON object shaped like
    ``{"facts": [...]}``. The chunk-tagged transcript is expected to be
    appended directly after this preamble.

    Args:
        narrator_label: Name interpolated into the rule describing how the
            narrator must be written in ``subject``.

    Returns:
        The instruction text, terminated by a blank line.
    """
    # NOTE: the delimiters below (closing 」 after the narrator label, the
    # opening paren in rule 3, and the separators between a field name and its
    # description) are required for the rules to read as distinct clauses;
    # without them tokens such as "0..1" and "source_chunk_id" fuse together.
    return (
        "你是回忆录事实抽取助手。用户正在口述人生回忆,所有内容默认是**过去发生的事**,"
        "而非当前或未来计划(除非原文明确说「现在」「打算」「准备将要」等)。\n\n"
        "## 抽取规则\n"
        "1. subject 必须用明确的人名或固定称谓:\n"
        f" - 叙述者本人统一用「{narrator_label}」\n"
        " - 其他人用全名或稳定专名(如「王伟」),禁止用「他」「她」「我」「我们大伙」等代词作 subject;"
        "若代词在上下文中可唯一解析为某人,则 subject 写该人姓名/专名\n"
        "2. 事件、职务变动、地点迁移等一律按**过去回忆**理解;travel/调动/命令类表述勿写成「即将要做」,"
        "除非原文明确为未来时态\n"
        "3. 若可推断大约年代或人生阶段,将 approximate_era 写入 object_json(与 value 等字段并存),"
        '例如 "1990年代"、"2001年"、"退休后"、"30岁前后"\n'
        "4. fact_type: person|event|relation|place|milestone\n"
        "5. predicate:简短中文谓语,如「出生地」「担任职务」「调往」\n"
        "6. object_json:字符串或对象,可含 value、approximate_era 等\n"
        "7. confidence 0..1;source_chunk_id 必须等于某段 [chunk_id=...] 中的 id\n\n"
        '只输出 JSON:{"facts":[...]},无事实则 {"facts":[]}。\n\n'
    )
def extract_facts_from_transcript_sync(
llm: Any,
numbered_blocks: str,
*,
narrator_name: str | None = None,
) -> list[dict]:
"""同步:带 chunk_id 标记的文本 → 事实列表。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
narrator_label = (narrator_name or "").strip() or "叙述者"
prompt = _facts_extraction_instructions(narrator_label) + text
try:
raw = invoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_sync",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_sync 解析失败: {}", e)
return []
async def extract_facts_from_transcript_async(
llm: Any,
numbered_blocks: str,
*,
narrator_name: str | None = None,
) -> list[dict]:
"""异步版。"""
if not llm or not (numbered_blocks or "").strip():
return []
text = numbered_blocks.strip()[: _max_transcript_chars()]
narrator_label = (narrator_name or "").strip() or "叙述者"
prompt = _facts_extraction_instructions(narrator_label) + text
try:
raw = await ainvoke_json_object(
llm,
prompt,
max_tokens=4096,
agent="memory.extract_facts_async",
)
parsed = parse_json_payload(raw, FactsExtractionPayload)
if parsed is None:
return []
return facts_payload_to_dicts(parsed)
except (TypeError, ValueError) as e:
logger.warning("extract_facts_from_transcript_async 解析失败: {}", e)
return []
async def extract_facts(chunk_text: str, *, user_id: str) -> list[dict]:
    """Legacy-compatible entry point: extract facts from one untagged chunk.

    The text is wrapped in a ``[chunk_id=null]`` block and handed to
    ``extract_facts_from_transcript_async``. The user's nickname, when one is
    stored, is used as the narrator name; failure to load it is logged and
    extraction proceeds without it.

    Args:
        chunk_text: Raw text of the single chunk.
        user_id: Key used to load the ``User`` row that may carry a nickname.

    Returns:
        The extracted fact dicts. Any ``source_chunk_id`` that is missing,
        ``None``, ``"null"`` or ``""`` is set to ``None``.
    """
    # Function-scope imports, kept as in the original
    # (presumably to avoid import cycles; confirm before hoisting).
    from app.core.db import AsyncSessionLocal
    from app.features.user.models import User

    llm = LlmGateway().langchain_llm_for(
        LlmUseCase("memory.extract_facts.compat", fast=True)
    )

    narrator_name: str | None = None
    try:
        async with AsyncSessionLocal() as db:
            user = await db.get(User, user_id)
            # Read the attribute while the session is still open.
            nickname = ((user.nickname or "") if user else "").strip()
            if nickname:
                narrator_name = nickname
    except Exception as e:
        # The nickname is optional context: keep the best-effort fallback,
        # but surface the failure instead of swallowing it silently.
        logger.warning("extract_facts 读取用户昵称失败: {}", e)

    blocks = f"[chunk_id=null]\n{chunk_text}"
    facts = await extract_facts_from_transcript_async(
        llm, blocks, narrator_name=narrator_name
    )

    # The placeholder id given to the model may come back as the string
    # "null" or be left empty; normalise every such value to a real None.
    for fact in facts:
        if fact.get("source_chunk_id") in (None, "null", ""):
            fact["source_chunk_id"] = None
    return facts