Files
life-echo/api/app/agents/memoir/classification_agent.py
Kevin e4bf0710c7 feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路
数据库
- 新增迁移 0003:timeline_events.memory_source_id 外键 → memory_sources,便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化(摘要/事实/时间线),可配置开关与最大字符数
- 新增证据包组装:合并 chunk、摘要、事实、时间线、故事等检索结果;支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展;文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG;分段 ASR 日志与空音频处理;转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符,与分段逻辑一致

后端 - Agent
- reply_limits:按 [SPLIT] 与段落拆段,并保证非空 fallback,供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id;任务成功结?
2026-03-27 16:24:43 +08:00

167 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
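ClassificationAgent: classify content into one of the 8 chapter categories.

Cases that used to skip Story ("LLM returns none" / the scattered-profile
heuristic) are now uniformly mapped to the ``summary`` chapter and still go
through the narrative pipeline to be persisted. This stays compatible with
StoryRoute (the new/append planning inside a batch is unchanged).
Memory ingest is completed beforehand, at batch level, by a Celery task and is
independent of the classification result.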
"""
from __future__ import annotations
import json
import re
from typing import Any
from app.agents.memoir.prompts import (
CHAPTER_CATEGORIES,
get_chapter_classification_json_prompt,
)
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Fallback chapter returned by ClassificationAgent.classify when the model
# answers "none" or the scattered-profile heuristic fires, so the text is
# still written into the memoir body.
_SUMMARY_FALLBACK_CATEGORY = "summary"

# Length cap applied together with the "profile-only sentence" pattern
# (_SHORT_HUKOU_STYLE); text that is short but clearly narrative is still left
# to the LLM.
_FRAGMENT_SHORT_MAX_LEN = 48

# The whole text is only a birth-year / year statement (a scattered profile
# fact, not a story): optional "我", four digits, "年", an optional
# "出生" / "生的" / "生", and at most one closing punctuation mark. Leading and
# trailing whitespace or zero-width spaces (U+200B) are tolerated.
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)

# Very short "I am from <place>" style origin label with no narrative detail:
# optional "我", then "是", 1-10 CJK characters, and a closing "人" or "籍".
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)
# Keywords for each of the 5 life stages, used as the fallback when LLM
# classification fails. Keep generic words that tend to co-occur with
# "year-only" sentences out of these lists so they do not wrongly imply a
# category. Dict order matters: _detect_stage returns the first stage that hits.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}

# 5-stage -> default 8-category mapping (fallback when LLM classification
# fails). Every key of STAGE_KEYWORDS has an entry here.
_STAGE_TO_DEFAULT_CATEGORY = {
    "childhood": "childhood",
    "education": "education",
    "career": "career_early",
    "family": "family",
    "belief": "beliefs",
}
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Guess the 5-stage life phase of a message from keyword hits.

    Args:
        text: Message text; ``None`` or empty is treated as an empty string.
        fallback_stage: Value returned unchanged when no keyword matches.

    Returns:
        The first stage (in ``STAGE_KEYWORDS`` declaration order) that has at
        least one keyword contained in the lower-cased text, otherwise
        ``fallback_stage``.
    """
    haystack = (text or "").lower()
    matching_stages = (
        stage_name
        for stage_name, stage_words in STAGE_KEYWORDS.items()
        if any(keyword in haystack for keyword in stage_words)
    )
    # next() stops at the first hit, mirroring an early return from a loop.
    return next(matching_stages, fallback_stage)
def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic: is the text obviously a profile fact / label?

    A hit does not drop the text; the caller maps it to the ``summary`` chapter
    (the same outcome as the LLM answering ``none``).

    Args:
        text: Raw segment text; ``None`` is treated as empty.

    Returns:
        ``True`` when the stripped text is empty, is only a birth-year / year
        statement, or is a short "I am from <place>" label; ``False`` otherwise.
    """
    candidate = (text or "").strip()
    if not candidate:
        # Nothing to narrate at all.
        return True
    if _BIRTH_YEAR_LINE.match(candidate) is not None:
        return True
    # The origin-label pattern only counts when the text is also short.
    within_cap = len(candidate) <= _FRAGMENT_SHORT_MAX_LEN
    return within_cap and _SHORT_HUKOU_STYLE.match(candidate) is not None
def _normalize_llm_category(raw: str) -> str:
"""去掉模型偶发的引号、反引号包裹。"""
s = (raw or "").strip().lower()
if s.startswith("`"):
s = s.strip("`").strip()
if (s.startswith('"') and s.endswith('"')) or (
s.startswith("'") and s.endswith("'")
):
s = s[1:-1].strip()
return s
def _parse_category_from_llm_response(raw: str) -> str:
    """Extract the category key from the model's reply.

    A JSON object of the form ``{"category": "..."}`` is preferred. When the
    payload cannot be parsed, or is not a dict carrying a ``category`` key,
    the whole reply is treated as a plain-text key instead.

    Args:
        raw: Model reply; ``None`` or blank yields ``""``.

    Returns:
        The normalised category key, or ``""`` for an empty reply.
    """
    reply = (raw or "").strip()
    if not reply:
        return ""

    try:
        payload = json.loads(extract_json_payload(reply))
    except (json.JSONDecodeError, TypeError, ValueError):
        # Not JSON: fall back to interpreting the reply as the bare key.
        return _normalize_llm_category(reply)

    has_category = isinstance(payload, dict) and "category" in payload
    category_source = str(payload["category"]) if has_category else reply
    return _normalize_llm_category(category_source)
class ClassificationAgent:
    """Assign memoir content to one of the chapter categories.

    The original documentation states there are 8 categories (the members of
    ``CHAPTER_CATEGORIES``). Text the model labels ``none`` and text caught by
    the scattered-profile heuristic is mapped to ``summary`` and still enters
    Story.
    """

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
        *,
        segment_id: str | None = None,
    ) -> str:
        """Classify ``text`` into a chapter category.

        Resolution order:
          1. the scattered-profile heuristic -> ``summary``;
          2. the LLM answer, when ``llm`` is truthy (``none`` -> ``summary``,
             a member of ``CHAPTER_CATEGORIES`` -> that member);
          3. keyword-based stage detection mapped to a default category.

        Args:
            text: Segment text to classify.
            fallback_stage: 5-stage name used when no keyword matches.
            llm: Model handle passed to ``invoke_json_object``; per the
                original note it must support a synchronous ``.invoke(prompt)``.
            segment_id: Optional identifier, used only in log lines.

        Returns:
            The chosen category key. LLM failures are logged and never raised.
        """
        if _looks_like_fragment_only(text):
            logger.info(
                "event=chapter_classification_summary_fallback reason=fragment_heuristic "
                "segment_id={} text_len={} category={}",
                segment_id or "",
                len(text or ""),
                _SUMMARY_FALLBACK_CATEGORY,
            )
            return _SUMMARY_FALLBACK_CATEGORY

        llm_category = (
            ClassificationAgent._classify_with_llm(text, llm, segment_id)
            if llm
            else None
        )
        if llm_category is not None:
            return llm_category

        # Keyword fallback: map the detected stage, else the caller's stage,
        # else "childhood".
        detected_stage = _detect_stage(text, fallback_stage)
        default_category = _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood")
        return _STAGE_TO_DEFAULT_CATEGORY.get(detected_stage, default_category)

    @staticmethod
    def _classify_with_llm(
        text: str,
        llm: Any,
        segment_id: str | None,
    ) -> str | None:
        """Ask the model for a category; return ``None`` when it gives no usable one.

        Any exception raised while prompting, invoking or parsing is logged as
        a warning and reported as ``None`` so the caller can fall back.
        """
        try:
            raw_reply = invoke_json_object(
                llm,
                get_chapter_classification_json_prompt(text),
                max_tokens=256,
                agent="ClassificationAgent.classify",
            )
            category = _parse_category_from_llm_response(raw_reply)
            if category == "none":
                logger.info(
                    "event=chapter_classification_summary_fallback reason=llm_none "
                    "segment_id={} text_len={} category={}",
                    segment_id or "",
                    len(text or ""),
                    _SUMMARY_FALLBACK_CATEGORY,
                )
                return _SUMMARY_FALLBACK_CATEGORY
            if category in CHAPTER_CATEGORIES:
                return category
        except Exception as e:
            logger.warning("ClassificationAgent LLM 章节分类失败: {}", e)
        return None