Files
life-echo/api/app/agents/memoir/classification_agent.py
Kevin a3f61fcc0f feat(api+app): 对话阶段化、回忆录流水线与客户端会话体验
- DB: segments 用户输入文本(Alembic 0002)
- Chat: 阶段检测/阶段提示/回复限制,编排与访谈/画像 prompts 调整
- Memoir: 忠实度检查 agent,叙事与分类等链路更新
- Core: agent 日志、Alembic 启动、LangChain/日志/配置等
- Story: time_hints;Memory 检索与相关测试
- Expo: 助手头像、会话页与消息拆分、实时会话与文案/i18n
- Docs/scripts/tests: 迁移脚本、LLM JSON/记忆检索文档、新增单测
2026-03-26 12:13:36 +08:00

158 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ClassificationAgent将内容分类到 8 个章节类别,或判定无价值返回 None。
对应现有逻辑_classify_chapter_category
返回 None 表示本段不进入回忆录 Story/章节流水线;与 StoryRoute 中「可独立讲述的一段人生经历」
(见 prompts.get_story_route_prompt在标准上对齐零散档案点不进 Story记忆与 slot 抽取仍由上游完成。
"""
from __future__ import annotations
import json
import re
from typing import Any, Optional
from app.agents.memoir.prompts import (
CHAPTER_CATEGORIES,
get_chapter_classification_json_prompt,
)
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload
# Module-level logger, named after this module.
logger = get_logger(__name__)
# Length cap that is applied together with the "profile-only sentence" pattern
# below; short text that is clearly a narrative sentence is still left to the LLM.
_FRAGMENT_SHORT_MAX_LEN = 48
# The whole text is nothing but a birth-year / year statement (a scattered
# profile fact, not a story). The pattern is anchored at both ends and allows:
# leading whitespace or zero-width spaces, an optional "我", exactly four
# digits, "年", an optional 出生 / 生的 / 生 suffix, at most one trailing
# punctuation mark, then trailing whitespace or zero-width spaces.
# NOTE(review): the trailing-punctuation class accepts only the characters
# shown in the literal; confirm whether full-width "!" / "?" should be
# accepted as well.
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)
# Very short "我是<place>人" style place-of-origin label with no description of
# events: an optional "我", then "是", 1-10 CJK characters, and a closing 人 or
# 籍, with the same optional trailing punctuation / whitespace as above.
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)
# Keywords for the 5 conversation stages, used as a fallback when the LLM
# classification fails. Do not add generic words that tend to co-occur with
# "year-only" sentences, otherwise a category may be inferred by mistake.
# Insertion order matters: _detect_stage returns the first stage whose
# keywords match.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}
# 5-stage -> default 8-category mapping (fallback when the LLM classification
# fails). Every key of STAGE_KEYWORDS has an entry here.
_STAGE_TO_DEFAULT_CATEGORY = {
    "childhood": "childhood",
    "education": "education",
    "career": "career_early",
    "family": "family",
    "belief": "beliefs",
}
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Return the first 5-stage key whose keywords occur in ``text``.

    Stages are tried in the iteration order of ``STAGE_KEYWORDS``; the text is
    lower-cased before the substring checks. When no keyword matches (or the
    text is empty / None), ``fallback_stage`` is returned unchanged.
    """
    haystack = (text or "").lower()
    matching_stages = (
        stage_name
        for stage_name, stage_words in STAGE_KEYWORDS.items()
        if any(keyword in haystack for keyword in stage_words)
    )
    # Lazily take the first hit so later stages are never scanned needlessly.
    return next(matching_stages, fallback_stage)
def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic for text that is only a profile fact or label.

    Returns True when the stripped text is empty, is just a birth-year / year
    statement, or is a short "I am from <place>" style label; such text is not
    enough to form a Story narrative unit. The original note says this mirrors
    the cases where the chapter-classification prompt should answer "none"
    (the prompt itself is not visible here). False positives are kept low by
    using deliberately narrow regular expressions.
    """
    stripped = (text or "").strip()
    if not stripped or _BIRTH_YEAR_LINE.match(stripped):
        return True
    # The origin-label pattern is only consulted for short inputs.
    within_short_limit = len(stripped) <= _FRAGMENT_SHORT_MAX_LEN
    return bool(within_short_limit and _SHORT_HUKOU_STYLE.match(stripped))
def _normalize_llm_category(raw: str) -> str:
"""去掉模型偶发的引号、反引号包裹。"""
s = (raw or "").strip().lower()
if s.startswith("`"):
s = s.strip("`").strip()
if (s.startswith('"') and s.endswith('"')) or (
s.startswith("'") and s.endswith("'")
):
s = s[1:-1].strip()
return s
def _parse_category_from_llm_response(raw: str) -> str:
    """Extract the category key from a model reply.

    A JSON object of the form ``{"category": "..."}`` is preferred; when the
    reply cannot be decoded that way (or has no ``category`` field), the whole
    reply is treated as a plain-text key. Returns ``""`` for an empty reply.
    """
    reply = (raw or "").strip()
    if not reply:
        return ""
    try:
        payload = extract_json_payload(reply)
        decoded = json.loads(payload)
        has_category = isinstance(decoded, dict) and "category" in decoded
        if has_category:
            return _normalize_llm_category(str(decoded["category"]))
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass, so it is covered here;
        # any decoding problem falls through to the plain-text handling below.
        pass
    return _normalize_llm_category(reply)
class ClassificationAgent:
    """Assign content to one of the 8 chapter categories, or return None when it has no memoir value."""

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
    ) -> Optional[str]:
        """Classify ``text`` into one of the 8 chapter categories.

        Returns None when the heuristic flags the text as a scattered profile
        fact, or when the LLM answers ``none`` (not enough content to stand as
        its own piece). If no LLM is given, the LLM call fails, or the LLM
        answer is not a known category, a keyword-based fallback derived from
        the 5-stage tables is used instead.

        ``llm`` must support a synchronous ``.invoke(prompt)`` call (per the
        original note; the call itself goes through ``invoke_json_object``).
        """
        if _looks_like_fragment_only(text):
            logger.debug(
                "零散档案/极短标签句,跳过回忆录 Story: text_len={} text={}",
                len(text or ""),
                text or "",
            )
            return None

        if not llm:
            return ClassificationAgent._fallback_category(text, fallback_stage)

        try:
            raw_reply = invoke_json_object(
                llm,
                get_chapter_classification_json_prompt(text),
                max_tokens=256,
                agent="ClassificationAgent.classify",
            )
            verdict = _parse_category_from_llm_response(raw_reply)
            if verdict == "none":
                logger.debug(
                    "LLM 判定内容不足以成篇,跳过: text_len={} text={}",
                    len(text or ""),
                    text or "",
                )
                return None
            if verdict in CHAPTER_CATEGORIES:
                return verdict
        except Exception as exc:
            # Best effort: any LLM / parsing failure degrades to the keyword fallback.
            logger.warning("ClassificationAgent LLM 章节分类失败: {}", exc)

        return ClassificationAgent._fallback_category(text, fallback_stage)

    @staticmethod
    def _fallback_category(text: str, fallback_stage: str) -> str:
        """Map the keyword-detected stage (or ``fallback_stage``) to a default category."""
        detected_stage = _detect_stage(text, fallback_stage)
        category_for_fallback = _STAGE_TO_DEFAULT_CATEGORY.get(
            fallback_stage, "childhood"
        )
        return _STAGE_TO_DEFAULT_CATEGORY.get(detected_stage, category_for_fallback)