Files
life-echo/api/app/agents/memoir/classification_agent.py
2026-03-23 13:54:41 +08:00

136 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ClassificationAgent将内容分类到 8 个章节类别,或判定无价值返回 None。
对应现有逻辑_classify_chapter_category
返回 None 表示本段不进入回忆录 Story/章节流水线;与 StoryRoute 中「可独立讲述的一段人生经历」
(见 prompts.get_story_route_prompt在标准上对齐零散档案点不进 Story记忆与 slot 抽取仍由上游完成。
"""
from __future__ import annotations
import re
from typing import Any, Optional
from app.agents.memoir.prompts import (
CHAPTER_CATEGORIES,
get_chapter_classification_prompt,
)
from app.core.logging import get_logger
logger = get_logger(__name__)
# Length cap used together with the "archive-only sentence" pattern below.
# Text that is short but clearly narrative is still handed to the LLM.
_FRAGMENT_SHORT_MAX_LEN = 48
# The whole segment is nothing but a birth-year / year statement
# (a scattered archive fact, not a story). Leading/trailing whitespace and
# zero-width spaces (U+200B) are tolerated by the pattern itself.
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)
# Very short "I am from <place>" style native-place label with no description
# of any process: 1-10 CJK characters followed by 人 or 籍.
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!]?[\s\u200b]*$",
    re.UNICODE,
)
# Keywords for the 5-stage model, used as a fallback when the LLM fails.
# Take care not to add generic words that tend to co-occur with "year-only"
# sentences, otherwise a wrong category may be inferred.
# Dict order matters: _detect_stage returns the first stage whose keyword hits.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}
# 5-stage -> default 8-category mapping (fallback when LLM classification
# fails).
_STAGE_TO_DEFAULT_CATEGORY = {
    "childhood": "childhood",
    "education": "education",
    "career": "career_early",
    "family": "family",
    "belief": "beliefs",
}
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Pick the 5-stage name whose keyword list first matches ``text``.

    Stages are probed in the iteration order of ``STAGE_KEYWORDS``; a stage
    matches when any of its keywords occurs as a substring of the lower-cased
    text. ``fallback_stage`` is returned unchanged when nothing matches
    (including when ``text`` is empty or None).
    """
    haystack = (text or "").lower()
    hits = (
        stage_name
        for stage_name, stage_words in STAGE_KEYWORDS.items()
        if any(keyword in haystack for keyword in stage_words)
    )
    # next() with a default yields the first hit, or the fallback if none.
    return next(hits, fallback_stage)
def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic for archive-style fragments that are not a story.

    Returns True when the stripped text is empty, when it consists solely of a
    year / birth-year statement, or when it is a short "I am from <place>"
    label. The patterns are kept deliberately narrow to limit false positives;
    everything else returns False so it can be judged further downstream.
    """
    stripped = (text or "").strip()
    if stripped == "":
        return True

    if _BIRTH_YEAR_LINE.match(stripped) is not None:
        return True

    # The native-place label only counts when the text is also short.
    within_limit = len(stripped) <= _FRAGMENT_SHORT_MAX_LEN
    return within_limit and _SHORT_HUKOU_STYLE.match(stripped) is not None
def _normalize_llm_category(raw: str) -> str:
"""去掉模型偶发的引号、反引号包裹。"""
s = (raw or "").strip().lower()
if s.startswith("`"):
s = s.strip("`").strip()
if (s.startswith('"') and s.endswith('"')) or (
s.startswith("'") and s.endswith("'")
):
s = s[1:-1].strip()
return s
class ClassificationAgent:
    """Classify content into one of the 8 chapter categories, or return None
    when the content is judged to have no value for the memoir."""

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
    ) -> Optional[str]:
        """Classify ``text`` into a chapter category.

        Decision order:
          1. A heuristic check rejects scattered archive facts / very short
             label sentences (returns None without calling the LLM).
          2. If ``llm`` is truthy it is asked for a category. An answer of
             ``none`` returns None; an answer in ``CHAPTER_CATEGORIES`` is
             returned as-is.
          3. Otherwise (no LLM, LLM error, or unrecognized answer) a keyword
             based stage detection maps to a default category.

        Args:
            text: The content to classify.
            fallback_stage: 5-stage name used when no keyword matches.
            llm: Object supporting a synchronous ``.invoke(prompt)`` call
                whose result exposes ``.content``; a falsy value skips the
                LLM step.

        Returns:
            A chapter category string, or None when the content should not
            enter the memoir Story pipeline.
        """
        if _looks_like_fragment_only(text):
            logger.debug(
                "零散档案/极短标签句,跳过回忆录 Story: text_len=%s text=%s",
                len(text or ""),
                text or "",
            )
            return None

        if llm:
            try:
                prompt = get_chapter_classification_prompt(text)
                response = llm.invoke(prompt)
                category = _normalize_llm_category(response.content or "")
                if category == "none":
                    logger.debug(
                        "LLM 判定内容不足以成篇,跳过: text_len=%s text=%s",
                        len(text or ""),
                        text or "",
                    )
                    return None
                if category in CHAPTER_CATEGORIES:
                    return category
                # Previously this path fell through silently; surface it so
                # prompt/model drift is visible. Truncate to keep logs bounded.
                logger.warning(
                    "ClassificationAgent LLM 返回未知章节类别,改用关键词兜底: %r",
                    category[:80],
                )
            except Exception as e:
                # Best-effort: any LLM failure degrades to the keyword fallback.
                logger.warning("ClassificationAgent LLM 章节分类失败: %s", e)

        # Keyword fallback: detected stage -> default category; if the stage
        # is unmapped, try fallback_stage, and finally default to "childhood".
        stage = _detect_stage(text, fallback_stage)
        return _STAGE_TO_DEFAULT_CATEGORY.get(
            stage,
            _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood"),
        )