"""
ClassificationAgent: classify content into one of the 8 chapter categories,
or return None when the content has no standalone value.

Mirrors the existing logic: ``_classify_chapter_category`` returning None means
the passage does not enter the memoir Story/chapter pipeline. The criterion is
aligned with the StoryRoute notion of "a life experience that can be told on
its own" (see prompts.get_story_route_prompt): scattered profile facts do not
become a Story, while memory and slot extraction are still done upstream.
"""

from __future__ import annotations

import json
import re
from typing import Any, Optional

from app.agents.memoir.prompts import (
    CHAPTER_CATEGORIES,
    get_chapter_classification_json_prompt,
)
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload

logger = get_logger(__name__)

# Used together with the "profile-only sentence" patterns below; a short
# sentence that is clearly narrative is still handed to the LLM.
_FRAGMENT_SHORT_MAX_LEN = 48

# Leading/trailing whitespace and zero-width spaces (U+200B). str.strip() does
# not remove U+200B, so it is trimmed explicitly before the fragment checks.
_EDGE_BLANKS = re.compile(r"\A[\s\u200b]+|[\s\u200b]+\Z", re.UNICODE)

# The whole passage is only a birth-year / year statement
# (a scattered profile fact, not a story).
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$",
    re.UNICODE,
)

# Very short "I am from <place>" style origin label with no narrative detail.
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$",
    re.UNICODE,
)

# 5-stage keywords (fallback when the LLM fails). Do not add generic words
# that tend to co-occur with year-only sentences, to avoid inferring a wrong
# category.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}

# 5-stage -> default 8-category mapping (fallback when LLM classification
# fails).
_STAGE_TO_DEFAULT_CATEGORY = {
    "childhood": "childhood",
    "education": "education",
    "career": "career_early",
    "family": "family",
    "belief": "beliefs",
}


def _detect_stage(text: str, fallback_stage: str) -> str:
    """Return the 5-stage key whose keywords appear in *text*.

    Stages are tried in ``STAGE_KEYWORDS`` insertion order and the first stage
    with any keyword contained in the lower-cased text wins. When nothing
    matches, *fallback_stage* is returned unchanged.
    """
    message = (text or "").lower()
    for stage, keywords in STAGE_KEYWORDS.items():
        if any(word in message for word in keywords):
            return stage
    return fallback_stage


def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic for "profile fact / label" sentences.

    Returns True when *text* is empty (after trimming whitespace and zero-width
    spaces), is only a year / birth-year statement, or is a short
    "I am from <place>" label. Such input is not enough to form a Story
    narrative unit. The risk of false positives is kept low by using narrow
    regular expressions.
    """
    # Trim U+200B together with whitespace: the patterns below already tolerate
    # it at the edges, so input made only of zero-width spaces counts as empty.
    s = _EDGE_BLANKS.sub("", text or "")
    if not s:
        return True
    if _BIRTH_YEAR_LINE.match(s):
        return True
    if len(s) <= _FRAGMENT_SHORT_MAX_LEN and _SHORT_HUKOU_STYLE.match(s):
        return True
    return False


def _normalize_llm_category(raw: str) -> str:
    """Lower-case *raw* and remove backticks or one pair of matching quotes
    that the model occasionally wraps around the category key."""
    s = (raw or "").strip().lower()
    if s.startswith("`"):
        s = s.strip("`").strip()
    if (s.startswith('"') and s.endswith('"')) or (
        s.startswith("'") and s.endswith("'")
    ):
        s = s[1:-1].strip()
    return s


def _parse_category_from_llm_response(raw: Any) -> str:
    """Extract the category key from an LLM response.

    A JSON object of the form ``{"category": "..."}`` is preferred; when the
    response cannot be parsed that way, the whole text is treated as a plain
    category key. Returns an empty string for empty input.
    """
    if isinstance(raw, dict):
        # NOTE(review): the return type of invoke_json_object is not visible
        # here. An already-parsed object is accepted defensively so that a
        # valid answer is not lost to an AttributeError on ``.strip()``.
        if "category" in raw:
            return _normalize_llm_category(str(raw["category"]))
        return ""

    s = (raw or "").strip()
    if not s:
        return ""
    try:
        data = json.loads(extract_json_payload(s))
    except (json.JSONDecodeError, TypeError, ValueError):
        data = None
    if isinstance(data, dict) and "category" in data:
        return _normalize_llm_category(str(data["category"]))
    return _normalize_llm_category(s)


class ClassificationAgent:
    """Classify content into one of the 8 chapter categories, or return None
    when the content is judged to have no standalone value."""

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
    ) -> Optional[str]:
        """Classify *text* into one of the 8 chapter categories.

        Args:
            text: The user content to classify.
            fallback_stage: 5-stage key used by the keyword fallback when no
                keyword matches.
            llm: Model handle passed to ``invoke_json_object``; when falsy the
                LLM step is skipped and only the keyword fallback is used.

        Returns:
            A chapter category key, or None when the heuristic flags the text
            as a scattered profile fact or the LLM answers ``none`` (content
            not sufficient to stand as its own piece).
        """
        if _looks_like_fragment_only(text):
            logger.debug(
                "零散档案/极短标签句,跳过回忆录 Story: text_len={} text={}",
                len(text or ""),
                text or "",
            )
            return None

        if llm:
            # Best-effort: any LLM failure is logged and the keyword fallback
            # below decides the category instead.
            try:
                prompt = get_chapter_classification_json_prompt(text)
                raw = invoke_json_object(
                    llm,
                    prompt,
                    max_tokens=256,
                    agent="ClassificationAgent.classify",
                )
                category = _parse_category_from_llm_response(raw)
                if category == "none":
                    logger.debug(
                        "LLM 判定内容不足以成篇,跳过: text_len={} text={}",
                        len(text or ""),
                        text or "",
                    )
                    return None
                if category in CHAPTER_CATEGORIES:
                    return category
                # Unknown answer: record it instead of silently falling
                # through to the keyword fallback.
                logger.debug(
                    "LLM 返回未知章节类别,改用关键词兜底: category={}",
                    category,
                )
            except Exception as e:
                logger.warning("ClassificationAgent LLM 章节分类失败: {}", e)

        stage = _detect_stage(text, fallback_stage)
        return _STAGE_TO_DEFAULT_CATEGORY.get(
            stage,
            _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood"),
        )