"""
ClassificationAgent: classify content into one of the 8 chapter categories.

The former "LLM returned none / scattered-profile heuristic" cases no longer
skip Story: both are uniformly mapped to the ``summary`` chapter and still go
through the narrative pipeline to be persisted. This stays compatible with
StoryRoute (the in-batch new/append planning is unchanged).

Memory ingest is completed up front at batch level by a Celery task and is
independent of the classification result.
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from typing import Any

from app.agents.memoir.prompts import (
    CHAPTER_CATEGORIES,
    get_chapter_classification_json_prompt,
)
from app.core.langchain_llm import invoke_json_object
from app.core.logging import get_logger
from app.features.memoir.memoir_images.json_payload import extract_json_payload

logger = get_logger(__name__)

# Fallback chapter that still receives the content when the model answers
# "none" or when the heuristic flags the text as a scattered profile fragment.
_SUMMARY_FALLBACK_CATEGORY = "summary"

# Length cap used together with the profile-only sentence patterns; text that
# is short but clearly narrative is still left to the LLM.
_FRAGMENT_SHORT_MAX_LEN = 48

# The whole text is nothing but a birth-year / year statement
# (a scattered profile fact, not a story).
_BIRTH_YEAR_LINE = re.compile(
    r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$",
    re.UNICODE,
)

# Very short "I am from <place>" style origin label with no narrative content.
_SHORT_HUKOU_STYLE = re.compile(
    r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$",
    re.UNICODE,
)

# 5-stage keywords (fallback when the LLM fails). Avoid generic words that tend
# to co-occur with year-only sentences, so the wrong category is not inferred.
# Dict order matters: the first stage with a matching keyword wins.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}

# 5-stage -> default 8-category mapping (fallback when LLM classification fails).
_STAGE_TO_DEFAULT_CATEGORY = {
    "childhood": "childhood",
    "education": "education",
    "career": "career_early",
    "family": "family",
    "belief": "beliefs",
}


def _detect_stage(text: str, fallback_stage: str) -> str:
    """Return the first 5-stage key whose keyword occurs in ``text``.

    The text is lower-cased before the substring search. Stages are tried in
    ``STAGE_KEYWORDS`` order; ``fallback_stage`` is returned when none match.
    """
    lowered = (text or "").lower()
    matching_stages = (
        stage
        for stage, keywords in STAGE_KEYWORDS.items()
        if any(keyword in lowered for keyword in keywords)
    )
    return next(matching_stages, fallback_stage)


def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic: is ``text`` an obvious profile point / label?

    Returns True for empty (or whitespace-only) text, for a bare year /
    birth-year statement, and for a short origin label. A hit still goes into
    the memoir body; the chapter is mapped to ``summary`` (same as when the
    LLM returns none).
    """
    stripped = (text or "").strip()
    if not stripped:
        return True
    if _BIRTH_YEAR_LINE.match(stripped) is not None:
        return True
    # The origin-label pattern only counts when the text is also short.
    is_short = len(stripped) <= _FRAGMENT_SHORT_MAX_LEN
    return is_short and _SHORT_HUKOU_STYLE.match(stripped) is not None


def _normalize_llm_category(raw: str) -> str:
    """Lower-case ``raw`` and remove stray backticks / one pair of quotes
    that the model occasionally wraps around the category key."""
    cleaned = (raw or "").strip().lower()
    if cleaned[:1] == "`":
        cleaned = cleaned.strip("`").strip()
    # Only a single layer of matching quotes is removed.
    for quote in ('"', "'"):
        if cleaned.startswith(quote) and cleaned.endswith(quote):
            return cleaned[1:-1].strip()
    return cleaned


@dataclass(frozen=True)
class ChapterClassifyResult:
    """Chapter classification result.

    ``llm_said_none`` is True only when the LLM path ran and its answer parsed
    to ``none``; the fragment heuristic never sets it.
    """

    # Chapter category key chosen for the content.
    category: str
    # Whether the LLM explicitly answered "none".
    llm_said_none: bool = False


def _parse_category_from_llm_response(raw: str) -> str:
    """Extract the category key from an LLM reply.

    A JSON object of the form ``{"category": "..."}`` is preferred; when the
    reply cannot be decoded that way, the whole reply is treated as a
    plain-text key. An empty reply yields ``""``.
    """
    reply = (raw or "").strip()
    if not reply:
        return ""
    try:
        decoded = json.loads(extract_json_payload(reply))
        if isinstance(decoded, dict) and "category" in decoded:
            return _normalize_llm_category(str(decoded["category"]))
    except (json.JSONDecodeError, TypeError, ValueError):
        # Not usable JSON: fall through to the plain-text interpretation.
        pass
    return _normalize_llm_category(reply)


class ClassificationAgent:
    """Classify content into one of the 8 chapter categories.

    "none" answers and scattered profile fragments are mapped to ``summary``
    and still enter Story.
    """

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
        *,
        segment_id: str | None = None,
    ) -> ChapterClassifyResult:
        """Classify ``text`` into one of the 8 chapter categories.

        Resolution order:

        1. Fragment heuristic hit -> ``summary`` (``llm_said_none`` False).
        2. If ``llm`` is truthy, ask it: an answer of ``none`` -> ``summary``
           with ``llm_said_none`` True (used for idle-run suppression, per the
           original author's note); an answer in ``CHAPTER_CATEGORIES`` is
           returned as is.
        3. Otherwise (no LLM, unrecognized answer, or any exception, which is
           logged as a warning) -> keyword-based stage detection mapped
           through ``_STAGE_TO_DEFAULT_CATEGORY``, defaulting to
           ``childhood``.

        Args:
            text: Content to classify.
            fallback_stage: 5-stage key used when no keyword matches.
            llm: Model handle passed to ``invoke_json_object``; per the
                original note it is expected to support a synchronous
                ``.invoke(prompt)``. Falsy values skip the LLM step.
            segment_id: Optional identifier, used only in log output.

        Returns:
            The resulting ``ChapterClassifyResult``.
        """
        if _looks_like_fragment_only(text):
            logger.info(
                "event=chapter_classification_summary_fallback reason=fragment_heuristic "
                "segment_id={} text_len={} category={}",
                segment_id or "",
                len(text or ""),
                _SUMMARY_FALLBACK_CATEGORY,
            )
            return ChapterClassifyResult(_SUMMARY_FALLBACK_CATEGORY)

        if llm:
            try:
                llm_reply = invoke_json_object(
                    llm,
                    get_chapter_classification_json_prompt(text),
                    max_tokens=256,
                    agent="ClassificationAgent.classify",
                )
                llm_category = _parse_category_from_llm_response(llm_reply)
                if llm_category == "none":
                    logger.info(
                        "event=chapter_classification_summary_fallback reason=llm_none "
                        "segment_id={} text_len={} category={}",
                        segment_id or "",
                        len(text or ""),
                        _SUMMARY_FALLBACK_CATEGORY,
                    )
                    return ChapterClassifyResult(
                        _SUMMARY_FALLBACK_CATEGORY,
                        llm_said_none=True,
                    )
                if llm_category in CHAPTER_CATEGORIES:
                    return ChapterClassifyResult(llm_category)
                # Unrecognized answer: fall through to the keyword fallback.
            except Exception as exc:
                # Best effort: any LLM/parsing failure degrades to keywords.
                logger.warning("ClassificationAgent LLM 章节分类失败: {}", exc)

        detected_stage = _detect_stage(text, fallback_stage)
        default_category = _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood")
        return ChapterClassifyResult(
            _STAGE_TO_DEFAULT_CATEGORY.get(detected_stage, default_category)
        )