""" ClassificationAgent:将内容分类到 8 个章节类别之一。 原「LLM 返回 none / 零散档案启发式」不再跳过 Story:统一映射为 ``summary`` 章节, 仍走叙事流水线落库;与 StoryRoute 仍兼容(批次内 new/append 规划不变)。 Memory ingest 由 Celery 任务在批次级先行完成,与分类结果独立。 """ from __future__ import annotations import json import re from dataclasses import dataclass from typing import Any from pydantic import ValidationError from app.agents.memoir.prompts import get_chapter_classification_json_prompt from app.agents.memoir.schemas import ClassificationOutput from app.agents.stage_constants import ( CHAPTER_CATEGORIES, STAGE_KEYWORD_WEIGHTS, STAGE_TO_DEFAULT_CATEGORY, ) from app.core.config import settings from app.core.json_utils import extract_json_payload from app.core.llm_call import LLMCallError, llm_json_call from app.core.logging import get_logger logger = get_logger(__name__) # 模型判定 none 或启发式命中零散档案时,仍写入回忆录正文所用的兜底章节 _SUMMARY_FALLBACK_CATEGORY = "summary" # 与「仅档案句式」组合使用;过短但明显为叙事句的仍交 LLM 判断 _FRAGMENT_SHORT_MAX_LEN = 48 # 整段仅为出生年份/年份声明(零散档案,不成故事) _BIRTH_YEAR_LINE = re.compile( r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$", re.UNICODE, ) # 极短且为「我是某地人」式籍贯标签,无过程描写 _SHORT_HUKOU_STYLE = re.compile( r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$", re.UNICODE, ) def _detect_stage(text: str, fallback_stage: str) -> str: """根据关键词检测消息所属的 5-stage 阶段(与 stage_constants.STAGE_KEYWORD_WEIGHTS 同源;匹配方式为子串,非加权)。""" message = (text or "").lower() for stage, pairs in STAGE_KEYWORD_WEIGHTS.items(): if any(word in message for word, _w in pairs): return stage return fallback_stage def _looks_like_fragment_only(text: str) -> bool: """ 保守启发式:明显为档案点/标签句。 命中时仍进回忆录正文,章节映射为 ``summary``(与 LLM 返回 none 一致)。 """ s = (text or "").strip() if not s: return True if _BIRTH_YEAR_LINE.match(s): return True if len(s) <= _FRAGMENT_SHORT_MAX_LEN and _SHORT_HUKOU_STYLE.match(s): return True return False def _normalize_llm_category(raw: str) -> str: """去掉模型偶发的引号、反引号包裹。""" s = (raw or "").strip().lower() if s.startswith("`"): s = s.strip("`").strip() if (s.startswith('"') and s.endswith('"')) or ( s.startswith("'") and s.endswith("'") ): s = s[1:-1].strip() return s @dataclass(frozen=True) class ChapterClassifyResult: """章节分类结果;``llm_said_none`` 仅当走 LLM 且解析为 none 时为 True(fragment 启发式不为 True)。""" category: str llm_said_none: bool = False def _parse_category_from_llm_response(raw: str) -> str: """优先解析 JSON ``{"category": "..."}``,失败则按纯文本 key 处理。""" s = (raw or "").strip() if not s: return "" try: data = json.loads(extract_json_payload(s)) if isinstance(data, dict) and "category" in data: return _normalize_llm_category(str(data["category"])) except (json.JSONDecodeError, TypeError, ValueError): pass return _normalize_llm_category(s) class ClassificationAgent: """将内容分类到 8 个章节类别之一;none/零散档案映射为 ``summary`` 仍进 Story。""" def classify( self, text: str, fallback_stage: str, llm: Any, *, segment_id: str | None = None, ) -> ChapterClassifyResult: """ 分类到 8 个章节类别之一。 LLM 返回 none 或启发式为零散档案时,``category`` 为 ``summary``(仍可走回忆录流水线; ``llm_said_none`` 仅在 LLM 明确返回 none 时为 True,供空转抑制判断)。 llm 需支持 .invoke(prompt) 同步调用。 """ if _looks_like_fragment_only(text): logger.info( "event=chapter_classification_summary_fallback reason=fragment_heuristic " "segment_id={} text_len={} category={}", segment_id or "", len(text or ""), _SUMMARY_FALLBACK_CATEGORY, ) return ChapterClassifyResult( category=_SUMMARY_FALLBACK_CATEGORY, llm_said_none=False, ) if llm: try: prompt = get_chapter_classification_json_prompt(text) out = llm_json_call( llm, prompt, ClassificationOutput, max_tokens=settings.memoir_classification_max_tokens, agent="ClassificationAgent.classify", ) category = _normalize_llm_category(out.category) if category == "none": logger.info( "event=chapter_classification_summary_fallback reason=llm_none " "segment_id={} text_len={} category={}", segment_id or "", len(text or ""), _SUMMARY_FALLBACK_CATEGORY, ) return ChapterClassifyResult( category=_SUMMARY_FALLBACK_CATEGORY, llm_said_none=True, ) if category in CHAPTER_CATEGORIES: return ChapterClassifyResult(category=category, llm_said_none=False) except (LLMCallError, ValidationError, ValueError, KeyError) as e: logger.warning("ClassificationAgent LLM 章节分类失败: {}", e) stage = _detect_stage(text, fallback_stage) cat = STAGE_TO_DEFAULT_CATEGORY.get( stage, STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood"), ) return ChapterClassifyResult(category=cat, llm_said_none=False)