2026-03-19 10:38:11 +08:00
|
|
|
|
"""
|
|
|
|
|
|
ClassificationAgent:将内容分类到 8 个章节类别,或判定无价值返回 None。
|
|
|
|
|
|
对应现有逻辑:_classify_chapter_category
|
2026-03-23 13:54:41 +08:00
|
|
|
|
|
|
|
|
|
|
返回 None 表示本段不进入回忆录 Story/章节流水线;与 StoryRoute 中「可独立讲述的一段人生经历」
|
|
|
|
|
|
(见 prompts.get_story_route_prompt)在标准上对齐:零散档案点不进 Story,记忆与 slot 抽取仍由上游完成。
|
2026-03-19 10:38:11 +08:00
|
|
|
|
"""
|
2026-03-19 14:36:14 +08:00
|
|
|
|
|
2026-03-19 10:38:11 +08:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
2026-03-26 12:13:36 +08:00
|
|
|
|
import json
|
2026-03-23 13:54:41 +08:00
|
|
|
|
import re
|
2026-03-19 10:38:11 +08:00
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
|
|
2026-03-19 10:54:48 +08:00
|
|
|
|
from app.agents.memoir.prompts import (
|
2026-03-19 10:38:11 +08:00
|
|
|
|
CHAPTER_CATEGORIES,
|
2026-03-26 12:13:36 +08:00
|
|
|
|
get_chapter_classification_json_prompt,
|
2026-03-19 10:38:11 +08:00
|
|
|
|
)
|
2026-03-26 12:13:36 +08:00
|
|
|
|
from app.core.langchain_llm import invoke_json_object
|
2026-03-22 16:45:57 +08:00
|
|
|
|
from app.core.logging import get_logger
|
2026-03-26 12:13:36 +08:00
|
|
|
|
from app.features.memoir.memoir_images.json_payload import extract_json_payload
|
2026-03-19 10:38:11 +08:00
|
|
|
|
|
|
|
|
|
|
# Module-level logger, obtained from the project's logging helper using this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
|
2026-03-23 13:54:41 +08:00
|
|
|
|
# 与「仅档案句式」组合使用;过短但明显为叙事句的仍交 LLM 判断
|
|
|
|
|
|
_FRAGMENT_SHORT_MAX_LEN = 48
|
|
|
|
|
|
|
|
|
|
|
|
# 整段仅为出生年份/年份声明(零散档案,不成故事)
|
|
|
|
|
|
_BIRTH_YEAR_LINE = re.compile(
|
|
|
|
|
|
r"^[\s\u200b]*(?:我)?\d{4}\s*年\s*(出生|生的|生)?\s*[。.!!]?[\s\u200b]*$",
|
|
|
|
|
|
re.UNICODE,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# 极短且为「我是某地人」式籍贯标签,无过程描写
|
|
|
|
|
|
_SHORT_HUKOU_STYLE = re.compile(
|
|
|
|
|
|
r"^[\s\u200b]*(?:我)?是[\u4e00-\u9fff]{1,10}(人|籍)\s*[。.!!]?[\s\u200b]*$",
|
|
|
|
|
|
re.UNICODE,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# Keyword lists for the 5-stage model, used as a fallback when LLM
# classification does not produce a usable category.
# NOTE: keep generic words that tend to co-occur with "year-only" sentences out
# of these lists, otherwise a bare profile fact could be pushed into a category.
# Declaration order matters: stages are probed top to bottom.
STAGE_KEYWORDS = {
    "childhood": ["童年", "小时候", "家乡", "小镇"],
    "education": ["上学", "学校", "老师", "同学", "教育", "大学"],
    "career": ["工作", "职业", "事业", "公司", "同事", "创业"],
    "family": ["伴侣", "孩子", "家庭", "家人", "结婚", "父母"],
    "belief": ["信念", "价值观", "座右铭", "坚持", "原则"],
}
|
|
|
|
|
|
|
|
|
|
|
|
# 5-stage → 默认 8-category 映射(LLM 分类失败时的兜底)
|
|
|
|
|
|
_STAGE_TO_DEFAULT_CATEGORY = {
|
|
|
|
|
|
"childhood": "childhood",
|
|
|
|
|
|
"education": "education",
|
|
|
|
|
|
"career": "career_early",
|
|
|
|
|
|
"family": "family",
|
|
|
|
|
|
"belief": "beliefs",
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _detect_stage(text: str, fallback_stage: str) -> str:
    """Pick the 5-stage label whose keyword list first hits *text*.

    Stages are probed in the declaration order of ``STAGE_KEYWORDS``; the
    first stage that has any keyword occurring as a substring of the
    lower-cased text wins. Empty or ``None`` text matches nothing.

    Args:
        text: The user message to inspect.
        fallback_stage: Value returned unchanged when no keyword matches.

    Returns:
        The matching stage key, or ``fallback_stage``.
    """
    haystack = (text or "").lower()
    return next(
        (
            stage
            for stage, keywords in STAGE_KEYWORDS.items()
            if any(keyword in haystack for keyword in keywords)
        ),
        fallback_stage,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-23 13:54:41 +08:00
|
|
|
|
def _looks_like_fragment_only(text: str) -> bool:
    """Conservative heuristic: is *text* just a profile fact / label sentence?

    Such text is not enough to form a Story narrative unit. Per the original
    note, the cases covered here are meant to agree with the "should return
    none" cases of the chapter-classification prompt (this module imports
    ``get_chapter_classification_json_prompt``); the risk of false positives
    is kept low by using deliberately narrow regular expressions.

    Args:
        text: The user message; ``None`` is treated like an empty string.

    Returns:
        ``True`` when the text is empty (ignoring whitespace and zero-width
        spaces), is solely a birth-year / bare-year statement, or is a short
        origin label; ``False`` otherwise.
    """
    s = (text or "").strip()
    # U+200B is not whitespace for str.strip(), yet both patterns below tolerate
    # it; text made up only of whitespace and zero-width spaces is just as
    # empty and must not be sent on to the LLM.
    if not s.replace("\u200b", "").strip():
        return True
    if _BIRTH_YEAR_LINE.match(s):
        return True
    # The length cap is combined with the label pattern so that only very
    # short label sentences are short-circuited here.
    return len(s) <= _FRAGMENT_SHORT_MAX_LEN and _SHORT_HUKOU_STYLE.match(s) is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_llm_category(raw: str) -> str:
|
|
|
|
|
|
"""去掉模型偶发的引号、反引号包裹。"""
|
|
|
|
|
|
s = (raw or "").strip().lower()
|
|
|
|
|
|
if s.startswith("`"):
|
|
|
|
|
|
s = s.strip("`").strip()
|
|
|
|
|
|
if (s.startswith('"') and s.endswith('"')) or (
|
|
|
|
|
|
s.startswith("'") and s.endswith("'")
|
|
|
|
|
|
):
|
|
|
|
|
|
s = s[1:-1].strip()
|
|
|
|
|
|
return s
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-26 12:13:36 +08:00
|
|
|
|
def _parse_category_from_llm_response(raw: str) -> str:
    """Extract the category key from a model response.

    A JSON object of the form ``{"category": "..."}`` is preferred. When the
    response cannot be decoded as JSON, or decodes to something without a
    ``"category"`` key, the whole response is treated as a plain-text key.

    Args:
        raw: The raw model response; ``None`` / blank yields ``""``.

    Returns:
        The normalized (lower-cased, unwrapped) category string.
    """
    stripped = (raw or "").strip()
    if not stripped:
        return ""

    candidate = stripped
    try:
        decoded = json.loads(extract_json_payload(stripped))
        if isinstance(decoded, dict) and "category" in decoded:
            candidate = str(decoded["category"])
    except (json.JSONDecodeError, TypeError, ValueError):
        # Not JSON: deliberately fall back to the plain-text interpretation.
        pass
    return _normalize_llm_category(candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-19 10:38:11 +08:00
|
|
|
|
class ClassificationAgent:
    """Classify text into one of the 8 chapter categories, or return None when it has no story value."""

    def classify(
        self,
        text: str,
        fallback_stage: str,
        llm: Any,
    ) -> Optional[str]:
        """Classify *text* into one of the chapter categories.

        Decision order:
          1. A heuristic check short-circuits isolated profile facts / label
             sentences and returns ``None``.
          2. If an ``llm`` is supplied, it is asked for a category. ``"none"``
             means the content cannot stand alone as a story and yields
             ``None``; a value found in ``CHAPTER_CATEGORIES`` is returned.
          3. Otherwise (no llm, LLM error, or an unrecognized answer) a
             keyword-based stage is detected and mapped to a default category.

        Args:
            text: The user message to classify.
            fallback_stage: 5-stage label used when no keyword matches.
            llm: Model handle passed to ``invoke_json_object``; a falsy value
                skips the LLM step. The original note states it must support a
                synchronous ``.invoke(prompt)`` call.

        Returns:
            A chapter category key, or ``None`` when the text should not enter
            the memoir Story / chapter pipeline.
        """
        if _looks_like_fragment_only(text):
            logger.debug(
                "零散档案/极短标签句,跳过回忆录 Story: text_len={} text={}",
                len(text or ""),
                text or "",
            )
            return None

        if llm:
            # Best-effort: any failure in the LLM path is logged and the
            # keyword fallback below takes over.
            try:
                prompt = get_chapter_classification_json_prompt(text)
                raw = invoke_json_object(
                    llm,
                    prompt,
                    max_tokens=256,
                    agent="ClassificationAgent.classify",
                )
                category = _parse_category_from_llm_response(raw)
                if category == "none":
                    logger.debug(
                        "LLM 判定内容不足以成篇,跳过: text_len={} text={}",
                        len(text or ""),
                        text or "",
                    )
                    return None
                if category in CHAPTER_CATEGORIES:
                    return category
                # Previously this case fell through silently, which made
                # prompt / model drift impossible to spot in the logs.
                logger.warning(
                    "ClassificationAgent LLM 返回未知类别,改用关键词兜底: {}",
                    category,
                )
            except Exception as e:
                logger.warning("ClassificationAgent LLM 章节分类失败: {}", e)

        stage = _detect_stage(text, fallback_stage)
        # If neither the detected stage nor fallback_stage is a known 5-stage
        # key, the category defaults to "childhood".
        default_category = _STAGE_TO_DEFAULT_CATEGORY.get(fallback_stage, "childhood")
        return _STAGE_TO_DEFAULT_CATEGORY.get(stage, default_category)
|