# api/app/agents/memoir/prompts.py

"""
回忆录整理 Agent 提示词模板
"""

import json
import re
from typing import Optional

# Chapter category key -> Chinese display name of the chapter.
# get_text_rewrite_prompt() resolves names through this mapping and falls
# back to the raw key when a category is not listed here.
CHAPTER_CATEGORIES = {
    "childhood": "童年与成长背景",
    "education": "教育经历与青年时期",
    "career_early": "崭露头角",
    "career_achievement": "主要成就与巅峰时刻",
    "career_challenge": "挫折、挑战与重大转折",
    "family": "家庭与情感",
    "beliefs": "信念与价值观",
    "summary": "人生总结",
}

# Category keys in the same sequence as the keys of CHAPTER_CATEGORIES.
# NOTE(review): not referenced inside this module; presumably the display
# order of chapters used by callers — confirm against the call sites.
CHAPTER_ORDER = [
    "childhood",
    "education",
    "career_early",
    "career_achievement",
    "career_challenge",
    "family",
    "beliefs",
    "summary",
]

# Stage / category key -> integer position (0-7).
# The short stage names "career" and "belief" (the names used in the slot
# reference of get_state_extraction_prompt) share a position with
# "career_early" and "beliefs" respectively.
# NOTE(review): not referenced inside this module; presumably used by
# callers for ordering — confirm against the call sites.
STAGE_TO_ORDER = {
    "childhood": 0,
    "education": 1,
    "career": 2,
    "career_early": 2,
    "career_achievement": 3,
    "career_challenge": 4,
    "family": 5,
    "belief": 6,
    "beliefs": 6,
    "summary": 7,
}

# Fixed image-style description (runtime text, kept in Chinese) that
# inject_image_placeholder_template() places in front of the description
# of every IMAGE placeholder. The adjacent string literals are concatenated
# into one string.
IMAGE_PLACEHOLDER_TEMPLATE = (
    "温暖怀旧风格，年代感复古色调，柔和光影，朴素温馨氛围，安静治愈，低饱和度，"
    "质感柔和细腻，简约构图，充满岁月沉淀感与故事感，高清唯美插画封面，不要包含文字，"
    "要适合老年人审美，画面要真实可信、让老年人产生共鸣与代入感，"
    "场景环境、建筑风格、服饰器物必须严格符合所述时代背景和地域特色，"
    "有朦胧怀旧的年代感。"
)

# Matches an IMAGE placeholder wrapped in one or more "{{" / "}}" pairs,
# e.g. "{{IMAGE: ...}}" or "{{{{IMAGE: ...}}}}".
# Group 2 is the description: a run of characters that are not "}".
# The character class already matches newlines, and the pattern contains no
# ".", so re.DOTALL does not change what is matched here.
_IMAGE_PLACEHOLDER_ANY_BRACES_RE = re.compile(
    r"(\{\{)+IMAGE:\s*([^}]+)(\}\})+",
    re.DOTALL,
)


def inject_image_placeholder_template(content: str) -> str:
    """Prefix every IMAGE placeholder in ``content`` with the fixed style template.

    Each placeholder is rewritten as a four-brace placeholder of the form
    ``{{{{IMAGE:<template>。<description>}}}}``. A description that already
    starts with the template is not prefixed a second time.

    No longer used on the online write path; kept so that offline migration
    scripts can process historical data.

    Args:
        content: Text that may contain IMAGE placeholders.

    Returns:
        The rewritten text. Falsy or whitespace-only input is returned as is.
    """
    if not (content and content.strip()):
        return content

    opening, closing = "{{{{IMAGE:", "}}}}"

    def _rebuild(found: re.Match) -> str:
        description = (found.group(2) or "").strip()
        if not description:
            # Nothing to describe: leave the matched text untouched.
            return found.group(0)
        if description.startswith(IMAGE_PLACEHOLDER_TEMPLATE):
            # Already templated: keep only the part after the template so the
            # template is emitted exactly once.
            remainder = description[len(IMAGE_PLACEHOLDER_TEMPLATE) :]
            description = remainder.lstrip("。").strip()
        suffix = "。" + description if description else ""
        return opening + IMAGE_PLACEHOLDER_TEMPLATE + suffix + closing

    return _IMAGE_PLACEHOLDER_ANY_BRACES_RE.sub(_rebuild, content)


def get_system_prompt() -> str:
    """Return the system prompt for the memoir-organizing agent.

    The prompt is a fixed Chinese text describing the agent's role, the
    content-filtering and rewriting principles, and the chapter
    classification rules. The other prompt builders in this module prepend
    it to their task-specific instructions.
    """
    return """你是一位专业的传记作家和文字编辑，擅长将口语化的对话内容整理成优雅的书面语回忆录章节。

你的任务：
1. 接收对话段落文本（口语化，可能来自语音转写）
2. **先提炼对话中与人生经历相关的核心内容**，过滤掉无关信息
3. 识别内容主题，归类到对应章节（童年/教育/事业/家庭/信念/总结）
4. 将口语化表达改写为书面语，保持原意和情感
5. 生成合适的章节标题和段落结构
6. 提取关键信息，形成连贯的叙述

## 内容筛选原则（最重要）
对话中往往夹杂大量与回忆录无关的噪音，你必须严格筛选，只保留有价值的内容：

应该保留的内容：
- 具体的人生事件、经历、故事
- 提到的人物及其关系（家人、朋友、同事、恩师等）
- 地点、时间、场景描写
- 用户的情感表达、内心感受
- 人生感悟、价值观、信念
- 具体的细节（食物、声音、画面等）

应该过滤掉的内容：
- 语气词、填充词（嗯、啊、那个、就是说、对对对、然后呢等）
- 对话中的寒暄、问候（你好、谢谢、好的等）
- 用户与AI助手之间的交互指令（你帮我、我想问、你说得对等）
- 重复、冗余的表述（取核心含义即可）
- 与个人经历完全无关的闲聊内容

## 改写原则
- 保持用户的真实情感
- 使用优雅但不失亲切的书面语，不要直接引用对话原话
- 适当添加过渡句，使段落连贯
- 保留生动的细节，但将口语表达改写为书面叙述
- 去除口语中的填充词和无意义重复
- 保持时间顺序和逻辑清晰

## 章节分类规则
- 童年相关 → "童年与成长背景"
- 学校、老师、同学 → "教育经历与青年时期"
- 工作、职业、成就 → "主要成就与巅峰时刻" 或 "崭露头角"
- 困难、挫折 → "挫折、挑战与重大转折"
- 伴侣、孩子、家庭生活 → "家庭与情感"
- 价值观、信念、座右铭 → "信念与价值观"
- 总结、感悟、展望 → "人生总结"
"""


def get_chapter_classification_prompt(segments_text: str) -> str:
    """Build the prompt that asks the model to pick one chapter category.

    The system prompt is followed by the list of category keys, the dialogue
    text, and an instruction to answer with a single category key, or
    ``none`` when the dialogue holds no life-experience content.

    Args:
        segments_text: Dialogue text to classify.

    Returns:
        The complete classification prompt.
    """
    system_prompt = get_system_prompt()
    return f"""{system_prompt}

请分析以下对话内容，**忽略其中的语气词、寒暄和无关对话**，只关注涉及人生经历的实质内容，判断应该归类到哪个章节类别：
- childhood: 童年与成长背景
- education: 教育经历与青年时期
- career_early: 崭露头角（早期事业）
- career_achievement: 主要成就与巅峰时刻
- career_challenge: 挫折、挑战与重大转折
- family: 家庭与情感
- beliefs: 信念与价值观
- summary: 人生总结

对话内容：
{segments_text}

请只返回章节类别（如：childhood），不要返回其他内容。
如果对话内容中没有任何与人生经历相关的实质内容，返回 none。"""


def get_text_rewrite_prompt(
    segments_text: str, chapter_category: str, existing_content: str = ""
) -> str:
    """Build the prompt that rewrites spoken dialogue into written chapter text.

    Args:
        segments_text: Dialogue text to rewrite.
        chapter_category: Category key; its display name is looked up in
            CHAPTER_CATEGORIES, and the key itself is used when it is unknown.
        existing_content: Current chapter text. When non-empty it is appended
            to the prompt so the model can merge the new material into it.

    Returns:
        The complete prompt, which requests a JSON object with ``title``,
        ``content`` and ``summary``.
    """
    display_name = CHAPTER_CATEGORIES.get(chapter_category, chapter_category)

    if existing_content:
        prior_block = f"\n\n已有章节内容：\n{existing_content}"
    else:
        prior_block = ""

    system_prompt = get_system_prompt()
    return f"""{system_prompt}

请将以下口语化的对话内容改写为书面语，归类到"{display_name}"章节。

对话内容：
{segments_text}
{prior_block}

请按照以下格式返回 JSON：
{{
    "title": "章节标题",
    "content": "改写后的书面语内容",
    "summary": "章节摘要（50字以内）"
}}

要求：
1. 标题要简洁有力，能概括章节主题
2. 内容要流畅自然，保持原意和情感
3. 如果已有章节内容，请将新内容与已有内容自然融合"""


def get_state_extraction_prompt(
    user_message: str, current_stage: str, stage_slots: dict
) -> str:
    """Build the prompt that extracts structured slots and detects the life stage.

    Args:
        user_message: The user's utterance to analyse.
        current_stage: Stage the system is currently tracking.
        stage_slots: Slots of the current stage; only its keys are shown to
            the model.

    Returns:
        The complete prompt, which requests a JSON object with
        ``detected_stage``, ``slots``, ``emotion`` and ``is_new_chapter``.
    """
    # Reference table of every stage and its slot names, embedded as JSON.
    slots_by_stage = {
        "childhood": ["place", "people", "daily_life", "emotion", "turning_event"],
        "education": ["school", "city", "motivation", "challenge", "change"],
        "career": ["job", "environment", "decision", "pressure", "growth"],
        "family": ["relationship", "conflict", "support", "responsibility", "change"],
        "belief": ["value", "regret", "pride", "lesson"],
    }
    current_slot_keys = [*stage_slots.keys()]
    stage_reference = json.dumps(slots_by_stage, ensure_ascii=False, indent=2)
    system_prompt = get_system_prompt()

    return f"""{system_prompt}

你需要从用户话语中**先提炼与人生经历相关的核心内容**，然后抽取结构化信息，并判断用户实际在谈论哪个人生阶段。

系统当前跟踪的阶段：{current_stage}
该阶段可填 slots：{current_slot_keys}

所有阶段及其 slots 参考：
{stage_reference}

用户话语：
{user_message}

请只返回 JSON，格式如下：
{{
  "detected_stage": "childhood|education|career|family|belief",
  "slots": {{
    "slot_key": "snippet"
  }},
  "emotion": "neutral|warm|low|highlight",
  "is_new_chapter": true
}}

要求：
1. **先忽略话语中的语气词、填充词、寒暄、与AI的交互指令等无关内容**，只关注涉及人生经历的实质信息
2. **detected_stage 必须根据用户话语的实际内容判断**，不要默认沿用系统当前阶段。用户可能在聊不同阶段的事情
3. slots 的 key 必须属于 detected_stage 对应的 slot 列表
4. slots 只填写确实提到的、与人生经历相关的实质内容
5. **snippet 应是提炼后的核心信息**，去除语气词和冗余表达，50 字以内
6. 如果用户话语中没有任何与人生经历相关的实质内容（如纯粹的寒暄、指令、语气词），slots 为空对象
"""


def _build_age_hint(stage: str, birth_year: Optional[int] = None) -> str:
    """根据人生阶段和出生年份推算大致年龄区间"""
    if not birth_year:
        return ""
    stage_age_ranges = {
        "childhood": (0, 12),
        "education": (6, 22),
        "career": (18, 60),
        "career_early": (18, 30),
        "career_achievement": (25, 55),
        "career_challenge": (20, 55),
        "family": (20, 60),
        "belief": (30, 70),
        "beliefs": (30, 70),
        "summary": (50, 80),
    }
    age_range = stage_age_ranges.get(stage)
    if not age_range:
        return ""
    year_start = birth_year + age_range[0]
    year_end = birth_year + age_range[1]
    return f"大约 {year_start}-{year_end} 年（{age_range[0]}-{age_range[1]} 岁）"


def get_creative_title_prompt(
    stage: str,
    emotion: str,
    slots: dict,
    user_profile: str = "",
    birth_year: Optional[int] = None,
) -> str:
    """Build the prompt that asks for one creative chapter title with a time label.

    Args:
        stage: Life stage the chapter belongs to.
        emotion: Emotion label of the chapter.
        slots: Extracted information, embedded using its ``str()`` form.
        user_profile: Basic user information; added only when non-empty.
        birth_year: Year of birth; when it yields an age hint for ``stage``,
            the hint is added as a time reference.

    Returns:
        The complete title-generation prompt.
    """
    system_prompt = get_system_prompt()

    profile_block = ""
    if user_profile:
        profile_block = f"\n用户基本信息：\n{user_profile}"

    time_block = ""
    period_hint = _build_age_hint(stage, birth_year)
    if period_hint:
        time_block = f"\n时间参考：{period_hint}"

    return f"""{system_prompt}

请根据阶段和情绪生成 1 个有创意的章节标题。
阶段：{stage}
情绪：{emotion}
可用信息：{slots}{profile_block}{time_block}

要求：
1. 标题格式：「时间标注 · 标题正文」
   - 时间标注用年龄或年代表示，如"6-12岁"、"1980年代"、"二十出头"
   - 标题正文 12-18 字以内
2. 情绪 + 人生阶段 + 意象
3. 示例风格：
   - 《6-12岁 · 那条巷子尽头的蝉鸣》
   - 《18岁 · 第一次离开家的夏天》
   - 《25-35岁 · 在陌生城市站稳脚跟》
   - 《四十不惑 · 慢下来，人生开始发声》
   - 《1990年代 · 不是所有选择都被理解》

只输出标题文字，不要加引号或书名号。
"""


def get_narrative_prompt(
    stage: str,
    slots: dict,
    new_content: str,
    existing_content: str = "",
    user_profile: str = "",
    birth_year: Optional[int] = None,
    archived_summaries: str = "",
) -> str:
    """Build the prompt that rewrites new dialogue as first-person narrative.

    The model is asked to output only the rewrite of the new dialogue and
    not to repeat text that already exists.

    Args:
        stage: Life stage of the content.
        slots: Extracted information, embedded using its ``str()`` form.
        new_content: Newly captured dialogue to rewrite.
        existing_content: Text already written; only its last 300 characters
            are included, as context for a smooth transition.
        user_profile: Basic user information; added only when non-empty.
        birth_year: Year of birth; when it yields an age hint for ``stage``,
            the hint is added as a time reference.
        archived_summaries: Summaries of deleted chapters of the same
            category, included for reference only when non-empty.

    Returns:
        The complete narrative prompt.
    """
    # Slicing already copes with strings shorter than 300 characters, so no
    # separate length check is needed.
    context_tail = existing_content[-300:] if existing_content else ""
    context_section = (
        f"\n\n【衔接上下文（已有内容的末尾，仅供参考衔接，不要重复）】：\n{context_tail}"
        if context_tail
        else ""
    )
    archived_section = (
        f"\n\n【已删除的该类别历史章节（仅供参考，请勿直接使用或重复）】：\n{archived_summaries}"
        if archived_summaries
        else ""
    )

    profile_section = f"\n\n用户基本信息：\n{user_profile}" if user_profile else ""
    age_hint = _build_age_hint(stage, birth_year)
    time_section = f"\n时间参考：{age_hint}" if age_hint else ""

    # The rules of step two are numbered 1-9 without gaps; the list used to
    # jump from 6 to 8, which read as if a rule were missing.
    return f"""{get_system_prompt()}

请将以下新的对话内容改写为第一人称文学叙述。
阶段：{stage}
可用信息：{slots}{profile_section}{time_section}

新的对话内容：
{new_content}
{context_section}
{archived_section}

## 第一步：提炼核心内容
在改写之前，请先从对话内容中提炼出与人生经历相关的核心信息：
- 提取具体的事件、人物、地点、时间、感受
- 丢弃语气词（嗯、啊、那个、就是说）、寒暄（你好、谢谢）、与AI的交互（你帮我整理一下、对对对你说得对）、无意义的重复
- 如果对话内容中几乎没有与人生经历相关的实质内容，请输出空字符串

## 第二步：改写为叙述
基于提炼后的核心内容进行文学改写：
1. 使用第一人称叙述
2. **不要直接引用对话原话**，将所有内容改写为流畅的书面叙述
3. **只输出新内容的改写结果**，不要重复已有内容
4. 如果有衔接上下文，确保新内容与之自然衔接（语气、时间线连贯）
5. 语气自然，有情绪
6. 如果有用户的基本信息（出生地、成长地等），在叙述中自然融入地域文化和时代背景
7. **不要将对话中的交互性语言（如"我跟你说"、"你知道吗"）写入叙述**
8. **不要在正文中插入章节标题或分类标签**（如"章节：信念与价值观"、"## 童年与成长背景"等），章节标题由系统单独管理
9. **不要使用 Markdown 表格**（不要用 `|` 管道表格）；故事标题由系统单独管理，**不要用 `#`、`##` 在正文里写故事标题**

只输出新对话内容的改写结果。如果对话中没有值得记录的人生经历内容，输出空字符串。
"""


def get_narrative_json_prompt(
    stage: str,
    slots: dict,
    new_content: str,
    existing_content: str = "",
    user_profile: str = "",
    birth_year: Optional[int] = None,
) -> str:
    """Build the prompt that rewrites new dialogue as narrative returned as JSON.

    The requested output is ``{"paragraphs": [{"content": ...}, ...]}``, with
    an empty ``paragraphs`` list when nothing is worth recording.

    Args:
        stage: Life stage of the content.
        slots: Extracted information, embedded using its ``str()`` form.
        new_content: Newly captured dialogue to rewrite.
        existing_content: Text already written; only its last 300 characters
            are included, as context for a smooth transition.
        user_profile: Basic user information; added only when non-empty.
        birth_year: Year of birth; when it yields an age hint for ``stage``,
            the hint is added as a time reference.

    Returns:
        The complete JSON-narrative prompt.
    """
    # Tail of the existing text; slicing also covers strings under 300 chars.
    tail = existing_content[-300:] if existing_content else ""

    context_block = ""
    if tail:
        context_block = (
            f"\n\n【衔接上下文（已有内容的末尾，仅供参考衔接，不要重复）】：\n{tail}"
        )

    profile_block = ""
    if user_profile:
        profile_block = f"\n\n用户基本信息：\n{user_profile}"

    time_block = ""
    period_hint = _build_age_hint(stage, birth_year)
    if period_hint:
        time_block = f"\n时间参考：{period_hint}"

    system_prompt = get_system_prompt()
    return f"""{system_prompt}

请将以下新的对话内容改写为第一人称文学叙述，并输出 **纯 JSON**，不要包含任何其他文字或 markdown 代码块。

阶段：{stage}
可用信息：{slots}{profile_block}{time_block}

新的对话内容：
{new_content}
{context_block}

## 要求
1. 从对话中提炼与人生经历相关的核心内容，过滤语气词、寒暄、与AI的交互
2. 使用第一人称，改写为流畅的书面叙述，不要直接引用对话原话
3. 只输出新内容的改写，不要重复已有内容
4. **本批输入对应一个独立叙事单元**：只围绕同一主题/事件链展开，不要写入与上述对话无关的其他话题或回忆
5. 每 200-300 字左右一个段落
6. 如有衔接上下文，确保新内容与之自然衔接
7. **不要使用 Markdown 表格**（不要用 `|` 管道表格）
8. **不要用 `#`、`##` 写故事或章节标题**；标题由系统管理

## 输出格式（严格 JSON）
{{
  "paragraphs": [
    {{"content": "段落正文"}},
    ...
  ]
}}

- content: 本段纯正文

如果对话中没有值得记录的人生经历内容，输出：{{"paragraphs": []}}
"""


def get_story_route_prompt(
    *,
    chapter_category: str,
    chapter_title: str,
    batch_transcript: str,
    candidate_stories_json: str,
) -> str:
    """Build the prompt that routes one batch to a new or an existing story.

    Per the original note this serves the Celery batch step. The model must
    answer with a single strict JSON object choosing ``new_story`` or
    ``append_story``.

    Args:
        chapter_category: Category of the chapter being written to.
        chapter_title: Title of that chapter.
        batch_transcript: Merged text of the batch's user narration.
        candidate_stories_json: Candidate stories, already serialised; the
            text is embedded unchanged.

    Returns:
        The complete routing prompt.
    """
    # Static instructions that precede the per-call data.
    preamble = """你是回忆录编辑助手。根据本批用户口述与候选故事列表，决定：
- append_story：内容明显延续、补充某一已有故事的主题与时间线，且能对应到具体 candidate id
- new_story：新话题、新人生阶段片段，或与所有候选故事都不够贴合

「故事」在此指：**可独立讲述的一段人生经历**——单一主题或同一事件链；不要假设本批里包含多个互不相关的故事（多段由系统其它步骤处理）。

"""
    # The only part that depends on the arguments.
    inputs = f"""当前章节（写作容器）：
- category: {chapter_category}
- title: {chapter_title}

【本批口述合并文本】
{batch_transcript}

【候选故事】（仅允许在 append 时选择其中的 id；id 必须原样复制）
{candidate_stories_json}

"""
    # Plain (non-f) string, so the JSON braces are written literally.
    output_spec = """## 输出 JSON（仅此一个对象，不要 markdown）
{
  "decision": "new_story" | "append_story",
  "target_story_id": "<uuid 或 null；append 时必填且必须来自候选>",
  "new_story_title": "<短标题，6-20 字；new_story 时必填，append 时可 null>",
  "reason": "<一句中文理由>"
}

规则：
- 若无法自信匹配某一候选，选 new_story
- new_story_title 应概括本批新内容，不要与候选标题重复
"""
    return preamble + inputs + output_spec


def get_story_batch_plan_prompt(
    *,
    chapter_category: str,
    chapter_title: str,
    segments_json: str,
    candidate_stories_json: str,
) -> str:
    """Build the prompt that splits a batch of segments into story write units.

    The segments belong to one chapter category. The model must group them
    into consecutive blocks and decide, per block, between ``new_story`` and
    ``append_story``, answering with a single strict JSON object.

    Args:
        chapter_category: Category of the chapter being written to.
        chapter_title: Title of that chapter.
        segments_json: The batch's segments, already serialised; the text is
            embedded unchanged.
        candidate_stories_json: Candidate stories, already serialised; the
            text is embedded unchanged.

    Returns:
        The complete batch-planning prompt.
    """
    # Static instructions that precede the per-call data.
    preamble = """你是回忆录编辑助手。下面同一章节类别下有一批**按时间顺序**的用户口述片段（每段有 id 与文本）。

## 「故事」定义（必须遵守）
一段「故事」= **可独立讲述的一段人生经历**：单一主题或同一事件链，能单独成篇。若话题切换、时间线跳到另一件事、人物/主线明显变化，应作为**新的故事**（new_story），而不是塞进同一段 append。

## 任务
将本批 segment **划分为连续若干块**（每块包含至少一个 segment，顺序不能打乱；每个 segment 必须恰好属于一块）。对每一块决定：
- **append_story**：内容明显延续、补充**某一已有候选故事**的主题与时间线，且能对应到具体 candidate id
- **new_story**：新话题、与所有候选故事都不够贴合、或应独立成篇的片段

"""
    # The only part that depends on the arguments.
    inputs = f"""当前章节（写作容器）：
- category: {chapter_category}
- title: {chapter_title}

【本批口述片段】（JSON 数组，顺序即口述顺序）
{segments_json}

【候选故事】（仅允许在 append 时选择其中的 id；id 必须原样复制）
{candidate_stories_json}

"""
    # Plain (non-f) string, so the JSON braces are written literally.
    output_spec = """## 输出 JSON（仅此一个对象，不要 markdown）
{
  "units": [
    {
      "segment_ids": ["<按顺序列出本块包含的 segment id>"],
      "decision": "new_story" | "append_story",
      "target_story_id": "<uuid 或 null；append 时必填且必须来自候选>",
      "new_story_title": "<短标题，6-20 字；new_story 时必填，append 时可 null>",
      "reason": "<一句中文理由，可选>"
    }
  ]
}

规则：
- `units` 中所有 `segment_ids` 拼接后，必须**不重不漏**地覆盖本批全部 id，且顺序与【本批口述片段】数组一致
- 若无法自信匹配某一候选，对该块选 new_story
- new_story_title 应概括该块内容，不要与候选标题重复
"""
    return preamble + inputs + output_spec


def format_evidence_chunks_for_prompt(evidence: dict) -> str:
    """Format a retrieve_evidence result as short plain text for narrative prompts.

    At most 10 entries of ``evidence["relevant_chunks"]`` contribute their
    stripped ``content``, and at most 5 entries of
    ``evidence["relevant_facts"]`` contribute one
    ``subject predicate object_json`` line each. Entries may be dicts or
    objects exposing the same names as attributes; both kinds are treated
    alike.

    Args:
        evidence: Mapping with optional ``relevant_chunks`` and
            ``relevant_facts`` lists. A falsy value yields an empty string.

    Returns:
        The collected parts separated by blank lines, or an empty string
        when there is nothing usable.
    """

    def _field(item, name: str):
        # Read ``name`` from either a dict entry or an attribute-style entry.
        if isinstance(item, dict):
            return item.get(name, "")
        return getattr(item, name, "")

    if not evidence:
        return ""

    parts: list[str] = []

    for chunk in (evidence.get("relevant_chunks") or [])[:10]:
        raw = _field(chunk, "content")
        # Strip before testing, so whitespace-only content does not leave an
        # empty part (and stray blank lines) in the output.
        text = str(raw).strip() if raw else ""
        if text:
            parts.append(text)

    for fact in (evidence.get("relevant_facts") or [])[:5]:
        subject = _field(fact, "subject")
        predicate = _field(fact, "predicate")
        if not (subject or predicate):
            # A fact without subject and predicate carries no information.
            continue
        pieces = (subject, predicate, _field(fact, "object_json"))
        # Leave out missing components so the line carries neither trailing
        # spaces nor a literal "None".
        parts.append(
            " ".join(str(piece) for piece in pieces if piece is not None and piece != "")
        )

    return "\n\n".join(parts)