# life-echo/api/app/features/story/image_intent_extractor.py

"""
StoryImageIntentExtractor — 从 story markdown 提取唯一主图意图。

每个 story 必须且仅有一张主插图。提取策略：
1. 最具画面感的场景段落
2. 具有人物 + 动作 + 场景 + 时代细节的段落
3. 故事转折点或记忆锚点段落
4. 若 story 过于抽象，则退化为 story title/stage/time_refs/place_refs/people_refs/summary
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Any


@dataclass
class StoryImageIntentResult:
    """The single primary-image intent extracted for a story."""

    # Short caption: the chosen paragraph truncated to 80 characters plus an
    # ellipsis, or the first 80 characters of the fallback text.
    caption: str
    # Text describing what to depict: up to the first 500 characters of the
    # chosen paragraph, or the whole fallback text.
    prompt_brief: str
    # Location of the chosen paragraph as a dict with "start", "end" and
    # "text_preview" keys; None when the fallback text was used.
    source_span: dict[str, Any] | None
    # Style profile supplied by the caller; passed through unchanged.
    style_profile: str | None

# Vocabulary associated with visual scenes (actions, places, seasons, times of
# day, life stages, people); used by the simple heuristic scoring below.
# Entries are whitespace-separated in the literal and tested as substrings of
# the paragraph, so single-character entries match inside longer words too.
_SCENE_WORDS = frozenset(
    "坐 站 走 跑 看 望 笑 哭 说 听 拿 放 穿 戴 吃 喝 院子 路 巷 房 屋 树 花 山 水 河 桥 街 镇 村 城 夏天 冬天 春天 秋天 早晨 傍晚 夜晚 童年 少年 青年 中年 老人 奶奶 爷爷 父亲 母亲 孩子 朋友 老师 同学".split()
)


def _score_paragraph(text: str) -> float:
    """Return a rough "visual richness" score for one paragraph.

    The score is the sum of two components, each capped at 2.0:

    * length: one point per 100 characters of the stripped text;
    * scene words: 0.3 per ``_SCENE_WORDS`` entry that occurs in the text
      as a substring.

    Empty input, or text shorter than 20 characters after stripping,
    scores 0.0.
    """
    body = text.strip() if text else ""
    if len(body) < 20:
        return 0.0

    length_points = min(len(body) / 100.0, 2.0)
    # Each vocabulary entry counts at most once, however often it appears.
    hits = sum(word in body for word in _SCENE_WORDS)
    scene_points = min(hits * 0.3, 2.0)
    return length_points + scene_points


# Paragraph separator: a newline, optional whitespace, then another newline.
# Allowing whitespace between the newlines treats whitespace-only lines as
# blank and also splits CRLF input, where "\r" sits between the two "\n"s.
_PARAGRAPH_BREAK_RE = re.compile(r"\n\s*\n")

# Minimum heuristic score a paragraph needs to be used as the image source.
_MIN_PARAGRAPH_SCORE = 0.5


def _split_paragraphs(markdown: str) -> list[tuple[str, int, int]]:
    """Split *markdown* into non-empty paragraphs with their offsets.

    Returns ``(text, start, end)`` tuples where ``text`` is the stripped
    paragraph and ``start``/``end`` are character offsets into *markdown*.
    """
    spans: list[tuple[str, int, int]] = []
    cursor = 0
    for chunk in _PARAGRAPH_BREAK_RE.split(markdown):
        text = chunk.strip()
        if not text:
            continue
        # Search from the end of the previous paragraph so that repeated
        # paragraphs map to their own position rather than the first match.
        start = markdown.find(text, cursor)
        end = start + len(text)
        spans.append((text, start, end))
        cursor = end
    return spans


def extract_primary_image_intent(
    markdown: str,
    *,
    title: str = "",
    stage: str | None = None,
    summary: str | None = None,
    people_refs: list[str] | None = None,
    place_refs: list[str] | None = None,
    time_start: str | None = None,
    time_end: str | None = None,
    style_profile: str | None = None,
) -> StoryImageIntentResult:
    """Extract the single primary-image intent from a story's markdown.

    The body is split into paragraphs on blank lines and each paragraph is
    scored with ``_score_paragraph``. The highest-scoring paragraph (the
    earliest one on ties) is used when its score reaches
    ``_MIN_PARAGRAPH_SCORE``; otherwise a fallback text is assembled from
    the story metadata.

    Args:
        markdown: Story body. May be empty or ``None``-like, in which case
            the fallback is used.
        title: Story title, used only in the fallback text.
        stage: Story stage label, used only in the fallback text.
        summary: Story summary; its first 200 characters are used in the
            fallback text.
        people_refs: People mentioned; the first three are used in the
            fallback text.
        place_refs: Places mentioned; the first three are used in the
            fallback text.
        time_start: Start of the story's time range (fallback only).
        time_end: End of the story's time range (fallback only).
        style_profile: Passed through unchanged to the result.

    Returns:
        A ``StoryImageIntentResult``. ``source_span`` holds the chosen
        paragraph's offsets into *markdown* and a 100-character preview, or
        ``None`` when the fallback text was used.
    """
    paragraphs = _split_paragraphs(markdown) if markdown else []

    scored = [
        (_score_paragraph(text), text, start, end)
        for text, start, end in paragraphs
    ]
    # max() returns the first maximal item, so earlier paragraphs win ties.
    best = max(scored, key=lambda item: item[0], default=None)

    if best is not None and best[0] >= _MIN_PARAGRAPH_SCORE:
        _, text, start, end = best
        caption = text if len(text) <= 80 else text[:80] + "…"
        return StoryImageIntentResult(
            caption=caption,
            prompt_brief=text[:500].strip(),
            source_span={"start": start, "end": end, "text_preview": text[:100]},
            style_profile=style_profile,
        )

    # Fallback: story title, stage, time, place, people, summary
    fallback_parts: list[str] = []
    if title:
        fallback_parts.append(title)
    if stage:
        fallback_parts.append(stage)
    if time_start or time_end:
        # strip("-") drops the dangling dash when only one bound is given.
        fallback_parts.append(f"{time_start or ''}-{time_end or ''}".strip("-"))
    if place_refs:
        fallback_parts.extend(place_refs[:3])
    if people_refs:
        fallback_parts.extend(people_refs[:3])
    if summary:
        fallback_parts.append(summary[:200])
    # Generic placeholder when no metadata is available at all.
    fallback_text = "，".join(p for p in fallback_parts if p) or "人生故事"
    return StoryImageIntentResult(
        caption=fallback_text[:80],
        prompt_brief=fallback_text,
        source_span=None,
        style_profile=style_profile,
    )