# life-echo/api/app/features/memoir/memoir_images/parser.py

import json
import re
from typing import Any

from app.features.memoir.asset_resolver import strip_legacy_image_placeholders

from .json_payload import extract_json_payload
from .schema import IMAGE_STATUS_PENDING

# Matches an IMAGE placeholder in either the quadruple-brace form
# ``{{{{IMAGE:...}}}}`` (captured in group 1) or the double-brace form
# ``{{IMAGE:...}}`` (captured in group 2). The quadruple-brace alternative is
# listed first so it wins when both could match at the same position; DOTALL
# lets a description span multiple lines.
PLACEHOLDER_RE = re.compile(
    r"\{\{\{\{IMAGE:(.*?)\}\}\}\}|\{\{IMAGE:(.*?)\}\}",
    re.DOTALL,
)


def parse_image_placeholders(content: str, max_images: int) -> list[dict[str, Any]]:
    """Parse IMAGE placeholders out of body text (offline migration / debugging).

    Args:
        content: Text to scan. ``None`` or an empty string yields no items.
        max_images: Maximum number of placeholders to return. ``None`` disables
            the limit; zero or a negative value yields an empty list.

    Returns:
        One dict per placeholder with a non-blank description, in order of
        appearance, with the keys ``index`` (position in the returned list),
        ``description`` (whitespace-stripped text), ``placeholder`` (the exact
        matched text) and ``start_offset`` (match start within ``content``).
    """
    # Without this guard the limit would only be checked after the first
    # append, so a limit of 0 (or below) would still return one item.
    if max_images is not None and max_images <= 0:
        return []

    items: list[dict[str, Any]] = []
    for match in PLACEHOLDER_RE.finditer(content or ""):
        # Exactly one of the two capture groups participates in a match.
        description = (match.group(1) or match.group(2) or "").strip()
        if not description:
            # Placeholders with a blank description are ignored and do not
            # consume an index.
            continue
        items.append(
            {
                "index": len(items),
                "description": description,
                "placeholder": match.group(0),
                "start_offset": match.start(),
            }
        )
        if max_images is not None and len(items) >= max_images:
            break
    return items


def build_initial_image_assets(
    placeholders: list[dict[str, Any]],
    provider: str,
    style: str,
    size: str,
    now_iso: str,
) -> list[dict[str, Any]]:
    """Create the initial asset record for every parsed placeholder.

    Each record copies ``index``, ``placeholder`` and ``description`` from the
    placeholder entry, starts with ``prompt``, ``url`` and ``error`` unset,
    carries the pending status, and stamps both ``created_at`` and
    ``updated_at`` with ``now_iso``.

    Args:
        placeholders: Entries providing ``index``, ``placeholder`` and
            ``description`` keys.
        provider: Stored as-is under ``provider`` on every record.
        style: Stored as-is under ``style`` on every record.
        size: Stored as-is under ``size`` on every record.
        now_iso: Timestamp string used for both timestamp fields.

    Returns:
        A list of asset dicts, one per placeholder, in input order.
    """
    assets: list[dict[str, Any]] = []
    for entry in placeholders:
        # Fields carried over from the placeholder itself.
        asset: dict[str, Any] = {
            key: entry[key] for key in ("index", "placeholder", "description")
        }
        # Generation state and request metadata, identical in shape for every
        # asset; keyword order keeps the resulting key order stable.
        asset.update(
            prompt=None,
            url=None,
            status=IMAGE_STATUS_PENDING,
            provider=provider,
            style=style,
            size=size,
            error=None,
            created_at=now_iso,
            updated_at=now_iso,
        )
        assets.append(asset)
    return assets


def parse_narrative_json(raw: str) -> list[dict[str, Any]]:
    """
    Parse an LLM-produced JSON narrative (``paragraphs``) into sections.

    No image placeholders are generated from ``image_description``;
    illustrations are handled separately by the structured story/chapter flow.

    Args:
        raw: Raw model output expected to contain a JSON object with a
            ``paragraphs`` list of objects, each with a string ``content``.

    Returns:
        A list of ``{"content": <stripped text>, "placeholder_info": None}``
        dicts, one per paragraph with non-blank string content. An empty list
        is returned when ``raw`` is blank, the payload cannot be decoded, or
        ``paragraphs`` is missing or not a list.
    """
    if not raw or not str(raw).strip():
        return []
    try:
        payload = extract_json_payload(raw)
        data = json.loads(payload)
        # AttributeError covers a decoded payload that is not a JSON object.
        paragraphs = data.get("paragraphs") or []
    except (json.JSONDecodeError, TypeError, AttributeError):
        return []
    if not isinstance(paragraphs, list):
        return []

    result: list[dict[str, Any]] = []
    for p in paragraphs:
        if not isinstance(p, dict):
            continue
        content = p.get("content")
        # Model output is not guaranteed to be well-typed: a number, list or
        # object here must be skipped rather than crash on ``.strip()``.
        if not isinstance(content, str):
            continue
        content = content.strip()
        if content:
            result.append({"content": content, "placeholder_info": None})
    return result


def split_plain_narrative_into_sections(narrative: str) -> list[dict[str, Any]]:
    """Split a non-JSON narrative into paragraph sections on blank lines.

    Legacy image placeholders are stripped first; no per-paragraph images are
    produced (``placeholder_info`` is always ``None``).

    Args:
        narrative: Plain-text narrative. ``None`` is treated as empty.

    Returns:
        A list of ``{"content": <stripped paragraph>, "placeholder_info": None}``
        dicts, one per non-blank paragraph, in document order.
    """
    text = strip_legacy_image_placeholders(narrative or "")
    if not text.strip():
        return []
    # A paragraph break is a newline, optional whitespace, then another
    # newline. Unlike a literal "\n\n" split this also recognises CRLF line
    # endings ("\r\n\r\n") and "blank" lines that contain only spaces or tabs.
    parts = (part.strip() for part in re.split(r"\n\s*\n", text))
    return [{"content": part, "placeholder_info": None} for part in parts if part]


def parse_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """
    Turn a narrative into a list of sections.

    A narrative that looks like a JSON object mentioning ``paragraphs`` is
    first handed to ``parse_narrative_json``. If that yields nothing, or the
    narrative is not JSON-shaped, placeholders are stripped and the text is
    split into paragraphs by ``split_plain_narrative_into_sections``.
    """
    if not (narrative and str(narrative).strip()):
        return []

    trimmed = narrative.strip()
    looks_like_json = trimmed.startswith("{") and "paragraphs" in trimmed
    json_sections = parse_narrative_json(narrative) if looks_like_json else []
    # An empty JSON result falls through to the plain-text splitter.
    return json_sections or split_plain_narrative_into_sections(narrative)