# life-echo/api/app/features/memoir/memoir_images/parser.py

import json
import re
from typing import Any, Optional

from .json_payload import extract_json_payload
from .schema import IMAGE_STATUS_PENDING

# Matches an image placeholder in either of two spellings:
#   {{{{IMAGE:description}}}}   (quadruple braces, captured in group 1)
#   {{IMAGE:description}}       (double braces, captured in group 2)
# The quadruple-brace alternative is listed first so the whole token is
# consumed instead of leaving stray braces around an inner double-brace match.
# The description is matched lazily, and DOTALL lets it span line breaks.
PLACEHOLDER_RE = re.compile(
    r"\{\{\{\{IMAGE:(.*?)\}\}\}\}|\{\{IMAGE:(.*?)\}\}",
    re.DOTALL,
)


def parse_image_placeholders(
    content: str, max_images: Optional[int]
) -> list[dict[str, Any]]:
    """Extract image placeholders from ``content`` in order of appearance.

    Args:
        content: Text to scan. A falsy value (``None`` or ``""``) is treated
            as empty text.
        max_images: Upper bound on the number of placeholders returned.
            ``None`` means no limit; zero or a negative value yields an
            empty list.

    Returns:
        One dict per placeholder with a non-blank description, containing:
        ``index`` (0-based position among the returned items),
        ``description`` (whitespace-stripped text after ``IMAGE:``),
        ``placeholder`` (the exact matched token) and ``start_offset``
        (offset of the token within ``content``).
    """
    # A non-positive cap means "no images". Without this guard the loop below
    # would append one item before it ever consulted the cap.
    if max_images is not None and max_images <= 0:
        return []

    items: list[dict[str, Any]] = []
    for match in PLACEHOLDER_RE.finditer(content or ""):
        # Exactly one of the two capture groups participates in a match.
        description = (match.group(1) or match.group(2) or "").strip()
        if not description:
            # Placeholders with a blank description are ignored and do not
            # count towards ``max_images``.
            continue
        items.append(
            {
                "index": len(items),
                "description": description,
                "placeholder": match.group(0),
                "start_offset": match.start(),
            }
        )
        if max_images is not None and len(items) >= max_images:
            break
    return items


def build_initial_image_assets(
    placeholders: list[dict[str, Any]],
    provider: str,
    style: str,
    size: str,
    now_iso: str,
) -> list[dict[str, Any]]:
    """Create one pending image-asset record per parsed placeholder.

    Args:
        placeholders: Placeholder dicts; each must provide the ``index``,
            ``placeholder`` and ``description`` keys.
        provider: Value stored under ``provider`` on every asset.
        style: Value stored under ``style`` on every asset.
        size: Value stored under ``size`` on every asset.
        now_iso: Timestamp string stored as both ``created_at`` and
            ``updated_at``.

    Returns:
        A new list of asset dicts in the same order as ``placeholders``.
        ``prompt``, ``url`` and ``error`` start as ``None`` and ``status``
        starts as ``IMAGE_STATUS_PENDING``.
    """
    # Fields that are identical for every asset produced by this call.
    shared_fields: dict[str, Any] = {
        "prompt": None,
        "url": None,
        "status": IMAGE_STATUS_PENDING,
        "provider": provider,
        "style": style,
        "size": size,
        "error": None,
        "created_at": now_iso,
        "updated_at": now_iso,
    }

    assets: list[dict[str, Any]] = []
    for entry in placeholders:
        # Per-placeholder fields go first so the key order of each record is
        # index, placeholder, description, then the shared fields.
        asset: dict[str, Any] = {
            "index": entry["index"],
            "placeholder": entry["placeholder"],
            "description": entry["description"],
        }
        asset.update(shared_fields)
        assets.append(asset)
    return assets


def split_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Split text containing ``{{IMAGE:...}}`` placeholders into sections.

    The text is cut at every placeholder recognised by
    ``parse_image_placeholders``.

    Args:
        narrative: Body text with embedded image placeholders.

    Returns:
        A list of dicts, one per text segment, each containing:

        - ``content``: the whitespace-stripped text of the segment, without
          the placeholder that terminates it.
        - ``placeholder_info``: the placeholder dict that follows the
          segment, or ``None`` for the trailing segment (which has no image).

        ``None``, empty or whitespace-only input yields an empty list.
    """
    # The guard must reject the input when it is missing OR blank. The earlier
    # form ``not (narrative or narrative.strip())`` raised on None and let
    # whitespace-only text through as a single empty section.
    if not narrative or not narrative.strip():
        return []

    placeholders = parse_image_placeholders(narrative, max_images=None)
    sections: list[dict[str, Any]] = []
    cursor = 0  # offset just past the previously consumed placeholder
    for info in placeholders:
        placeholder_start = info["start_offset"]
        sections.append(
            {
                "content": narrative[cursor:placeholder_start].strip(),
                "placeholder_info": info,
            }
        )
        cursor = placeholder_start + len(info["placeholder"])

    # The text after the last placeholder (or the whole text when there is
    # none) always forms a final section without an image.
    sections.append(
        {"content": narrative[cursor:].strip(), "placeholder_info": None}
    )
    return sections


def parse_narrative_json(raw: str) -> list[dict[str, Any]]:
    """Parse a narrative that the LLM emitted as JSON.

    The payload is expected to be an object with a ``paragraphs`` list whose
    entries are objects carrying ``content`` and, optionally,
    ``image_description``.

    Args:
        raw: Raw LLM output containing the JSON payload.

    Returns:
        A list with the same structure as ``split_narrative_to_sections``:
        dicts with ``content`` and ``placeholder_info``. An empty list is
        returned when the input is ``None`` or blank, when the payload cannot
        be decoded, or when ``paragraphs`` is missing or not a list.
    """
    # Reject missing OR blank input. The earlier guard
    # ``not (raw or raw.strip())`` raised on None and let blank text through.
    if not raw or not raw.strip():
        return []

    try:
        payload = extract_json_payload(raw)
        data = json.loads(payload)
        paragraphs = data.get("paragraphs") or []
    except (json.JSONDecodeError, TypeError, AttributeError):
        # AttributeError covers a decoded payload that is not a JSON object.
        return []
    if not isinstance(paragraphs, list):
        return []

    result: list[dict[str, Any]] = []
    for i, paragraph in enumerate(paragraphs):
        if not isinstance(paragraph, dict):
            continue
        # Non-string values (e.g. a number or a list) are treated like
        # missing fields instead of raising on ``.strip()``.
        raw_content = paragraph.get("content")
        content = raw_content.strip() if isinstance(raw_content, str) else ""
        raw_desc = paragraph.get("image_description")
        desc = raw_desc.strip() if isinstance(raw_desc, str) else ""

        placeholder_info = None
        if desc:
            placeholder_info = {
                # Synthesised token in the double-brace placeholder form.
                "placeholder": f"{{{{IMAGE:{desc}}}}}",
                "description": desc,
                # Position of the paragraph in the ``paragraphs`` list;
                # skipped non-dict entries still advance this counter.
                "index": i,
                # The JSON form has no source text to point into.
                "start_offset": 0,
            }
        result.append({"content": content, "placeholder_info": placeholder_info})
    return result


def parse_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Parse ``narrative`` into sections, preferring the JSON format.

    Text that looks like a JSON object mentioning ``paragraphs`` is first
    handed to ``parse_narrative_json``. If that yields nothing, or the text
    does not look like JSON, the placeholder-based
    ``split_narrative_to_sections`` is used instead.

    Args:
        narrative: Narrative text, either JSON or plain text with
            ``{{IMAGE:...}}`` placeholders.

    Returns:
        A list with the same structure as ``split_narrative_to_sections``.
        ``None``, empty or whitespace-only input yields an empty list.
    """
    # Strip once and reject missing or blank input. The earlier guard
    # ``not (narrative or narrative.strip())`` raised on None and never
    # caught whitespace-only text.
    stripped = narrative.strip() if narrative else ""
    if not stripped:
        return []

    # Cheap heuristic to avoid attempting JSON parsing on plain prose.
    if stripped.startswith("{") and "paragraphs" in stripped:
        segments = parse_narrative_json(narrative)
        if segments:
            return segments
    return split_narrative_to_sections(narrative)