Files
life-echo/api/app/features/memoir/memoir_images/parser.py
2026-03-19 14:36:40 +08:00

138 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import re
from typing import Any, Optional

from .json_payload import extract_json_payload
from .schema import IMAGE_STATUS_PENDING
# Matches an image placeholder written with either four braces
# ({{{{IMAGE:...}}}}) or two braces ({{IMAGE:...}}). The four-brace
# alternative is listed first so it wins when both could match at the same
# position. DOTALL lets a description span multiple lines. Exactly one of
# the two capture groups is populated per match.
PLACEHOLDER_RE = re.compile(
    r"\{\{\{\{IMAGE:(.*?)\}\}\}\}|\{\{IMAGE:(.*?)\}\}",
    re.DOTALL,
)


def parse_image_placeholders(
    content: str, max_images: Optional[int]
) -> list[dict[str, Any]]:
    """Extract image placeholders from *content* in order of appearance.

    Placeholders whose description is empty after stripping whitespace are
    ignored and do not consume an index.

    Args:
        content: Text to scan. A falsy value (``None`` or ``""``) is treated
            as empty text.
        max_images: Upper bound on the number of placeholders returned, or
            ``None`` for no limit. Zero or a negative value yields an empty
            list.

    Returns:
        A list of dicts, each with:
        - ``index``: position among the accepted placeholders (0-based)
        - ``description``: the stripped description text
        - ``placeholder``: the exact matched placeholder text
        - ``start_offset``: offset of the match start within *content*
    """
    # The limit check inside the loop runs only after an append, so a
    # non-positive limit has to be rejected up front to return nothing.
    if max_images is not None and max_images <= 0:
        return []

    items: list[dict[str, Any]] = []
    for match in PLACEHOLDER_RE.finditer(content or ""):
        description = (match.group(1) or match.group(2) or "").strip()
        if not description:
            continue
        items.append(
            {
                "index": len(items),
                "description": description,
                "placeholder": match.group(0),
                "start_offset": match.start(),
            }
        )
        if max_images is not None and len(items) >= max_images:
            break
    return items
def build_initial_image_assets(
    placeholders: list[dict[str, Any]],
    provider: str,
    style: str,
    size: str,
    now_iso: str,
) -> list[dict[str, Any]]:
    """Create one pending asset record for every parsed placeholder.

    Args:
        placeholders: Placeholder dicts; each must provide the ``index``,
            ``placeholder`` and ``description`` keys.
        provider: Value stored under ``provider`` in every record.
        style: Value stored under ``style`` in every record.
        size: Value stored under ``size`` in every record.
        now_iso: Timestamp string stored as both ``created_at`` and
            ``updated_at``.

    Returns:
        A new list of asset dicts, in the same order as *placeholders*.
        ``prompt``, ``url`` and ``error`` start as ``None`` and ``status``
        starts as ``IMAGE_STATUS_PENDING``.
    """
    assets: list[dict[str, Any]] = []
    for entry in placeholders:
        record: dict[str, Any] = {
            "index": entry["index"],
            "placeholder": entry["placeholder"],
            "description": entry["description"],
            "prompt": None,
            "url": None,
            "status": IMAGE_STATUS_PENDING,
            "provider": provider,
            "style": style,
            "size": size,
            "error": None,
            "created_at": now_iso,
            "updated_at": now_iso,
        }
        assets.append(record)
    return assets
def split_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Split text containing {{IMAGE:...}} placeholders into sections.

    The text is cut at every placeholder accepted by
    ``parse_image_placeholders``. Each cut produces one section holding the
    text that precedes the placeholder, and a final section holds whatever
    follows the last placeholder.

    Args:
        narrative: Body text to split. ``None``, empty and whitespace-only
            input all produce an empty list.

    Returns:
        A list of dicts, each with:
        - ``content``: the stripped plain text of the section (placeholder
          text removed; may be an empty string)
        - ``placeholder_info``: the placeholder dict that follows this
          section, or ``None`` for the final section, which has no image
    """
    # `not (narrative or narrative.strip())` would raise on None and would
    # let whitespace-only text through, so both conditions are tested
    # separately.
    if not narrative or not narrative.strip():
        return []

    placeholders = parse_image_placeholders(narrative, max_images=None)
    sections: list[dict[str, Any]] = []
    cursor = 0  # offset just past the previously consumed placeholder
    for info in placeholders:
        begin = info["start_offset"]
        sections.append(
            {
                "content": narrative[cursor:begin].strip(),
                "placeholder_info": info,
            }
        )
        cursor = begin + len(info["placeholder"])

    # Trailing text after the last placeholder (or the whole text when there
    # are no placeholders) always forms a final image-less section.
    sections.append(
        {"content": narrative[cursor:].strip(), "placeholder_info": None}
    )
    return sections
def parse_narrative_json(raw: str) -> list[dict[str, Any]]:
    """Parse a narrative that the LLM emitted as JSON.

    The payload is expected to be an object with a ``paragraphs`` list whose
    dict entries may carry ``content`` and ``image_description`` keys.
    Entries that are not dicts are skipped.

    Args:
        raw: Raw LLM output. ``None``, empty and whitespace-only input all
            produce an empty list.

    Returns:
        The same structure as ``split_narrative_to_sections``: a list of
        dicts with ``content`` and ``placeholder_info``. An empty list is
        returned when the payload cannot be decoded or has no usable
        ``paragraphs`` list.
    """
    # `not (raw or raw.strip())` would raise on None and would let
    # whitespace-only text reach the JSON extractor, so both conditions are
    # tested separately.
    if not raw or not raw.strip():
        return []

    try:
        payload = extract_json_payload(raw)
        data = json.loads(payload)
        # AttributeError covers a top-level JSON value that is not an object.
        paragraphs = data.get("paragraphs") or []
    except (json.JSONDecodeError, TypeError, AttributeError):
        return []
    if not isinstance(paragraphs, list):
        return []

    result: list[dict[str, Any]] = []
    for i, p in enumerate(paragraphs):
        if not isinstance(p, dict):
            continue
        # NOTE(review): a truthy non-string value for either key would raise
        # AttributeError here, outside the try block. Confirm whether the
        # upstream prompt guarantees string values.
        content = (p.get("content") or "").strip()
        desc = (p.get("image_description") or "").strip()
        placeholder_info = None
        if desc:
            placeholder_info = {
                "placeholder": f"{{{{IMAGE:{desc}}}}}",
                "description": desc,
                # Index within the original paragraphs list, so skipped
                # non-dict entries leave gaps.
                "index": i,
                "start_offset": 0,
            }
        result.append({"content": content, "placeholder_info": placeholder_info})
    return result
def parse_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Parse a narrative into sections, preferring the JSON format.

    Text that starts with ``{`` and mentions ``paragraphs`` is first handed
    to ``parse_narrative_json``. If that yields nothing, or the text does not
    look like JSON, the text is split on image placeholders instead.

    Args:
        narrative: Narrative text. ``None``, empty and whitespace-only input
            all produce an empty list.

    Returns:
        The same structure as ``split_narrative_to_sections``.
    """
    # `not (narrative or narrative.strip())` would raise on None and would
    # let whitespace-only text through, so strip once and test the result.
    stripped = (narrative or "").strip()
    if not stripped:
        return []

    looks_like_json = stripped.startswith("{") and "paragraphs" in stripped
    if looks_like_json:
        segments = parse_narrative_json(narrative)
        if segments:
            return segments
    # Fallback: treat the text as prose with inline image placeholders.
    return split_narrative_to_sections(narrative)