# Source: life-echo/api/app/features/memoir/memoir_images/parser.py
# (web viewer metadata: 138 lines, 4.3 KiB, Python)

import json
import re
from typing import Any
from .json_payload import extract_json_payload
# (blame timestamp left over from the web view: 2026-03-11 11:27:32 +08:00)
from .schema import IMAGE_STATUS_PENDING
# Matches an image placeholder embedded in narrative text. Two spellings are
# accepted: the quadruple-brace form ``{{{{IMAGE:...}}}}`` (tried first, so it
# is not mis-read as a double-brace match) and the double-brace form
# ``{{IMAGE:...}}``. DOTALL lets a description span several lines.
PLACEHOLDER_RE = re.compile(
    r"\{\{\{\{IMAGE:(.*?)\}\}\}\}|\{\{IMAGE:(.*?)\}\}",
    re.DOTALL,
)


def parse_image_placeholders(
    content: str,
    max_images: "int | None",
) -> list[dict[str, Any]]:
    """Extract image placeholders from *content* in order of appearance.

    Args:
        content: Text that may contain ``{{IMAGE:...}}`` placeholders.
            ``None`` or an empty string yields an empty list.
        max_images: Upper bound on the number of placeholders returned.
            ``None`` means no limit; zero or a negative value yields an
            empty list.

    Returns:
        One dict per placeholder with a non-blank description, containing:
        ``index`` (0-based position among the returned items),
        ``description`` (stripped description text),
        ``placeholder`` (the exact matched text) and
        ``start_offset`` (offset of the match inside *content*).
    """
    # Checking the limit only after appending would let a limit of 0 (or a
    # negative one) still return one item, so reject those up front.
    if max_images is not None and max_images <= 0:
        return []

    items: list[dict[str, Any]] = []
    for match in PLACEHOLDER_RE.finditer(content or ""):
        # Exactly one of the two capture groups participates in a match.
        description = (match.group(1) or match.group(2) or "").strip()
        if not description:
            # Placeholders with a blank description are ignored entirely.
            continue
        items.append(
            {
                "index": len(items),
                "description": description,
                "placeholder": match.group(0),
                "start_offset": match.start(),
            }
        )
        if max_images is not None and len(items) >= max_images:
            break
    return items
def build_initial_image_assets(
    placeholders: list[dict[str, Any]],
    provider: str,
    style: str,
    size: str,
    now_iso: str,
) -> list[dict[str, Any]]:
    """Create the initial asset record for every parsed placeholder.

    Args:
        placeholders: Dicts carrying at least ``index``, ``placeholder`` and
            ``description`` keys.
        provider: Value stored under ``provider`` on every record.
        style: Value stored under ``style`` on every record.
        size: Value stored under ``size`` on every record.
        now_iso: Timestamp string used for both ``created_at`` and
            ``updated_at``.

    Returns:
        One record per placeholder, in input order. ``prompt``, ``url`` and
        ``error`` start as ``None`` and ``status`` is ``IMAGE_STATUS_PENDING``.
    """

    def _new_asset(entry: dict[str, Any]) -> dict[str, Any]:
        # Build a fresh dict per placeholder so records never share state.
        asset: dict[str, Any] = {
            "index": entry["index"],
            "placeholder": entry["placeholder"],
            "description": entry["description"],
        }
        asset["prompt"] = None
        asset["url"] = None
        asset["status"] = IMAGE_STATUS_PENDING
        asset["provider"] = provider
        asset["style"] = style
        asset["size"] = size
        asset["error"] = None
        asset["created_at"] = now_iso
        asset["updated_at"] = now_iso
        return asset

    return [_new_asset(entry) for entry in placeholders]
def split_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Split text containing ``{{IMAGE:...}}`` placeholders into sections.

    The narrative is cut at every placeholder reported by
    ``parse_image_placeholders``; each piece of text is paired with the
    placeholder that follows it.

    Args:
        narrative: Body text with embedded image placeholders. ``None``,
            empty and whitespace-only input all yield an empty list.

    Returns:
        A list of dicts, each with:
        - ``content``: the section's text with surrounding whitespace
          stripped and the placeholder itself removed.
        - ``placeholder_info``: the placeholder dict that follows this
          section, or ``None`` for the final section (which has no image).
    """
    # `not (narrative or narrative.strip())` raised AttributeError on None and
    # let whitespace-only text through; test both conditions explicitly.
    if not narrative or not narrative.strip():
        return []

    placeholders = parse_image_placeholders(narrative, max_images=None)
    sections: list[dict[str, Any]] = []
    cursor = 0  # offset in `narrative` where the next section's text begins
    for info in placeholders:
        begin = info["start_offset"]
        sections.append(
            {
                "content": narrative[cursor:begin].strip(),
                "placeholder_info": info,
            }
        )
        # Resume right after the placeholder text so it is not part of any
        # section's content.
        cursor = begin + len(info["placeholder"])

    # Trailing text after the last placeholder (or the whole narrative when
    # there are no placeholders) forms the final, image-less section.
    sections.append(
        {"content": narrative[cursor:].strip(), "placeholder_info": None}
    )
    return sections
def parse_narrative_json(raw: str) -> list[dict[str, Any]]:
    """Parse a narrative that the LLM emitted as JSON.

    The payload is expected to be an object with a ``paragraphs`` list whose
    entries are objects with ``content`` and optional ``image_description``
    fields.

    Args:
        raw: Raw LLM output. ``None``, empty and whitespace-only input yield
            an empty list.

    Returns:
        The same structure as ``split_narrative_to_sections``: a list of
        dicts with ``content`` and ``placeholder_info``. An empty list is
        returned when the payload cannot be decoded or does not have the
        expected shape.
    """
    # `not (raw or raw.strip())` raised AttributeError on None and let
    # whitespace-only text through; test both conditions explicitly.
    if not raw or not raw.strip():
        return []

    try:
        payload = extract_json_payload(raw)
        data = json.loads(payload)
        # A non-dict top level (e.g. a JSON array) has no .get and lands in
        # the AttributeError branch below.
        paragraphs = data.get("paragraphs") or []
    except (json.JSONDecodeError, TypeError, AttributeError):
        return []
    if not isinstance(paragraphs, list):
        return []

    result: list[dict[str, Any]] = []
    for i, p in enumerate(paragraphs):
        if not isinstance(p, dict):
            continue
        # LLM output is untrusted: a non-string field (number, list, ...) is
        # treated as missing instead of crashing on .strip().
        raw_content = p.get("content")
        content = raw_content.strip() if isinstance(raw_content, str) else ""
        raw_desc = p.get("image_description")
        desc = raw_desc.strip() if isinstance(raw_desc, str) else ""

        placeholder_info = None
        if desc:
            placeholder_info = {
                # Renders as the double-brace form {{IMAGE:<desc>}}.
                "placeholder": f"{{{{IMAGE:{desc}}}}}",
                "description": desc,
                # Position of the paragraph in the JSON list (skipped
                # entries included).
                "index": i,
                # No source text to point into for JSON input.
                "start_offset": 0,
            }
        result.append({"content": content, "placeholder_info": placeholder_info})
    return result
def parse_narrative_to_sections(narrative: str) -> list[dict[str, Any]]:
    """Parse a narrative into sections, preferring the JSON format.

    Text that looks like a JSON object mentioning ``paragraphs`` is first
    handed to ``parse_narrative_json``; if that yields nothing, or the text
    does not look like JSON, the placeholder-based splitter is used.

    Args:
        narrative: Raw narrative text. ``None``, empty and whitespace-only
            input yield an empty list.

    Returns:
        The same structure as ``split_narrative_to_sections``.
    """
    # `not (narrative or narrative.strip())` raised AttributeError on None and
    # let whitespace-only text through; strip once and test the result.
    stripped = narrative.strip() if narrative else ""
    if not stripped:
        return []

    # Cheap sniff test before attempting a JSON parse.
    if stripped.startswith("{") and "paragraphs" in stripped:
        segments = parse_narrative_json(narrative)
        if segments:
            return segments
    # Fall back to placeholder parsing when JSON parsing produced nothing.
    return split_narrative_to_sections(narrative)