feat(api): embed memoir chapter images in PDF export and strip placeholders

Made-with: Cursor
2026-03-10 16:06:09 +08:00
parent 879466fde1
commit f5afeb39ef
2 changed files with 124 additions and 47 deletions
--- a/api/services/pdf_service.py
+++ b/api/services/pdf_service.py
@@ -1,57 +1,81 @@
 """
 PDF 生成服务
 """
+import logging
+import re
 from typing import List
-from reportlab.lib.pagesizes import letter, A4
+
+import httpx
+from reportlab.lib.pagesizes import A4
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.units import inch
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Image as ReportLabImage
 from reportlab.pdfbase import pdfmetrics
-from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.pdfbase.cidfonts import UnicodeCIDFont
 from io import BytesIO
-import os
+
+logger = logging.getLogger(__name__)
+
+PLACEHOLDER_RE = re.compile(r"\{\{\{\{IMAGE:.*?\}\}\}\}|\{\{IMAGE:.*?\}\}", re.DOTALL)  # matches {{{{IMAGE:...}}}} or {{IMAGE:...}}; non-greedy, and DOTALL lets the description span newlines
+
+
+def strip_image_placeholders(text: str) -> str:  # remove every PLACEHOLDER_RE match, then trim surrounding whitespace
+    return PLACEHOLDER_RE.sub("", text or "").strip()  # `text or ""` turns None/empty input into ""
+
+
+def split_content_blocks(content: str, images: list[dict]) -> list[dict]:  # split chapter text into ordered {"type": "text"|"image"} blocks
+    blocks: list[dict] = []
+    remaining = content or ""  # a chapter with no content must not crash the `in` / `split` calls below
+    for image in sorted(images or [], key=lambda item: item.get("index", 0)):  # visit images in ascending "index"
+        placeholder = image.get("placeholder")
+        if not placeholder or placeholder not in remaining:
+            continue  # nothing to anchor this image to in the text that is left
+        before, remaining = remaining.split(placeholder, 1)  # cut at the first occurrence only
+        cleaned_before = strip_image_placeholders(before)
+        if cleaned_before:
+            blocks.append({"type": "text", "value": cleaned_before})
+        if image.get("status") == "completed" and image.get("url"):  # unfinished images are dropped; their placeholder is still consumed
+            blocks.append({"type": "image", "url": image["url"]})
+    cleaned_remaining = strip_image_placeholders(remaining)  # also removes placeholders that had no usable image entry
+    if cleaned_remaining:
+        blocks.append({"type": "text", "value": cleaned_remaining})
+    return blocks


 class PDFService:
    """PDF 生成服务"""
-    
+
    def __init__(self):
-        # 尝试注册中文字体
        try:
-            # 使用系统字体或 ReportLab 内置的中文字体
-            # 如果没有中文字体文件，使用 UnicodeCIDFont
            pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))
            self.chinese_font = 'STSong-Light'
        except Exception:
-            # 如果注册失败，使用默认字体（可能不支持中文）
            self.chinese_font = 'Helvetica'
-    
+
+    async def _fetch_image_bytes(self, url: str) -> bytes | None:  # download one image; returns None on any failure
+        try:
+            async with httpx.AsyncClient(timeout=30) as client:  # a new client is created for every call
+                response = await client.get(url)
+                response.raise_for_status()  # HTTP error statuses are handled like any other download failure
+                return response.content
+        except Exception as exc:  # best-effort: log and return None so the caller can skip this image
+            logger.warning(f"PDF 图片下载失败: url={url}, error={exc}")
+            return None
+
    async def generate_pdf(self, book, chapters: List) -> bytes:
-        """
-        生成 PDF
-        
-        Args:
-            book: 回忆录对象
-            chapters: 章节列表
-        
-        Returns:
-            PDF 字节数据
-        """
        buffer = BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=A4)
-        
-        # 创建样式
+
        styles = getSampleStyleSheet()
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            spaceAfter=30,
-            alignment=1,  # 居中
+            alignment=1,
            fontName=self.chinese_font
        )
-        
+
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading1'],
@@ -59,7 +83,7 @@ class PDFService:
            spaceAfter=12,
            fontName=self.chinese_font
        )
-        
+
        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
@@ -67,41 +91,48 @@ class PDFService:
            leading=18,
            fontName=self.chinese_font
        )
-        
-        # 构建内容
+
        story = []
-        
-        # 封面
+
        story.append(Paragraph(book.title, title_style))
-        story.append(Spacer(1, 0.5*inch))
+        story.append(Spacer(1, 0.5 * inch))
        story.append(PageBreak())
-        
-        # 目录
+
        story.append(Paragraph("目录", heading_style))
-        story.append(Spacer(1, 0.2*inch))
+        story.append(Spacer(1, 0.2 * inch))
        for i, chapter in enumerate(chapters, 1):
            story.append(Paragraph(f"{i}. {chapter.title}", normal_style))
        story.append(PageBreak())
-        
-        # 章节内容
+
        for chapter in chapters:
            story.append(Paragraph(chapter.title, heading_style))
-            story.append(Spacer(1, 0.2*inch))
-            
-            # 分段处理内容
-            paragraphs = chapter.content.split('\n\n')
-            for para in paragraphs:
-                if para.strip():
-                    story.append(Paragraph(para.strip(), normal_style))
-                    story.append(Spacer(1, 0.1*inch))
-            
+            story.append(Spacer(1, 0.2 * inch))
+
+            images = getattr(chapter, "images", None) or []
+            blocks = split_content_blocks(chapter.content, images)
+
+            for block in blocks:
+                if block["type"] == "text":
+                    paragraphs = block["value"].split('\n\n')
+                    for para in paragraphs:
+                        if para.strip():
+                            story.append(Paragraph(para.strip(), normal_style))
+                            story.append(Spacer(1, 0.1 * inch))
+                elif block["type"] == "image":
+                    image_bytes = await self._fetch_image_bytes(block["url"])
+                    if image_bytes:
+                        try:
+                            img = ReportLabImage(BytesIO(image_bytes), width=5 * inch, height=3.75 * inch)
+                            story.append(img)
+                            story.append(Spacer(1, 0.2 * inch))
+                        except Exception as exc:
+                            logger.warning(f"PDF 图片嵌入失败: {exc}")
+
            story.append(PageBreak())
-        
-        # 生成 PDF
+
        doc.build(story)
        buffer.seek(0)
        return buffer.read()


-# 全局实例
 pdf_service = PDFService()
--- a/api/tests/test_pdf_service_images.py
+++ b/api/tests/test_pdf_service_images.py
@@ -0,0 +1,46 @@
+import unittest
+from unittest.mock import AsyncMock, patch, MagicMock
+
+from api.services.pdf_service import PDFService
+
+
+class PDFServiceImagesTest(unittest.IsolatedAsyncioTestCase):  # exercises generate_pdf with a chapter that carries one completed image
+    @patch("api.services.pdf_service.httpx.AsyncClient")  # replace the HTTP client class so no real request is made
+    async def test_generate_pdf_embeds_completed_images_and_removes_placeholders(self, async_client_cls):
+        png_bytes = (  # small PNG payload served by the mocked download
+            b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01"
+            b"\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc```\x00\x00"
+            b"\x00\x04\x00\x01\xf6\x178U\x00\x00\x00\x00IEND\xaeB`\x82"
+        )
+        mock_response = MagicMock()
+        mock_response.content = png_bytes
+        mock_response.raise_for_status = MagicMock()  # plain mock: calling it never raises
+
+        mock_client = AsyncMock()
+        mock_client.get.return_value = mock_response  # `await client.get(url)` resolves to mock_response
+        async_client_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client)  # `async with AsyncClient(...)` yields mock_client
+        async_client_cls.return_value.__aexit__ = AsyncMock(return_value=False)  # falsy return: exceptions are not suppressed
+
+        service = PDFService()
+        book = type("BookStub", (), {"title": "我的回忆录"})()  # throwaway stub exposing only `title`
+        chapter = type(  # throwaway stub exposing `title`, `content` and `images`
+            "ChapterStub",
+            (),
+            {
+                "title": "童年的夏天",
+                "content": "那条路我一直记得。\n\n{{{{IMAGE:南方小镇的青石板路}}}}\n\n奶奶常坐在那里。",
+                "images": [
+                    {
+                        "index": 0,
+                        "placeholder": "{{{{IMAGE:南方小镇的青石板路}}}}",
+                        "url": "https://cos.example.com/0.png",
+                        "status": "completed",
+                    }
+                ],
+            },
+        )()
+
+        pdf_bytes = await service.generate_pdf(book, [chapter])
+
+        self.assertGreater(len(pdf_bytes), 100)  # some non-trivial output was produced
+        self.assertNotIn(b"IMAGE:", pdf_bytes)  # NOTE(review): if page streams are compressed this cannot see the text — confirm it really guards placeholder removal