api/scripts/copyright_source_pdf.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
软著申报用：源码整理生成 PDF（reportlab）

默认扫描整个 monorepo 根目录（本脚本位于 api/scripts/，向上两级即为仓库根），
会包含后端 api、Expo 客户端 app-expo、评测 Web app-eval-web 等；也可用 --root 只扫某一子项目。

使用说明（简要）：
1) 安装依赖：在 api 目录执行 uv sync（本仓库已含 reportlab）
2) 中文字体：可设置 CJK_MONO_FONT_PATH；默认可用 STSong-Light（Adobe CID）。苹方/冬青等常为 CFF，
   ReportLab 无法直接嵌入。使用 STSong-Light 时，ASCII（含空格、英文代码）用内置 Courier 分段绘制，
   中文用 STSong，避免整行用 CID 时拉丁字距错位、挤在一起。
   需要退回纯 TrueType 单字体（Menlo 等）时用 --no-cid。
3) 修改下方「配置区」常量：软件全称、版本号、必要时改 SOURCE_ROOT / 输出路径、字体路径、后缀与跳过目录
4) 运行（全仓默认，仅 .py / .ts(x) / .js(x) / .vue 等源码）：cd api && uv run python scripts/copyright_source_pdf.py
   仅后端示例：... --root ../api

注意：
- 空行（含仅有空白字符的行）会从统计与输出中剔除；其余行原样保留，包括行首缩进与行尾空白。
- 总页数按 ceil(非空行总数/50) 估算；>60 页则取前 1500 行 + 后 1500 行，最终 60 页、页码 1–60。
- 不足 60 页时输出全部非空行；最后一页若不足 50 行，用空行补齐到 50 行。
"""

from __future__ import annotations

import argparse
import math
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

# This file lives in <repo>/api/scripts/, so parents[2] of the resolved file
# path (two levels above the scripts/ directory) is the repository root.
REPO_ROOT = Path(__file__).resolve().parents[2]

# =========================
# Configuration (edit per project; some items can be overridden on the CLI)
# =========================
SOFTWARE_FULL_NAME = "岁月留书"
VERSION = "V1.0.0"

# Default: scan the whole repository (api plus the front-end/tool sub-projects).
# To declare a single module, point this at a sub-path or pass --root.
SOURCE_ROOT = REPO_ROOT
OUTPUT_PDF = REPO_ROOT / "copyright_source_listing.pdf"

# File paths shown in the listing: the local directory prefix is replaced by the
# applicant's display root (see path_for_pdf_listing). Paths outside this prefix
# are shown unchanged.
# NOTE(review): the prefix is a machine-specific absolute path — confirm it
# matches the checkout location before generating the PDF.
FILE_PATH_ABSOLUTE_PREFIX = Path("/Users/kevin/Codes/hgtk")
FILE_PATH_DISPLAY_ROOT = "上海华嘎科技有限公司"

# None means auto-detect (see register_font); may also be set to a local
# TTF/OTF or TTC font path.
CJK_MONO_FONT_PATH: Path | None = None

# When STSong-Light (CID) is mixed with Latin text, ReportLab tends to get the
# ASCII spacing wrong, so ASCII runs are drawn separately with built-in Courier.
ASCII_LATIN_FONT = "Courier"


@dataclass(frozen=True)
class RenderFonts:
    """Font names used to measure and draw text.

    ``primary`` renders non-ASCII text (e.g. Chinese). When ``latin_font`` is
    not None, characters below U+0080 are measured and drawn with it instead;
    when it is None, ``primary`` renders every character.
    """

    # Registered font name for non-ASCII text (or all text if latin_font is None).
    primary: str
    # Optional font name used for ASCII runs.
    latin_font: str | None = None


def _segment_latin_cjk(line: str) -> list[tuple[str, str]]:
    if not line:
        return []
    out: list[tuple[str, str]] = []
    buf: list[str] = []
    is_lat = ord(line[0]) < 128
    for ch in line:
        lat = ord(ch) < 128
        if lat != is_lat:
            out.append(("lat" if is_lat else "cjk", "".join(buf)))
            buf = [ch]
            is_lat = lat
        else:
            buf.append(ch)
    out.append(("lat" if is_lat else "cjk", "".join(buf)))
    return out


def _mixed_line_width(line: str, fonts: RenderFonts, font_size: float) -> float:
    """Return the rendered width of *line* at *font_size*.

    With no Latin font configured the whole line is measured in the primary
    font. Otherwise each ASCII / non-ASCII run is measured in its own font and
    the widths are accumulated left to right.
    """
    latin = fonts.latin_font
    if latin is None:
        return pdfmetrics.stringWidth(line, fonts.primary, font_size)

    font_by_kind = {"lat": latin, "cjk": fonts.primary}
    total = 0.0
    for kind, chunk in _segment_latin_cjk(line):
        total += pdfmetrics.stringWidth(chunk, font_by_kind[kind], font_size)
    return total


def _draw_line(
    c: canvas.Canvas,
    x: float,
    y: float,
    line: str,
    fonts: RenderFonts,
    font_size: float,
) -> None:
    """Draw *line* on canvas *c* with its baseline starting at ``(x, y)``.

    With no Latin font configured the line is drawn in one call using the
    primary font. Otherwise each ASCII / non-ASCII run is drawn in its own
    font, advancing the x position by the measured width of each run.
    """
    latin = fonts.latin_font
    if latin is None:
        c.setFont(fonts.primary, font_size)
        c.drawString(x, y, line)
        return

    cursor = x
    for kind, chunk in _segment_latin_cjk(line):
        if chunk:
            font_name = latin if kind == "lat" else fonts.primary
            c.setFont(font_name, font_size)
            c.drawString(cursor, y, chunk)
            cursor += pdfmetrics.stringWidth(chunk, font_name, font_size)


# Only "executable/compilable" source is collected: this repository is a
# Python + TS/JS stack; configuration and documentation such as .md, .json and
# .yaml are excluded. Add suffixes to this set for other languages.
# Suffixes are compared lower-cased (see iter_source_files).
INCLUDE_SUFFIXES: set[str] = {
    ".py",
    ".pyi",
    ".ts",
    ".tsx",
    ".mts",
    ".cts",
    ".js",
    ".jsx",
    ".mjs",
    ".cjs",
    ".vue",
}

# Directory names to skip. Each directory's bare name (a single path segment,
# not a full path) is compared against this set while walking the tree.
SKIP_DIR_NAMES: set[str] = {
    ".git",
    ".svn",
    ".hg",
    "node_modules",
    "venv",
    ".venv",
    "env",
    ".env",
    ".idea",
    ".vscode",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".next",
    ".nuxt",
    ".output",
    ".turbo",
    ".parcel-cache",
    ".expo",
    "dist",
    "build",
    "target",
    "out",
    "coverage",
    "htmlcov",
    "storybook-static",
    "Pods",
    ".gradle",
    "DerivedData",
}

# Whether to insert a marker line at each file boundary; the marker is
# FILE_MARKER_PREFIX + display path + " =====".
ADD_FILE_MARKERS = True
FILE_MARKER_PREFIX = "// ===== "

# Pagination: lines drawn per page and the page cap applied to the listing.
LINES_PER_PAGE = 50
MAX_PAGES = 60

# On most macOS installs PingFang is a .ttc with embedded CFF outlines, which
# ReportLab's TTFont cannot load; these paths are still tried first in case a
# particular system is an exception.
_PINGFANG_CANDIDATES: tuple[Path, ...] = (
    Path("/System/Library/Fonts/PingFang.ttc"),
    Path(
        "/System/Library/Fonts/Hiragino Sans GB.ttc"
    ),  # Hiragino Sans GB: usually CFF like PingFang, so this mostly fails
    Path("/Library/Fonts/PingFang.ttc"),
)


def _try_register_stsong_cid() -> bool:
    """Register the STSong-Light CID font; return True on success.

    Any exception raised while creating or registering the font is treated as
    "not available" and reported as False.
    """
    try:
        cid_font = UnicodeCIDFont("STSong-Light")
        pdfmetrics.registerFont(cid_font)
    except Exception:
        return False
    else:
        return True


def _register_stsong_cid_with_notice(*, pingfang_failed: bool) -> str:
    """Register STSong-Light and report it on stderr.

    Returns the font name ``"STSong-Light"`` on success, or an empty string
    (printing nothing) when registration fails. When *pingfang_failed* is
    true the notice additionally explains why the system font was not used.
    """
    registered = _try_register_stsong_cid()
    if not registered:
        return ""

    notice = "已载入 PDF 简体中文：STSong-Light（Adobe Unicode CID）。"
    if pingfang_failed:
        notice += (
            "说明：系统「苹方/冬青」等常为 OpenType-CFF，ReportLab 无法写入该轮廓；"
            "已改用内置宋体轮廓以保证中文可见（非苹方字形，软著材料通常可接受）。"
        )
    print(notice, file=sys.stderr)
    return "STSong-Light"


# macOS .ttc collections with TrueType outlines (Menlo, some Songti/Heiti
# faces), each paired with the sub-font index to load. PingFang is mostly CFF
# and is handled separately via _PINGFANG_CANDIDATES.
_MACOS_TTC_CANDIDATES: tuple[tuple[Path, int], ...] = (
    (Path("/System/Library/Fonts/Menlo.ttc"), 0),
    (Path("/System/Library/Fonts/Supplemental/Songti.ttc"), 0),
    (Path("/System/Library/Fonts/STHeiti Light.ttc"), 0),
    (Path("/System/Library/Fonts/STHeiti Medium.ttc"), 0),
)


def _try_register_ttf_or_ttc(path: Path, font_name: str = "CodeCJK") -> bool:
    """Register the font file at *path* under *font_name*; return True on success.

    Returns False when *path* is not an existing file. A ``.ttc`` collection is
    probed sub-font by sub-font (indices 0..63) until one registers; probing
    stops early when the error message indicates the index is out of range.
    Any other font file gets a single registration attempt.
    """
    if not path.is_file():
        return False

    font_file = str(path)
    if path.suffix.lower() != ".ttc":
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_file))
        except Exception:
            return False
        return True

    for subfont in range(64):
        try:
            pdfmetrics.registerFont(
                TTFont(font_name, font_file, subfontIndex=subfont)
            )
        except Exception as exc:
            reason = str(exc).lower()
            # These messages mean the collection has no more sub-fonts to try.
            exhausted = "bad subfontindex" in reason or (
                "subfontindex" in reason and "not in" in reason
            )
            if exhausted:
                break
        else:
            return True
    return False


def register_font(user_path: Path | None, *, no_cid: bool = False) -> RenderFonts:
    """Pick and register the body font, reporting the choice on stderr.

    Candidates are tried in this order, returning at the first success:

    1. *user_path*, if given (a warning is printed when it cannot be loaded).
    2. On macOS, the PingFang / Hiragino candidates.
    3. Unless *no_cid* is set, the STSong-Light CID font, paired with Courier
       for ASCII runs.
    4. On macOS, the TrueType ``.ttc`` candidates at their listed sub-font index.
    5. Common Linux Noto CJK paths.
    6. Built-in Courier as a last resort (with a warning).

    Args:
        user_path: Explicit font file to prefer, or None to auto-detect.
        no_cid: When true, skip the STSong-Light CID font.

    Returns:
        The fonts to render with; ``latin_font`` is set only for the CID case.
    """
    font_tt = "CodeCJK"
    if user_path is not None:
        # The helper itself returns False for a missing file, so no separate
        # existence check is needed here.
        if _try_register_ttf_or_ttc(user_path, font_tt):
            print(f"已载入 PDF 字体：{user_path}", file=sys.stderr)
            return RenderFonts(primary=font_tt)
        # An explicitly requested font must not be dropped silently: tell the
        # user before falling back to auto-detection.
        print(
            f"警告：无法载入指定字体 {user_path}"
            "（文件不存在或 ReportLab 不支持该格式），将改为自动探测。",
            file=sys.stderr,
        )

    pingfang_failed = False
    if sys.platform == "darwin":
        for p in _PINGFANG_CANDIDATES:
            if _try_register_ttf_or_ttc(p, font_tt):
                print(f"已载入 PDF 字体（苹方/平方相关）：{p}", file=sys.stderr)
                return RenderFonts(primary=font_tt)
        # Only counts as a "failure" if at least one candidate file exists.
        pingfang_failed = any(p.is_file() for p in _PINGFANG_CANDIDATES)

    if not no_cid:
        cid_name = _register_stsong_cid_with_notice(pingfang_failed=pingfang_failed)
        if cid_name:
            return RenderFonts(primary=cid_name, latin_font=ASCII_LATIN_FONT)

    if sys.platform == "darwin":
        for ttc_path, sub in _MACOS_TTC_CANDIDATES:
            if not ttc_path.is_file():
                continue
            try:
                pdfmetrics.registerFont(
                    TTFont(font_tt, str(ttc_path), subfontIndex=sub)
                )
                label = "系统等宽" if "Menlo" in ttc_path.name else "系统字体"
                print(
                    f"已载入 PDF 字体（macOS TrueType）：{ttc_path}（{label}）",
                    file=sys.stderr,
                )
                return RenderFonts(primary=font_tt)
            except Exception:
                # Best effort: an unloadable candidate just moves on to the next.
                continue

    for p in (
        Path("/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"),
        Path("/usr/share/fonts/truetype/noto/NotoSansMonoCJK-Regular.ttf"),
    ):
        if _try_register_ttf_or_ttc(p, font_tt):
            print(f"已载入 PDF 字体（Linux 常见路径）：{p}", file=sys.stderr)
            return RenderFonts(primary=font_tt)

    if no_cid:
        print(
            "警告：已指定 --no-cid 且未找到可用 TrueType 轮廓字体，将使用 Courier。",
            file=sys.stderr,
        )
    else:
        print(
            "警告：未找到可用的中文矢量字体，将使用 Courier（中文可能无法显示）。",
            file=sys.stderr,
        )
    return RenderFonts(primary="Courier")


@dataclass(frozen=True)
class ExtractResult:
    """Outcome of selecting lines for the PDF (produced by extract_for_pdf)."""

    # Lines selected for output (all input lines, or head + tail when capped).
    lines: list[str]
    # Pages needed for the selected lines, at least 1.
    pages: int
    # True when the input needed more pages than the cap and was trimmed.
    capped: bool
    # Number of input lines before selection.
    original_non_empty_lines: int
    # Pages the full input would need (ceiling division), at least 1.
    original_pages_ceil: int


def should_skip_dir(name: str) -> bool:
    """Return True when the directory name *name* is listed in SKIP_DIR_NAMES."""
    return name in SKIP_DIR_NAMES


def iter_source_files(root: Path) -> Iterable[Path]:
    """Yield every file under *root* whose suffix is in INCLUDE_SUFFIXES.

    Directories rejected by should_skip_dir are pruned from the walk, symbolic
    links to directories are not followed, and suffixes are matched
    case-insensitively. Files are yielded in os.walk order (unsorted).
    """
    for current_dir, subdirs, names in os.walk(root, followlinks=False):
        # Prune in place so the walk does not descend into skipped directories.
        subdirs[:] = [sub for sub in subdirs if not should_skip_dir(sub)]
        folder = Path(current_dir)
        candidates = (folder / file_name for file_name in names)
        yield from (
            candidate
            for candidate in candidates
            if candidate.suffix.lower() in INCLUDE_SUFFIXES
        )


def path_for_pdf_listing(fp: Path) -> str:
    """Return the path string shown in the PDF for *fp*.

    When the resolved *fp* lies under FILE_PATH_ABSOLUTE_PREFIX, that prefix is
    replaced by FILE_PATH_DISPLAY_ROOT and the rest is shown in POSIX form.
    Otherwise *fp* is returned unchanged (unresolved) in POSIX form.
    """
    try:
        relative = fp.resolve().relative_to(
            FILE_PATH_ABSOLUTE_PREFIX.expanduser().resolve()
        )
    except ValueError:
        # Not under the configured prefix: fall back to the path as given.
        return fp.as_posix()
    return f"{FILE_PATH_DISPLAY_ROOT}/{relative.as_posix()}"


def read_nonempty_lines_from_files(files: list[Path]) -> list[str]:
    """Read *files* in order and return their non-blank lines as one list.

    Each file is decoded as UTF-8 with undecodable bytes replaced. When
    ADD_FILE_MARKERS is set, a marker line with the file's display path
    precedes that file's lines. Lines that are empty or whitespace-only are
    dropped; all other lines are kept verbatim, including indentation.

    A file that cannot be read (OSError) is reported on stderr and skipped, so
    one unreadable file does not abort the whole run and leaves no orphan
    marker line behind.
    """
    out: list[str] = []
    for fp in files:
        try:
            text = fp.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            # Retrying with another encoding cannot help: the failure is at the
            # I/O level, not in decoding (errors="replace" never raises).
            print(f"警告：无法读取文件，已跳过：{fp}（{exc}）", file=sys.stderr)
            continue

        # Emit the marker only once the file is known to be readable.
        if ADD_FILE_MARKERS:
            out.append(f"{FILE_MARKER_PREFIX}{path_for_pdf_listing(fp)} =====")
        # splitlines() already strips line terminators.
        out.extend(line for line in text.splitlines() if line.strip())
    return out


def extract_for_pdf(
    lines: list[str],
    *,
    lines_per_page: int = LINES_PER_PAGE,
    max_pages: int = MAX_PAGES,
) -> ExtractResult:
    """Select the lines to print, capping the listing at *max_pages* pages.

    When the input fits within *max_pages* pages, all lines are kept. Otherwise
    the first ``ceil(max_pages / 2)`` pages and the last ``max_pages // 2``
    pages of lines are kept, giving exactly *max_pages* pages.

    Args:
        lines: Input lines, in output order.
        lines_per_page: Lines per page; must be at least 1.
        max_pages: Page cap; must be at least 1.

    Returns:
        An ExtractResult with the selected lines and page statistics.

    Raises:
        ValueError: If *lines_per_page* or *max_pages* is less than 1.
    """
    if lines_per_page < 1:
        raise ValueError("lines_per_page must be at least 1")
    if max_pages < 1:
        raise ValueError("max_pages must be at least 1")

    original_non_empty_lines = len(lines)
    original_pages_ceil = max(1, math.ceil(original_non_empty_lines / lines_per_page))

    capped = original_pages_ceil > max_pages
    if capped:
        # Give the extra page to the head when max_pages is odd, so the result
        # always fills exactly max_pages pages.
        tail_pages = max_pages // 2
        head_pages = max_pages - tail_pages
        first_n = lines_per_page * head_pages
        last_n = lines_per_page * tail_pages
        # lines[-0:] would be the whole list, so an empty tail is explicit.
        tail = lines[-last_n:] if last_n else []
        selected = lines[:first_n] + tail
    else:
        selected = list(lines)

    pages = max(1, math.ceil(len(selected) / lines_per_page))
    return ExtractResult(
        lines=selected,
        pages=pages,
        capped=capped,
        original_non_empty_lines=original_non_empty_lines,
        original_pages_ceil=original_pages_ceil,
    )


def pad_to_multiple(lines: list[str], lines_per_page: int) -> list[str]:
    """Pad *lines* with empty strings up to a multiple of *lines_per_page*.

    An empty input becomes one page of blank lines. When the length is already
    a multiple, the input list itself is returned (not a copy); otherwise a
    new list with the blanks appended is returned.
    """
    count = len(lines)
    if count == 0:
        return [""] * lines_per_page

    leftover = count % lines_per_page
    if leftover:
        blanks = [""] * (lines_per_page - leftover)
        return lines + blanks
    return lines


def fit_line_to_width(
    line: str, fonts: RenderFonts, font_size: float, max_width: float
) -> str:
    """Return *line*, truncated with a trailing "…" if wider than *max_width*.

    A line that already fits is returned unchanged. Otherwise a binary search
    finds the longest prefix that fits, the prefix is shortened further until
    the prefix plus the ellipsis fits, and the two are joined. If no character
    can be kept, the ellipsis alone is returned.
    """

    def measure(text: str) -> float:
        return _mixed_line_width(text, fonts, font_size)

    if measure(line) <= max_width:
        return line

    # Binary search for the longest prefix length whose width fits.
    keep, upper = 0, len(line)
    while keep < upper:
        probe = (keep + upper + 1) // 2
        if measure(line[:probe]) <= max_width:
            keep = probe
        else:
            upper = probe - 1

    marker = "…"
    marker_width = measure(marker)
    # Make room for the ellipsis itself.
    while keep > 0 and measure(line[:keep]) + marker_width > max_width:
        keep -= 1

    return line[:keep] + marker if keep > 0 else marker


def draw_pdf(
    out_path: Path,
    all_lines: list[str],
    *,
    fonts: RenderFonts,
    lines_per_page: int,
    software_name: str,
    version: str,
    font_size: float = 8.5,
) -> None:
    """Render *all_lines* to an A4 PDF at *out_path*.

    Every page has a header (software name and version on the left, page
    number on the right), a separator rule, and *lines_per_page* body lines
    spaced evenly between the rule and the bottom margin. Lines wider than the
    text area are truncated with an ellipsis, and the number of truncated
    lines is reported on stderr.

    Raises:
        ValueError: If ``len(all_lines)`` is not a multiple of *lines_per_page*.
    """
    page_w, page_h = A4
    margin_left = margin_right = margin_top = margin_bottom = 2 * cm
    rule_gap = 0.25 * cm

    pdf = canvas.Canvas(str(out_path), pagesize=A4)
    pdf.setTitle(f"{software_name} {version} — 源代码节选")
    pdf.setAuthor(software_name)

    # Vertical layout: header baseline, separator rule, then the body area.
    header_y = page_h - margin_top - 0.35 * cm
    rule_y = header_y - 0.35 * cm
    body_top = rule_y - rule_gap
    body_bottom = margin_bottom + 0.5 * cm
    body_height = body_top - body_bottom
    if lines_per_page > 1:
        # First baseline sits at body_top and the last at body_bottom.
        line_step = body_height / (lines_per_page - 1)
    else:
        line_step = max(body_height, 1.0)

    usable_width = page_w - margin_left - margin_right

    if len(all_lines) % lines_per_page != 0:
        raise ValueError("all_lines 长度必须是 lines_per_page 的整数倍（请先 pad）")
    page_count = len(all_lines) // lines_per_page

    header_text = f"{software_name} {version}"
    number_font = fonts.latin_font or fonts.primary
    right_edge = page_w - margin_right
    truncated = 0

    for page_index in range(page_count):
        _draw_line(pdf, margin_left, header_y, header_text, fonts, font_size)

        # Right-align the 1-based page number against the right margin.
        label = str(page_index + 1)
        label_width = pdfmetrics.stringWidth(label, number_font, font_size)
        pdf.setFont(number_font, font_size)
        pdf.drawString(right_edge - label_width, header_y, label)

        pdf.setLineWidth(0.3)
        pdf.line(margin_left, rule_y, right_edge, rule_y)

        first = page_index * lines_per_page
        for row, raw in enumerate(all_lines[first : first + lines_per_page]):
            fitted = fit_line_to_width(raw, fonts, font_size, usable_width)
            if raw and fitted != raw:
                truncated += 1
            _draw_line(
                pdf,
                margin_left,
                body_top - row * line_step,
                fitted,
                fonts,
                font_size,
            )

        pdf.showPage()

    if truncated:
        print(
            f"提示：共有 {truncated} 行因超宽被截断（可减小字号或换更窄字体）。",
            file=sys.stderr,
        )

    pdf.save()


def parse_args() -> argparse.Namespace:
    """Parse the command-line options from ``sys.argv``.

    Options that default to None (``--root``, ``--out``, ``--font``,
    ``--name``, ``--version``) signal "use the constant configured in this
    script". ``--version`` is stored as ``version_str``.
    """
    parser = argparse.ArgumentParser(
        description="将源码目录整理为软著申报用 PDF（reportlab）",
    )
    # Declared as data and registered in order, so --help lists them as before.
    option_specs = (
        (
            "--root",
            {
                "type": Path,
                "default": None,
                "help": "源码根目录（默认使用脚本内 SOURCE_ROOT）",
            },
        ),
        (
            "--out",
            {
                "type": Path,
                "default": None,
                "help": "输出 PDF 路径（默认使用脚本内 OUTPUT_PDF）",
            },
        ),
        (
            "--font",
            {
                "type": Path,
                "default": None,
                "help": "CJK 等宽字体 TTF/OTF（默认使用脚本内 CJK_MONO_FONT_PATH）",
            },
        ),
        (
            "--name",
            {
                "default": None,
                "help": "软件全称（默认脚本内 SOFTWARE_FULL_NAME）",
            },
        ),
        (
            "--version",
            {
                "dest": "version_str",
                "default": None,
                "help": "版本号（默认脚本内 VERSION）",
            },
        ),
        (
            "--font-size",
            {
                "type": float,
                "default": 8.3,
                "help": "正文字号（略小可减少超长行截断，默认 8.3）",
            },
        ),
        (
            "--no-cid",
            {
                "action": "store_true",
                "help": "不使用 Adobe STSong-Light（CID），仅尝试 TrueType 轮廓（Menlo 等），中文更易缺失",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()


def main() -> int:
    """Run the tool: scan sources, select lines, and write the PDF.

    Command-line options override the constants configured in this script.

    Returns:
        0 on success, 1 when the source root is not an existing directory.
    """
    args = parse_args()
    root = args.root or SOURCE_ROOT
    out_pdf = args.out or OUTPUT_PDF
    font_path = args.font if args.font is not None else CJK_MONO_FONT_PATH
    name = args.name or SOFTWARE_FULL_NAME
    ver = args.version_str or VERSION

    if not root.is_dir():
        print(f"错误：源码根目录不存在或不是目录：{root}", file=sys.stderr)
        return 1

    # Sort by POSIX path so the listing order is stable across runs.
    files = sorted(iter_source_files(root), key=lambda p: p.as_posix())
    if not files:
        # Still produce the (blank) PDF as before, but say why it is empty.
        print(
            f"警告：在 {root} 下未找到可收录的源码文件，输出将只有空白页。",
            file=sys.stderr,
        )
    lines = read_nonempty_lines_from_files(files)
    result = extract_for_pdf(lines)
    padded = pad_to_multiple(result.lines, LINES_PER_PAGE)

    fonts = register_font(font_path, no_cid=args.no_cid)

    out_pdf.parent.mkdir(parents=True, exist_ok=True)
    draw_pdf(
        out_pdf,
        padded,
        fonts=fonts,
        lines_per_page=LINES_PER_PAGE,
        software_name=name,
        version=ver,
        font_size=args.font_size,
    )

    out_pages = len(padded) // LINES_PER_PAGE
    # Report the configured page size rather than a hard-coded 50, so the
    # summary stays correct if LINES_PER_PAGE is changed.
    print(
        "完成：",
        f"源非空行 {result.original_non_empty_lines}"
        f"（按 {LINES_PER_PAGE} 行/页约 {result.original_pages_ceil} 页）",
        f"节选后 {len(result.lines)} 行",
        f"输出 {out_pdf} 共 {out_pages} 页",
        f" capped={result.capped}",
    )
    return 0


# When run as a script, exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())