Files
life-echo/api/scripts/copyright_source_pdf.py

585 lines
18 KiB
Python
Raw Normal View History

2026-05-12 15:25:09 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
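"""
Collate source code into a PDF for a software-copyright registration filing (uses reportlab).

By default the whole monorepo root is scanned (this script lives in api/scripts/, so the
repository root is two levels up). That includes the backend `api`, the Expo client
`app-expo`, the evaluation web app `app-eval-web`, etc.; use --root to scan one sub-project only.

Usage (brief):
1) Install dependencies: run `uv sync` in the api directory (reportlab is already part of this repo).
2) Chinese font: CJK_MONO_FONT_PATH may be set; by default STSong-Light (Adobe CID) is used.
   PingFang / Hiragino and similar fonts are usually CFF, which ReportLab cannot embed directly.
   With STSong-Light, ASCII text (including spaces and English code) is drawn in separate
   segments with the built-in Courier and Chinese is drawn with STSong, which avoids the
   misplaced, squeezed Latin spacing that appears when a whole line is drawn with the CID font.
   Use --no-cid to fall back to a single pure-TrueType font (e.g. Menlo).
3) Edit the constants in the "configuration" section below: software full name and version;
   if needed also SOURCE_ROOT / output path, font path, suffixes and skipped directories.
4) Run (whole repo; .py / .ts(x) / .js(x) / .vue etc. by default):
   cd api && uv run python scripts/copyright_source_pdf.py
   Backend only, for example: ... --root ../api

Notes:
- Blank lines are removed from both the statistics and the output; whitespace and
  indentation inside the remaining lines is kept.
- The total page count is estimated as ceil(non-empty lines / 50); if it exceeds 60 pages,
  the first 1500 lines plus the last 1500 lines are kept, giving 60 pages (numbered 1-60).
- With fewer than 60 pages all non-empty lines are output; if the last page has fewer than
  50 lines it is padded with blank lines up to 50.
"""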
from __future__ import annotations
import argparse
import math
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
# This file lives in <repo>/api/scripts/, so the repository root is two directories above it.
REPO_ROOT = Path(__file__).resolve().parents[2]
# =========================
# Configuration (edit per project; some items can also be overridden on the command line)
# =========================
SOFTWARE_FULL_NAME = "岁月留书"
VERSION = "V1.0.0"
# Default source root is the whole repository (api plus the front-end/tool sub-projects);
# when only one module is being filed, change this to a sub-path or pass --root on the CLI.
SOURCE_ROOT = REPO_ROOT
OUTPUT_PDF = REPO_ROOT / "copyright_source_listing.pdf"
# File paths that appear in the PDF: the local directory prefix is replaced by the
# applicant's display root used in the filing (e.g. 上海华嘎科技有限公司/life-echo/...).
2026-05-12 15:25:09 +08:00
# Local absolute prefix that path_for_pdf_listing() strips from each file path.
FILE_PATH_ABSOLUTE_PREFIX = Path("/Users/kevin/Codes/hgtk")
# Label that path_for_pdf_listing() puts in place of the stripped prefix.
FILE_PATH_DISPLAY_ROOT = "上海华嘎科技有限公司"
2026-05-12 15:25:09 +08:00
# When None the font is auto-detected (on macOS: Menlo.ttc / Songti.ttc, etc.);
# it may also be set to a local TTF/OTF or TTC path.
CJK_MONO_FONT_PATH: Path | None = None
# When STSong-Light (CID) is mixed with Latin text, ReportLab often computes the ASCII
# spacing incorrectly, so ASCII runs are drawn separately with the built-in Courier.
ASCII_LATIN_FONT = "Courier"
@dataclass(frozen=True)
class RenderFonts:
    """Font names used to draw one line of text.

    ``primary`` draws Chinese and any other non-ASCII text.  When
    ``latin_font`` is set, characters U+0000-U+007F are drawn with it
    instead (Courier in this script); when it is ``None`` the whole line
    is drawn with ``primary``.
    """

    # Registered font name for non-ASCII text (or for everything, see above).
    primary: str
    # Registered font name for ASCII runs, or None to disable mixed rendering.
    latin_font: str | None = None
def _segment_latin_cjk(line: str) -> list[tuple[str, str]]:
if not line:
return []
out: list[tuple[str, str]] = []
buf: list[str] = []
is_lat = ord(line[0]) < 128
for ch in line:
lat = ord(ch) < 128
if lat != is_lat:
out.append(("lat" if is_lat else "cjk", "".join(buf)))
buf = [ch]
is_lat = lat
else:
buf.append(ch)
out.append(("lat" if is_lat else "cjk", "".join(buf)))
return out
def _mixed_line_width(line: str, fonts: RenderFonts, font_size: float) -> float:
    """Return the rendered width of ``line`` at ``font_size``.

    Without a Latin font the whole line is measured with ``fonts.primary``.
    Otherwise each ASCII / non-ASCII run is measured with the font that
    would draw it and the widths are added up.
    """
    latin = fonts.latin_font
    if latin is None:
        return pdfmetrics.stringWidth(line, fonts.primary, font_size)
    font_for_kind = {"lat": latin, "cjk": fonts.primary}
    total = 0.0
    for kind, chunk in _segment_latin_cjk(line):
        total += pdfmetrics.stringWidth(chunk, font_for_kind[kind], font_size)
    return total
def _draw_line(
    c: canvas.Canvas,
    x: float,
    y: float,
    line: str,
    fonts: RenderFonts,
    font_size: float,
) -> None:
    """Draw ``line`` on canvas ``c`` with its left edge at ``(x, y)``.

    With a Latin font configured, the line is drawn run by run: ASCII runs
    use ``fonts.latin_font``, other runs use ``fonts.primary``, and the pen
    advances by each run's measured width.  Without one, the line is drawn
    in a single call with ``fonts.primary``.
    """
    latin = fonts.latin_font
    if latin is not None:
        cursor = x
        for kind, chunk in _segment_latin_cjk(line):
            if chunk:
                font_name = latin if kind == "lat" else fonts.primary
                c.setFont(font_name, font_size)
                c.drawString(cursor, y, chunk)
                cursor += pdfmetrics.stringWidth(chunk, font_name, font_size)
        return
    c.setFont(fonts.primary, font_size)
    c.drawString(x, y, line)
# Only "executable/compilable" source is listed: this repo is a Python + TS/JS stack,
# so .md, .json, .yaml and other configuration/documentation files are excluded.
# Append further suffixes to the set for other languages.
INCLUDE_SUFFIXES: set[str] = {
    # Python
    ".py", ".pyi",
    # TypeScript
    ".ts", ".tsx", ".mts", ".cts",
    # JavaScript
    ".js", ".jsx", ".mjs", ".cjs",
    # Vue
    ".vue",
}
# Directory names to skip (compared against the name of each path segment only).
# The entries are loosely grouped by purpose for readability.
SKIP_DIR_NAMES: set[str] = {
    # version control
    ".git", ".svn", ".hg",
    # installed dependencies and virtual environments
    "node_modules", "venv", ".venv", "env", ".env", "Pods",
    # editor / IDE settings
    ".idea", ".vscode",
    # tool caches
    "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache",
    ".turbo", ".parcel-cache", ".gradle",
    # framework and build output
    ".next", ".nuxt", ".output", ".expo",
    "dist", "build", "target", "out",
    "storybook-static", "DerivedData",
    # coverage reports
    "coverage", "htmlcov",
}
# Whether to insert a marker line at each file boundary
ADD_FILE_MARKERS = True
# Text that starts every file marker line (the displayed path and " =====" follow it)
FILE_MARKER_PREFIX = "// ===== "
# Number of text lines per PDF page
LINES_PER_PAGE = 50
# Page cap: listings estimated to be longer keep only leading and trailing lines
MAX_PAGES = 60
# 苹方在多数 macOS 上为 .ttc 内嵌 CFF 轮廓ReportLab TTFont 无法载入;仍会先尝试以下路径(以防特例)。
_PINGFANG_CANDIDATES: tuple[Path, ...] = (
Path("/System/Library/Fonts/PingFang.ttc"),
Path(
"/System/Library/Fonts/Hiragino Sans GB.ttc"
), # 冬青黑体,常与苹方同源 CFF多半失败
Path("/Library/Fonts/PingFang.ttc"),
)
def _try_register_stsong_cid() -> bool:
    """Try to register the ``STSong-Light`` CID font with ReportLab.

    Returns True on success and False if construction or registration
    raises anything; the failure is deliberately swallowed because this is
    only a probe for an optional font.
    """
    try:
        cid_font = UnicodeCIDFont("STSong-Light")
        pdfmetrics.registerFont(cid_font)
    except Exception:
        return False
    else:
        return True
def _register_stsong_cid_with_notice(*, pingfang_failed: bool) -> str:
    """Register STSong-Light and report the outcome on stderr.

    Args:
        pingfang_failed: True when a PingFang-family font file was present
            but could not be loaded; selects the longer explanatory notice.

    Returns:
        ``"STSong-Light"`` when the CID font was registered, otherwise an
        empty string (nothing is printed in that case).
    """
    if _try_register_stsong_cid():
        if pingfang_failed:
            notice = (
                "已载入 PDF 简体中文STSong-LightAdobe Unicode CID"
                "说明:系统「苹方/冬青」等常为 OpenType-CFFReportLab 无法写入该轮廓;"
                "已改用内置宋体轮廓以保证中文可见(非苹方字形,软著材料通常可接受)。"
            )
        else:
            notice = "已载入 PDF 简体中文STSong-LightAdobe Unicode CID"
        print(notice, file=sys.stderr)
        return "STSong-Light"
    return ""
# macOSTrueType 轮廓 .ttcMenlo、部分宋体/黑体);苹方多为 CFF见上方说明。
_MACOS_TTC_CANDIDATES: tuple[tuple[Path, int], ...] = (
(Path("/System/Library/Fonts/Menlo.ttc"), 0),
(Path("/System/Library/Fonts/Supplemental/Songti.ttc"), 0),
(Path("/System/Library/Fonts/STHeiti Light.ttc"), 0),
(Path("/System/Library/Fonts/STHeiti Medium.ttc"), 0),
)
def _try_register_ttf_or_ttc(path: Path, font_name: str = "CodeCJK") -> bool:
"""将字体注册为 font_name.ttc 递增 subfontIndex 直至成功或无更多子字体。"""
if not path.is_file():
return False
if path.suffix.lower() == ".ttc":
idx = 0
while idx < 64:
try:
pdfmetrics.registerFont(TTFont(font_name, str(path), subfontIndex=idx))
return True
except Exception as e:
msg = str(e).lower()
if "bad subfontindex" in msg:
break
if "subfontindex" in msg and "not in" in msg:
break
idx += 1
return False
try:
pdfmetrics.registerFont(TTFont(font_name, str(path)))
return True
except Exception:
return False
def register_font(user_path: Path | None, *, no_cid: bool = False) -> RenderFonts:
    """Choose and register the body font, returning the fonts to draw with.

    Candidates are tried in this order; the first that registers wins:

    1. ``user_path`` (TTF/OTF/TTC), when given.  If it is missing or cannot
       be loaded a warning is printed and detection continues.
    2. On macOS, the PingFang-family candidates.
    3. The STSong-Light CID font, unless ``no_cid`` is set.  In this case
       the result also carries Courier as ``latin_font`` so ASCII runs are
       drawn separately.
    4. On macOS, the TrueType ``.ttc`` candidates at their listed index.
    5. Two common Linux Noto CJK paths.
    6. Built-in Courier, with a warning that Chinese may not render.

    Args:
        user_path: Font file explicitly requested by the user, or None.
        no_cid: Skip the STSong-Light CID font and only try TrueType fonts.

    Returns:
        The ``RenderFonts`` describing the registered font(s).  Progress and
        warnings are written to stderr.
    """
    font_tt = "CodeCJK"
    on_macos = sys.platform == "darwin"

    if user_path is not None:
        if user_path.is_file() and _try_register_ttf_or_ttc(user_path, font_tt):
            print(f"已载入 PDF 字体:{user_path}", file=sys.stderr)
            return RenderFonts(primary=font_tt)
        # Tell the user the requested font was not used instead of silently
        # falling back to auto-detection.
        print(
            f"警告:指定的字体无法载入,改为自动探测:{user_path}",
            file=sys.stderr,
        )

    pingfang_failed = False
    if on_macos:
        for p in _PINGFANG_CANDIDATES:
            if _try_register_ttf_or_ttc(p, font_tt):
                print(f"已载入 PDF 字体(苹方/平方相关):{p}", file=sys.stderr)
                return RenderFonts(primary=font_tt)
        # Only report a PingFang failure when one of the files actually exists.
        pingfang_failed = any(p.is_file() for p in _PINGFANG_CANDIDATES)

    if not no_cid:
        cid_name = _register_stsong_cid_with_notice(pingfang_failed=pingfang_failed)
        if cid_name:
            return RenderFonts(primary=cid_name, latin_font=ASCII_LATIN_FONT)

    if on_macos:
        for ttc_path, sub in _MACOS_TTC_CANDIDATES:
            if not ttc_path.is_file():
                continue
            try:
                pdfmetrics.registerFont(
                    TTFont(font_tt, str(ttc_path), subfontIndex=sub)
                )
            except Exception:
                # Best-effort probe: move on to the next candidate.
                continue
            label = "系统等宽" if "Menlo" in ttc_path.name else "系统字体"
            print(
                f"已载入 PDF 字体macOS TrueType{ttc_path}{label}",
                file=sys.stderr,
            )
            return RenderFonts(primary=font_tt)

    for p in (
        Path("/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"),
        Path("/usr/share/fonts/truetype/noto/NotoSansMonoCJK-Regular.ttf"),
    ):
        if _try_register_ttf_or_ttc(p, font_tt):
            print(f"已载入 PDF 字体Linux 常见路径):{p}", file=sys.stderr)
            return RenderFonts(primary=font_tt)

    if no_cid:
        print(
            "警告:已指定 --no-cid 且未找到可用 TrueType 轮廓字体,将使用 Courier。",
            file=sys.stderr,
        )
    else:
        print(
            "警告:未找到可用的中文矢量字体,将使用 Courier中文可能无法显示",
            file=sys.stderr,
        )
    return RenderFonts(primary="Courier")
@dataclass(frozen=True)
class ExtractResult:
    """Outcome of selecting the lines that go into the PDF."""

    # Lines selected for output (all of them, or the head + tail when capped).
    lines: list[str]
    # Number of pages the selected lines occupy (at least 1).
    pages: int
    # True when the listing exceeded the page cap and was cut down.
    capped: bool
    # Line count before any capping.
    original_non_empty_lines: int
    # Page count the uncapped listing would need (at least 1).
    original_pages_ceil: int
def should_skip_dir(name: str) -> bool:
    """Return True when a directory called ``name`` is in ``SKIP_DIR_NAMES``."""
    is_excluded = name in SKIP_DIR_NAMES
    return is_excluded
def iter_source_files(root: Path) -> Iterable[Path]:
    """Yield every file under ``root`` whose suffix is in ``INCLUDE_SUFFIXES``.

    Directories rejected by ``should_skip_dir`` are not descended into, and
    symbolic links to directories are not followed.  The suffix comparison
    is case-insensitive.
    """
    for current_dir, subdirs, names in os.walk(root, followlinks=False):
        # Prune in place so os.walk does not enter the skipped directories.
        subdirs[:] = [sub for sub in subdirs if not should_skip_dir(sub)]
        folder = Path(current_dir)
        candidates = (folder / name for name in names)
        yield from (
            candidate
            for candidate in candidates
            if candidate.suffix.lower() in INCLUDE_SUFFIXES
        )
def path_for_pdf_listing(fp: Path) -> str:
    """Convert a file path to the form shown in the PDF.

    A path below ``FILE_PATH_ABSOLUTE_PREFIX`` is rewritten as
    ``FILE_PATH_DISPLAY_ROOT`` followed by the POSIX-style remainder, e.g.
    ``.../hgtk/foo`` -> ``上海华嘎科技有限公司/foo``.  A path outside that prefix
    is returned unchanged in POSIX form.
    """
    try:
        # Resolve both sides so symlinks and relative paths compare correctly.
        relative = fp.resolve().relative_to(
            FILE_PATH_ABSOLUTE_PREFIX.expanduser().resolve()
        )
    except ValueError:
        # Not under the configured prefix.
        return fp.as_posix()
    return f"{FILE_PATH_DISPLAY_ROOT}/{relative.as_posix()}"
def read_nonempty_lines_from_files(files: list[Path]) -> list[str]:
    """Read ``files`` in order and return their non-blank lines.

    Each file is decoded as UTF-8 with undecodable bytes replaced.  Lines
    that are empty or whitespace-only are dropped; all other lines are kept
    unchanged.  When ``ADD_FILE_MARKERS`` is true, a marker line naming the
    file precedes its content.

    A file that cannot be read (``OSError``) is reported on stderr and
    skipped without a marker, so one unreadable file does not abort the
    whole run.
    """
    out: list[str] = []
    for fp in files:
        # Read before emitting the marker so a skipped file leaves no trace.
        try:
            text = fp.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            print(f"警告:无法读取文件,已跳过:{fp}({exc})", file=sys.stderr)
            continue
        if ADD_FILE_MARKERS:
            out.append(f"{FILE_MARKER_PREFIX}{path_for_pdf_listing(fp)} =====")
        out.extend(line for line in text.splitlines() if line.strip())
    return out
def extract_for_pdf(
    lines: list[str],
    *,
    lines_per_page: int = LINES_PER_PAGE,
    max_pages: int = MAX_PAGES,
) -> ExtractResult:
    """Select the lines to print, capping the listing at ``max_pages`` pages.

    The page count is estimated as ``ceil(len(lines) / lines_per_page)``.
    When it exceeds ``max_pages``, only the leading and trailing pages are
    kept: ``max_pages // 2`` pages from the end and the remaining pages
    from the start (an equal split when ``max_pages`` is even).  Otherwise
    all lines are kept.

    Args:
        lines: The non-blank lines of the whole listing.
        lines_per_page: Lines per page; must be at least 1.
        max_pages: Maximum number of pages to output; must be at least 1.

    Returns:
        An ``ExtractResult`` with the selected lines and the page statistics.

    Raises:
        ValueError: If ``lines_per_page`` or ``max_pages`` is less than 1.
    """
    if lines_per_page < 1 or max_pages < 1:
        raise ValueError("lines_per_page 与 max_pages 必须为正整数")

    original_non_empty_lines = len(lines)
    original_pages_ceil = max(1, math.ceil(original_non_empty_lines / lines_per_page))
    capped = original_pages_ceil > max_pages
    if capped:
        tail_pages = max_pages // 2
        head_pages = max_pages - tail_pages
        head_n = lines_per_page * head_pages
        tail_n = lines_per_page * tail_pages
        # Slice the tail from an explicit start index: lines[-0:] would
        # return the whole list when tail_n is 0.
        selected = lines[:head_n] + lines[original_non_empty_lines - tail_n :]
    else:
        selected = list(lines)
    pages = max(1, math.ceil(len(selected) / lines_per_page))
    return ExtractResult(
        lines=selected,
        pages=pages,
        capped=capped,
        original_non_empty_lines=original_non_empty_lines,
        original_pages_ceil=original_pages_ceil,
    )
def pad_to_multiple(lines: list[str], lines_per_page: int) -> list[str]:
    """Pad ``lines`` with empty strings to a whole number of pages.

    An empty input becomes one blank page.  When the length is already a
    multiple of ``lines_per_page`` the input list itself is returned;
    otherwise a new, padded list is returned.
    """
    if len(lines) == 0:
        return [""] * lines_per_page
    _, used_on_last_page = divmod(len(lines), lines_per_page)
    if used_on_last_page:
        return lines + [""] * (lines_per_page - used_on_last_page)
    return lines
def fit_line_to_width(
    line: str, fonts: RenderFonts, font_size: float, max_width: float
) -> str:
    """Return ``line`` shortened, if necessary, to fit within ``max_width``.

    A line that already fits is returned unchanged.  Otherwise the longest
    prefix that fits together with a trailing ellipsis is returned with
    the ellipsis appended, so the truncation is visible in the output.  If
    no prefix fits, only the ellipsis is returned.
    """
    if _mixed_line_width(line, fonts, font_size) <= max_width:
        return line

    ell = "…"
    ell_w = _mixed_line_width(ell, fonts, font_size)

    # Binary search for the largest prefix length whose width plus the
    # ellipsis still fits (prefix width never decreases with length).
    lo, hi = 0, len(line)
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if _mixed_line_width(line[:mid], fonts, font_size) + ell_w <= max_width:
            lo = mid
        else:
            hi = mid - 1
    return line[:lo] + ell
def draw_pdf(
    out_path: Path,
    all_lines: list[str],
    *,
    fonts: RenderFonts,
    lines_per_page: int,
    software_name: str,
    version: str,
    font_size: float = 8.5,
) -> None:
    """Write ``all_lines`` to an A4 PDF at ``out_path``.

    Every page carries a header (software name and version on the left,
    page number on the right), a separator rule, and ``lines_per_page``
    evenly spaced text lines inside 2 cm margins.  Lines wider than the
    text area are shortened with ``fit_line_to_width``; the number of
    shortened lines is reported on stderr.

    Raises:
        ValueError: If ``len(all_lines)`` is not a multiple of
            ``lines_per_page`` (pad the list first).
    """
    page_w, page_h = A4
    margin_left = margin_right = margin_top = margin_bottom = 2 * cm

    pdf = canvas.Canvas(str(out_path), pagesize=A4)
    pdf.setTitle(f"{software_name} {version} — 源代码节选")
    pdf.setAuthor(software_name)

    # Vertical layout: header baseline, rule below it, then the text band.
    header_y = page_h - margin_top - 0.35 * cm
    rule_y = header_y - 0.35 * cm
    first_line_y = rule_y - 0.25 * cm
    last_line_y = margin_bottom + 0.5 * cm
    band_height = first_line_y - last_line_y
    line_step = (
        band_height / (lines_per_page - 1)
        if lines_per_page > 1
        else max(band_height, 1.0)
    )
    usable_width = page_w - margin_left - margin_right

    if len(all_lines) % lines_per_page != 0:
        raise ValueError("all_lines 长度必须是 lines_per_page 的整数倍(请先 pad")

    header_text = f"{software_name} {version}"
    number_font = fonts.latin_font or fonts.primary
    truncated = 0
    for page_index in range(len(all_lines) // lines_per_page):
        # Header: title on the left, right-aligned page number.
        _draw_line(pdf, margin_left, header_y, header_text, fonts, font_size)
        label = str(page_index + 1)
        label_w = pdfmetrics.stringWidth(label, number_font, font_size)
        pdf.setFont(number_font, font_size)
        pdf.drawString(page_w - margin_right - label_w, header_y, label)
        pdf.setLineWidth(0.3)
        pdf.line(margin_left, rule_y, page_w - margin_right, rule_y)

        first = page_index * lines_per_page
        for row, raw in enumerate(all_lines[first : first + lines_per_page]):
            fitted = fit_line_to_width(raw, fonts, font_size, usable_width)
            if raw and fitted != raw:
                truncated += 1
            _draw_line(
                pdf,
                margin_left,
                first_line_y - row * line_step,
                fitted,
                fonts,
                font_size,
            )
        pdf.showPage()

    if truncated:
        print(
            f"提示:共有 {truncated} 行因超宽被截断(可减小字号或换更窄字体)。",
            file=sys.stderr,
        )
    pdf.save()
def parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) and return the options.

    All options are optional.  ``--root``, ``--out`` and ``--font`` are
    paths, ``--name`` and ``--version`` (stored as ``version_str``) are
    strings, ``--font-size`` is a float defaulting to 8.3, and ``--no-cid``
    is a flag.  Options left out are ``None`` (``False`` for the flag).
    """
    parser = argparse.ArgumentParser(
        description="将源码目录整理为软著申报用 PDFreportlab",
    )

    # The three path options share the same shape.
    path_options = (
        ("--root", "源码根目录(默认使用脚本内 SOURCE_ROOT"),
        ("--out", "输出 PDF 路径(默认使用脚本内 OUTPUT_PDF"),
        ("--font", "CJK 等宽字体 TTF/OTF默认使用脚本内 CJK_MONO_FONT_PATH"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, default=None, help=help_text)

    parser.add_argument(
        "--name", default=None, help="软件全称(默认脚本内 SOFTWARE_FULL_NAME"
    )
    parser.add_argument(
        "--version",
        dest="version_str",
        default=None,
        help="版本号(默认脚本内 VERSION",
    )
    parser.add_argument(
        "--font-size",
        type=float,
        default=8.3,
        help="正文字号(略小可减少超长行截断,默认 8.3",
    )
    parser.add_argument(
        "--no-cid",
        action="store_true",
        help="不使用 Adobe STSong-LightCID仅尝试 TrueType 轮廓Menlo 等),中文更易缺失",
    )
    return parser.parse_args()
def main() -> int:
    """Run the tool: collect sources, select lines, and write the PDF.

    Command-line values take precedence over the module-level defaults.

    Returns:
        0 on success, 1 when the source root is not a directory.
    """
    args = parse_args()

    root = args.root or SOURCE_ROOT
    if not root.is_dir():
        print(f"错误:源码根目录不存在或不是目录:{root}", file=sys.stderr)
        return 1

    out_pdf = args.out or OUTPUT_PDF
    font_path = args.font if args.font is not None else CJK_MONO_FONT_PATH
    name = args.name or SOFTWARE_FULL_NAME
    ver = args.version_str or VERSION

    # Sort by POSIX path so the listing order is stable.
    files = sorted(iter_source_files(root), key=lambda p: p.as_posix())
    result = extract_for_pdf(read_nonempty_lines_from_files(files))
    padded = pad_to_multiple(result.lines, LINES_PER_PAGE)

    fonts = register_font(font_path, no_cid=args.no_cid)
    out_pdf.parent.mkdir(parents=True, exist_ok=True)
    draw_pdf(
        out_pdf,
        padded,
        fonts=fonts,
        lines_per_page=LINES_PER_PAGE,
        software_name=name,
        version=ver,
        font_size=args.font_size,
    )

    out_pages = len(padded) // LINES_PER_PAGE
    summary = [
        "完成:",
        f"源非空行 {result.original_non_empty_lines}(按 50 行/页约 {result.original_pages_ceil} 页)",
        f"节选后 {len(result.lines)}",
        f"输出 {out_pdf}{out_pages}",
        f" capped={result.capped}",
    ]
    print(*summary)
    return 0
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())