# life-echo/api/app/features/memory/chunker.py

"""Transcript chunker — split raw text into retrieval-ready chunks."""

import re


def chunk_transcript(
    text: str, *, max_chars: int = 800, overlap_chars: int = 100
) -> list[str]:
    """Split transcript text into overlapping, sentence-aligned chunks.

    Character count is used as a cheap proxy for token count (the original
    note estimated ~4 chars/token; treat that as a rough heuristic).

    Each chunk holds at most ``max_chars`` characters. When a chunk is not
    the last one, it is shortened to end just after a sentence separator,
    provided that separator lies in the second half of the window.
    Consecutive chunks share up to ``overlap_chars`` characters so that
    context spanning a boundary is not lost.

    Args:
        text: Raw transcript text. Leading/trailing whitespace is ignored.
        max_chars: Maximum chunk length in characters; must be >= 1.
        overlap_chars: Desired number of characters repeated at the start
            of the next chunk. Clamped to ``[0, max_chars // 2]`` so that
            every iteration is guaranteed to move forward.

    Returns:
        The whitespace-stripped, non-empty chunks in document order; an
        empty list when ``text`` is empty or whitespace only.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if not text or not text.strip():
        return []

    text = text.strip()
    total = len(text)
    if total <= max_chars:
        return [text]

    # Separators are tried in priority order; the first one that occurs late
    # enough in the window wins (not necessarily the right-most boundary).
    separators = ("。", "！", "？", "\n", "；", ".", "!", "?")

    # A sentence cut always leaves a chunk longer than max_chars // 2, so
    # capping the overlap at that value guarantees start strictly increases.
    min_cut = max_chars // 2
    overlap = max(0, min(overlap_chars, min_cut))

    chunks: list[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)

        # Prefer to end the chunk at a sentence boundary, unless this window
        # already reaches the end of the text.
        if end < total:
            for sep in separators:
                cut = text.rfind(sep, start + min_cut + 1, end)
                if cut != -1:
                    end = cut + 1
                    break

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        # Stop once the tail has been emitted; stepping back by the overlap
        # here would only re-emit text already contained in the last chunk.
        if end >= total:
            break
        start = end - overlap

    return chunks