# life-echo/api/app/features/memory/chunker.py

"""Transcript chunker — split raw text into retrieval-ready chunks."""


def chunk_transcript(
    text: str, *, max_chars: int = 800, overlap_chars: int = 100
) -> list[str]:
    """Split transcript text into overlapping, whitespace-stripped chunks.

    Chunk size is measured in characters, used here as a rough proxy for
    token count. Each window is at most ``max_chars`` characters long. When a
    window does not reach the end of the text, it is shortened to end just
    after the last occurrence of a sentence separator, provided that
    separator lies past the midpoint of the window. Separators are tried in
    priority order and the first one that qualifies wins.

    Args:
        text: Raw transcript. Empty, ``None`` or whitespace-only input
            yields an empty list.
        max_chars: Maximum window length in characters; must be positive.
        overlap_chars: Number of characters shared between consecutive
            windows; must be non-negative. Values of ``max_chars`` or more
            are clamped to ``max_chars - 1``.

    Returns:
        The chunks in document order. Chunks that are empty after stripping
        are omitted. The last chunk always ends at the end of the text; no
        extra chunk consisting only of the overlap tail is produced.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap_chars`` is
            negative. Arguments are validated before any early return, so
            invalid values are rejected regardless of the text length.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if overlap_chars < 0:
        raise ValueError("overlap_chars cannot be negative")

    text = text.strip() if text else ""
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    # Overlap must be strictly smaller than the window so each step advances.
    overlap = min(overlap_chars, max_chars - 1)
    separators = ("。", "！", "？", "\n", "；", ".", "!", "?")
    text_len = len(text)

    chunks: list[str] = []
    start = 0

    while start < text_len:
        end = min(start + max_chars, text_len)
        chunk = text[start:end]

        # Prefer to cut at the end of a sentence, but only if that keeps more
        # than half of the window; otherwise fall back to a hard cut.
        if end < text_len:
            for sep in separators:
                last_sep = chunk.rfind(sep)
                if last_sep > max_chars // 2:
                    chunk = chunk[: last_sep + 1]
                    end = start + len(chunk)
                    break

        stripped = chunk.strip()
        if stripped:
            chunks.append(stripped)

        # This window reached the end of the text. Stepping back by the
        # overlap would only re-emit a suffix of the chunk just produced.
        if end >= text_len:
            break

        next_start = end - overlap
        if next_start <= start:
            # Overlap is at least as long as this (shortened) chunk: drop the
            # overlap for this step to guarantee forward progress.
            next_start = end
        start = next_start

    return chunks