# api/tests/test_reply_segments.py

"""segments_from_llm_response：与客户端 split 规则对齐的单元校验。"""

from app.agents.chat.reply_limits import (
    nonempty_segments_or_fallback,
    segments_from_llm_response,
    strip_leading_en_period_ack_for_chat,
    strip_markdown_for_chat,
    strip_parenthetical_asides_for_chat,
)


def test_split_marker():
    """A bare `[SPLIT]` marker yields one segment per side of the marker."""
    segments = segments_from_llm_response("a[SPLIT]b", max_segments=3)
    assert segments == ["a", "b"]


def test_paragraph_fallback_when_no_marker():
    """Without a marker, two sentence-length paragraphs come back as two segments."""
    first = "太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。"
    second = "说到舞台，我忽然想起你黄浦江边的童年。从看着江水流淌，到在舞台上演绎别人的悲欢。"
    # The paragraphs are joined by a blank line and no explicit split marker.
    reply = f"{first}\n\n{second}"
    assert segments_from_llm_response(reply, max_segments=3) == [first, second]


def test_short_paragraphs_not_split():
    """The one-character paragraphs `a` / `b` are returned together as one segment."""
    reply = "a\n\nb"
    result = segments_from_llm_response(reply, max_segments=3)
    assert result == [reply]


def test_nonempty_fallback_when_all_blank():
    """If every candidate segment is empty or whitespace, only the fallback is returned."""
    blank_segments = ["", "  "]
    result = nonempty_segments_or_fallback(blank_segments, fallback="ok")
    assert result == ["ok"]


def test_split_marker_strips_markdown():
    """Markdown emphasis markers are absent from segments split on `[SPLIT]`."""
    segments = segments_from_llm_response("**A**[SPLIT]_B_", max_segments=3)
    assert segments == ["A", "B"]


def test_paragraph_split_strips_markdown():
    """Paragraph-based splitting also drops bold markers and link syntax (label is kept)."""
    bold_paragraph = "**太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。**"
    link_paragraph = "[详情](https://e.com)说到舞台，我忽然想起你黄浦江边的童年。"
    expected = [
        "太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。",
        "详情说到舞台，我忽然想起你黄浦江边的童年。",
    ]
    actual = segments_from_llm_response(
        f"{bold_paragraph}\n\n{link_paragraph}", max_segments=3
    )
    assert actual == expected


def test_strip_markdown_for_chat_preserves_split_token():
    """Markdown stripping must leave the literal `[SPLIT]` token in the text."""
    cleaned = strip_markdown_for_chat("a **b** [SPLIT] c")
    assert "[SPLIT]" in cleaned


def test_strip_parenthetical_removes_stage_directions():
    """Full-width and ASCII parenthetical asides are removed from the text."""
    # (raw input, expected output), checked in the original order.
    cases = (
        ("你好（轻轻笑） lately", "你好 lately"),
        ("(sigh) okay", "okay"),
        ("a（一）（二）b", "ab"),
    )
    for raw, expected in cases:
        assert strip_parenthetical_asides_for_chat(raw) == expected


def test_segments_strip_parentheticals_before_split():
    """Segments of a `[SPLIT]` reply contain no parenthetical asides."""
    reply = "先说（轻轻笑）承接[SPLIT]再问一句"
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == ["先说承接", "再问一句"]


def test_strip_parenthetical_multiple_passes():
    """Several non-adjacent parenthetical asides in one string are all removed."""
    cleaned = strip_parenthetical_asides_for_chat("a（一）b（二）c")
    assert cleaned == "abc"


def test_strip_leading_en_period_ack():
    """A leading "嗯。"-style acknowledgement is dropped; a mid-sentence one is kept."""
    # (raw input, expected output); the last case must come back unchanged.
    cases = (
        ("嗯。后面正文", "后面正文"),
        ("嗯嗯。后面", "后面"),
        ("  嗯。  第二句", "第二句"),
        ("句中嗯。不打头", "句中嗯。不打头"),
    )
    for raw, expected in cases:
        assert strip_leading_en_period_ack_for_chat(raw) == expected


def test_segments_strip_leading_en_ack():
    """The leading "嗯。" acknowledgement is removed from every returned segment."""
    single = segments_from_llm_response("嗯。只有一句", max_segments=3)
    assert single == ["只有一句"]

    multiple = segments_from_llm_response("嗯。A[SPLIT]嗯。B", max_segments=3)
    assert multiple == ["A", "B"]


# ── 与客户端 MESSAGE_SPLIT_REGEX 对齐的容错拆段 ───────────────
# 防回归：避免后端只发 1 条 AGENT_RESPONSE，文本里残留 `[ SPLIT ]` / `[split]` 等字面量，
# 导致前端用容错正则拆出空尾段后渲染出「假装在回复」的空气泡。


def test_split_marker_with_inner_whitespace():
    """LLMs occasionally write `[ SPLIT ]` with inner spaces.

    The backend must still split on it rather than leave the marker in the text.
    """
    cases = (
        ("第一段[ SPLIT ]第二段", ["第一段", "第二段"]),
        ("a [SPLIT ] b", ["a", "b"]),
        ("a [ SPLIT] b", ["a", "b"]),
    )
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected


def test_split_marker_case_insensitive():
    """Aligned with the client regex `/i` flag: `[split]` / `[Split]` are separators too."""
    replies = ("a[split]b", "a[Split]b", "a[SpLiT]b")
    for raw in replies:
        assert segments_from_llm_response(raw, max_segments=3) == ["a", "b"]


def test_split_marker_fullwidth_brackets():
    """Full-width bracket variants 【SPLIT】 / ［SPLIT］ should split normally.

    The model occasionally emits these in Chinese contexts.
    """
    replies = ("第一段【SPLIT】第二段", "第一段［SPLIT］第二段")
    for raw in replies:
        segments = segments_from_llm_response(raw, max_segments=3)
        assert segments == ["第一段", "第二段"]


def test_split_marker_with_zero_width_chars():
    """LLMs occasionally insert ZWSP/ZWNJ/ZWJ/BOM around the separator.

    The text should be normalized before splitting, so none of them survive.
    """
    cases = (
        ("第一段\u200b[SPLIT]\u200c第二段", ["第一段", "第二段"]),
        ("first\ufeff[ SPLIT ]\u200dsecond", ["first", "second"]),
    )
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected


def test_split_marker_trailing_only_returns_single_segment():
    """A trailing `[SPLIT]` leaves exactly one non-empty segment.

    No empty tail segment should remain to pollute client-side splitting.
    """
    replies = ("hello[SPLIT]", "hello [ SPLIT ]", "hello【SPLIT】")
    for raw in replies:
        assert segments_from_llm_response(raw, max_segments=3) == ["hello"]


def test_split_marker_combined_variants():
    """Mixed case + full-width brackets + spaces still split normally.

    This holds as long as normalization matches the client.
    """
    cases = (
        ("a【 split 】b", ["a", "b"]),
        ("a［ SPLIT ］b[Split]c", ["a", "b", "c"]),
    )
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								"""segments_from_llm_response：与客户端 split 规则对齐的单元校验。"""
 								from app.agents.chat.reply_limits import (
 								    nonempty_segments_or_fallback,
 								    segments_from_llm_response,
-												fix:
1. 修复安卓部分机型顶部安全区遮挡回忆录标题的问题；
2. 降低封面图生成阈值和展示逻辑，独立封面图未生成时，使用正文图；
3. 去掉“嗯。”生硬回答，去掉不合理段首承接词；
4. 新增章节封面所需最少插图数的配置项

											
										
										
											2026-04-16 20:42:54 +08:00
+								    strip_leading_en_period_ack_for_chat,
-												fix/ 修复AI聊天时回复markdown导致聊天气泡布局问题

											
										
										
											2026-04-03 14:06:55 +08:00
+								    strip_markdown_for_chat,
-												feat(chat): host-style memoir prompts and strip parenthetical stage directions

- Add strip_parenthetical_asides_for_chat in reply pipeline before [SPLIT]
- Expand output_rules bans (performance parens) and voice as warm host
- Refocus opening/guided prompts on pulling conversation toward memoir oral history
- Align interview opening fallbacks with memoir-first tone
- Add unit tests for parenthetical stripping

											
										
										
											2026-04-10 13:55:08 +08:00
+								    strip_parenthetical_asides_for_chat,
-												feat(memory,conversation): 记忆富化/证据包、时间线幂等字段与对话分段全链路

数据库
- 新增迁移 0003：timeline_events.memory_source_id 外键 → memory_sources，便于按 ingest 源做时间线幂等

后端 - 记忆
- 新增 ingest 后 LLM 富化（摘要/事实/时间线），可配置开关与最大字符数
- 新增证据包组装：合并 chunk、摘要、事实、时间线、故事等检索结果；支持空 query 时是否仍带 rolling 等开关
- repo/retriever/service/router/schemas/summarizer/timeline/extractor 等扩展；文档 memory-retrieval.md 更新

后端 - 对话 WS
- 增加 PING/PONG；分段 ASR 日志与空音频处理；转写失败与「无助手回复」错误提示更明确
- 助手多段回复持久化使用统一分隔符，与分段逻辑一致

后端 - Agent
- reply_limits：按 [SPLIT] 与段落拆段，并保证非空 fallback，供 WS 与 TTS 多段下发

后端 - 回忆录任务
- transcript ingest 记录 source_id；任务成功结?

											
										
										
											2026-03-27 16:01:28 +08:00
+								)
 								def test_split_marker():
 								    assert segments_from_llm_response("a[SPLIT]b", max_segments=3) == ["a", "b"]
 								def test_paragraph_fallback_when_no_marker():
 								    a = "太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。"
 								    b = "说到舞台，我忽然想起你黄浦江边的童年。从看着江水流淌，到在舞台上演绎别人的悲欢。"
 								    assert segments_from_llm_response(f"{a}\n\n{b}", max_segments=3) == [a, b]
 								def test_short_paragraphs_not_split():
 								    t = "a\n\nb"
 								    assert segments_from_llm_response(t, max_segments=3) == [t]
 								def test_nonempty_fallback_when_all_blank():
 								    assert nonempty_segments_or_fallback(["", "  "], fallback="ok") == ["ok"]
-												fix/ 修复AI聊天时回复markdown导致聊天气泡布局问题

											
										
										
											2026-04-03 14:06:55 +08:00
 								def test_split_marker_strips_markdown():
 								    assert segments_from_llm_response("**A**[SPLIT]_B_", max_segments=3) == ["A", "B"]
 								def test_paragraph_split_strips_markdown():
 								    a = "**太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。**"
 								    b = "[详情](https://e.com)说到舞台，我忽然想起你黄浦江边的童年。"
 								    assert segments_from_llm_response(f"{a}\n\n{b}", max_segments=3) == [
 								        "太为你高兴了！在上海大剧院的舞台绽放，聚光灯下的你。",
 								        "详情说到舞台，我忽然想起你黄浦江边的童年。",
 								    ]
 								def test_strip_markdown_for_chat_preserves_split_token():
 								    assert "[SPLIT]" in strip_markdown_for_chat("a **b** [SPLIT] c")
-												feat(chat): host-style memoir prompts and strip parenthetical stage directions

- Add strip_parenthetical_asides_for_chat in reply pipeline before [SPLIT]
- Expand output_rules bans (performance parens) and voice as warm host
- Refocus opening/guided prompts on pulling conversation toward memoir oral history
- Align interview opening fallbacks with memoir-first tone
- Add unit tests for parenthetical stripping

											
										
										
											2026-04-10 13:55:08 +08:00
 								def test_strip_parenthetical_removes_stage_directions():
 								    assert strip_parenthetical_asides_for_chat("你好（轻轻笑） lately") == "你好 lately"
 								    assert strip_parenthetical_asides_for_chat("(sigh) okay") == "okay"
 								    assert strip_parenthetical_asides_for_chat("a（一）（二）b") == "ab"
 								def test_segments_strip_parentheticals_before_split():
 								    assert segments_from_llm_response(
 								        "先说（轻轻笑）承接[SPLIT]再问一句", max_segments=3
 								    ) == ["先说承接", "再问一句"]
 								def test_strip_parenthetical_multiple_passes():
 								    assert strip_parenthetical_asides_for_chat("a（一）b（二）c") == "abc"
-												fix:
1. 修复安卓部分机型顶部安全区遮挡回忆录标题的问题；
2. 降低封面图生成阈值和展示逻辑，独立封面图未生成时，使用正文图；
3. 去掉“嗯。”生硬回答，去掉不合理段首承接词；
4. 新增章节封面所需最少插图数的配置项

											
										
										
											2026-04-16 20:42:54 +08:00
 								def test_strip_leading_en_period_ack():
 								    assert strip_leading_en_period_ack_for_chat("嗯。后面正文") == "后面正文"
 								    assert strip_leading_en_period_ack_for_chat("嗯嗯。后面") == "后面"
 								    assert strip_leading_en_period_ack_for_chat("  嗯。  第二句") == "第二句"
 								    assert strip_leading_en_period_ack_for_chat("句中嗯。不打头") == "句中嗯。不打头"
 								def test_segments_strip_leading_en_ack():
 								    assert segments_from_llm_response("嗯。只有一句", max_segments=3) == ["只有一句"]
 								    assert segments_from_llm_response("嗯。A[SPLIT]嗯。B", max_segments=3) == ["A", "B"]
-												feat(i18n): persist language preference and thread through chat, memoir, TTS

- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS
  only; expose on auth and profile APIs
- Lite English prompts for chat and memoir; localized stage labels and agent
  names (Life Echo / 岁月知己)
- Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking
- WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs
  for tts_this_turn and TTS decisions; on-demand TTS logging
- Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes
- Tests for migration, prompts, pipeline, router tts_this_turn, reply segments

Co-authored-by: Cursor <cursoragent@cursor.com>

											
										
										
											2026-05-11 16:16:49 +08:00
 								# ── 与客户端 MESSAGE_SPLIT_REGEX 对齐的容错拆段 ───────────────
 								# 防回归：避免后端只发 1 条 AGENT_RESPONSE，文本里残留 `[ SPLIT ]` / `[split]` 等字面量，
 								# 导致前端用容错正则拆出空尾段后渲染出「假装在回复」的空气泡。
 								def test_split_marker_with_inner_whitespace():
 								    """LLM 偶尔会写 `[ SPLIT ]` 带空格，后端必须按分隔符拆，不能留在文本里。"""
 								    assert segments_from_llm_response("第一段[ SPLIT ]第二段", max_segments=3) == [
 								        "第一段",
 								        "第二段",
 								    ]
 								    assert segments_from_llm_response("a [SPLIT ] b", max_segments=3) == ["a", "b"]
 								    assert segments_from_llm_response("a [ SPLIT] b", max_segments=3) == ["a", "b"]
 								def test_split_marker_case_insensitive():
 								    """与客户端正则 `/i` 对齐：`[split]` / `[Split]` 同样视为分隔符。"""
 								    assert segments_from_llm_response("a[split]b", max_segments=3) == ["a", "b"]
 								    assert segments_from_llm_response("a[Split]b", max_segments=3) == ["a", "b"]
 								    assert segments_from_llm_response("a[SpLiT]b", max_segments=3) == ["a", "b"]
 								def test_split_marker_fullwidth_brackets():
 								    """模型在中文环境下偶尔输出全角括号 【SPLIT】 / ［SPLIT］，应正常拆段。"""
 								    assert segments_from_llm_response("第一段【SPLIT】第二段", max_segments=3) == [
 								        "第一段",
 								        "第二段",
 								    ]
 								    assert segments_from_llm_response("第一段［SPLIT］第二段", max_segments=3) == [
 								        "第一段",
 								        "第二段",
 								    ]
 								def test_split_marker_with_zero_width_chars():
 								    """LLM 偶尔会在分隔符前后插入 ZWSP/ZWNJ/ZWJ/BOM，应先归一化再拆段。"""
 								    assert segments_from_llm_response(
 								        "第一段\u200b[SPLIT]\u200c第二段", max_segments=3
 								    ) == ["第一段", "第二段"]
 								    assert segments_from_llm_response(
 								        "first\ufeff[ SPLIT ]\u200dsecond", max_segments=3
 								    ) == ["first", "second"]
 								def test_split_marker_trailing_only_returns_single_segment():
 								    """`[SPLIT]` 出现在结尾时只剩一段非空内容，不应留下空尾段污染前端拆段。"""
 								    assert segments_from_llm_response("hello[SPLIT]", max_segments=3) == ["hello"]
 								    assert segments_from_llm_response("hello [ SPLIT ]", max_segments=3) == ["hello"]
 								    assert segments_from_llm_response("hello【SPLIT】", max_segments=3) == ["hello"]
 								def test_split_marker_combined_variants():
 								    """混合大小写 + 全角 + 空格：与客户端规范化一致即可正常拆段。"""
 								    assert segments_from_llm_response("a【 split 】b", max_segments=3) == ["a", "b"]
 								    assert segments_from_llm_response("a［ SPLIT ］b[Split]c", max_segments=3) == [
 								        "a",
 								        "b",
 								        "c",
 								    ]