Files
life-echo/api/tests/test_reply_segments.py
Kevin ccdc4e4277 feat(i18n): persist language preference and thread through chat, memoir, TTS
- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS
  only; expose on auth and profile APIs
- Lite English prompts for chat and memoir; localized stage labels and agent
  names (Life Echo / 岁月知己)
- Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking
- WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs
  for tts_this_turn and TTS decisions; on-demand TTS logging
- Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes
- Tests for migration, prompts, pipeline, router tts_this_turn, reply segments

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-11 16:16:49 +08:00

135 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""segments_from_llm_response与客户端 split 规则对齐的单元校验。"""
from app.agents.chat.reply_limits import (
nonempty_segments_or_fallback,
segments_from_llm_response,
strip_leading_en_period_ack_for_chat,
strip_markdown_for_chat,
strip_parenthetical_asides_for_chat,
)
def test_split_marker():
    """A literal ``[SPLIT]`` marker between two texts yields two segments."""
    segments = segments_from_llm_response("a[SPLIT]b", max_segments=3)
    assert segments == ["a", "b"]
def test_paragraph_fallback_when_no_marker():
    """Without a marker, two long paragraphs joined by a blank line come back as two segments."""
    first = "太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。"
    second = "说到舞台,我忽然想起你黄浦江边的童年。从看着江水流淌,到在舞台上演绎别人的悲欢。"
    reply = first + "\n\n" + second
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == [first, second]
def test_short_paragraphs_not_split():
    """Two one-character paragraphs are returned as a single, unchanged segment."""
    reply = "a\n\nb"
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == [reply]
def test_nonempty_fallback_when_all_blank():
    """When every candidate segment is empty or whitespace, only the fallback is returned."""
    blank_segments = ["", " "]
    result = nonempty_segments_or_fallback(blank_segments, fallback="ok")
    assert result == ["ok"]
def test_split_marker_strips_markdown():
    """Bold and italic markers are removed from each marker-delimited segment."""
    reply = "**A**[SPLIT]_B_"
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == ["A", "B"]
def test_paragraph_split_strips_markdown():
    """Paragraph-based splitting also drops bold markers and reduces links to their label."""
    bold_paragraph = "**太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。**"
    link_paragraph = "[详情](https://e.com)说到舞台,我忽然想起你黄浦江边的童年。"
    expected = [
        "太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。",
        "详情说到舞台,我忽然想起你黄浦江边的童年。",
    ]
    reply = bold_paragraph + "\n\n" + link_paragraph
    assert segments_from_llm_response(reply, max_segments=3) == expected
def test_strip_markdown_for_chat_preserves_split_token():
    """Markdown stripping must leave the ``[SPLIT]`` token in the text."""
    cleaned = strip_markdown_for_chat("a **b** [SPLIT] c")
    assert "[SPLIT]" in cleaned
def test_strip_parenthetical_removes_stage_directions():
    """Parenthesised stage directions are removed; text without them is unchanged."""
    cases = (
        # Full-width parentheses in the middle of the text.
        ("你好(轻轻笑) lately", "你好 lately"),
        # ASCII parentheses at the start of the text.
        ("(sigh) okay", "okay"),
        # NOTE(review): this input contains no parenthetical as written. The
        # repository viewer flags the file for ambiguous Unicode characters, so
        # confirm the literal is intact before relying on this case.
        ("ab", "ab"),
    )
    for raw, expected in cases:
        assert strip_parenthetical_asides_for_chat(raw) == expected
def test_segments_strip_parentheticals_before_split():
    """A parenthetical aside inside a marker-delimited reply is absent from the segments."""
    reply = "先说(轻轻笑)承接[SPLIT]再问一句"
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == ["先说承接", "再问一句"]
def test_strip_parenthetical_multiple_passes():
    """Stripping parenthetical asides from ``"abc"`` returns ``"abc"``."""
    # NOTE(review): the name implies several/nested parentheticals, but the
    # input as written has none. The repository viewer flags the file for
    # ambiguous Unicode characters, so confirm the literal is intact.
    result = strip_parenthetical_asides_for_chat("abc")
    assert result == "abc"
def test_strip_leading_en_period_ack():
    """A leading "嗯。"-style acknowledgement is dropped; one in mid-sentence is kept."""
    cases = (
        ("嗯。后面正文", "后面正文"),
        ("嗯嗯。后面", "后面"),
        # Surrounding whitespace does not prevent the leading acknowledgement from being removed.
        (" 嗯。 第二句", "第二句"),
        # Not at the start of the text: returned unchanged.
        ("句中嗯。不打头", "句中嗯。不打头"),
    )
    for raw, expected in cases:
        assert strip_leading_en_period_ack_for_chat(raw) == expected
def test_segments_strip_leading_en_ack():
    """The leading acknowledgement is removed from a single reply and from each split segment."""
    single = segments_from_llm_response("嗯。只有一句", max_segments=3)
    assert single == ["只有一句"]

    split = segments_from_llm_response("嗯。A[SPLIT]嗯。B", max_segments=3)
    assert split == ["A", "B"]
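# ── Tolerant segment splitting, aligned with the client's MESSAGE_SPLIT_REGEX ──
# Regression guard: keep the backend from sending a single AGENT_RESPONSE whose
# text still contains literal markers such as `[ SPLIT ]` / `[split]`. The
# client's tolerant regex would then split off an empty trailing segment and
# render an empty bubble that looks like a reply still in progress.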
def test_split_marker_with_inner_whitespace():
    """Markers with spaces inside the brackets still split.

    The LLM occasionally writes ``[ SPLIT ]`` with spaces; the backend must
    treat it as a separator instead of leaving it in the text.
    """
    cjk_segments = segments_from_llm_response("第一段[ SPLIT ]第二段", max_segments=3)
    assert cjk_segments == ["第一段", "第二段"]

    for reply in ("a [SPLIT ] b", "a [ SPLIT] b"):
        assert segments_from_llm_response(reply, max_segments=3) == ["a", "b"]
def test_split_marker_case_insensitive():
    """Matches the client regex's ``/i`` flag: ``[split]`` / ``[Split]`` are separators too."""
    for reply in ("a[split]b", "a[Split]b", "a[SpLiT]b"):
        segments = segments_from_llm_response(reply, max_segments=3)
        assert segments == ["a", "b"]
def test_split_marker_fullwidth_brackets():
    """Full-width bracket variants of the marker must split like ``[SPLIT]``.

    In Chinese contexts the model occasionally emits the marker with
    lenticular brackets (``【SPLIT】``) or full-width square brackets
    (U+FF3B / U+FF3D); both should split normally.
    """
    expected = ["第一段", "第二段"]

    assert segments_from_llm_response("第一段【SPLIT】第二段", max_segments=3) == expected

    # The full-width square brackets are spelled as explicit escapes: they are
    # visually confusable with ASCII brackets and had gone missing from this
    # literal, which left the case testing a bare "SPLIT" word instead.
    # NOTE(review): reconstructed from the test name/docstring — confirm that
    # U+FF3B/U+FF3D are the brackets the marker pattern is meant to accept.
    fullwidth_square = "第一段\uff3bSPLIT\uff3d第二段"
    assert segments_from_llm_response(fullwidth_square, max_segments=3) == expected
def test_split_marker_with_zero_width_chars():
    """Zero-width characters around the marker do not end up in the segments.

    The LLM occasionally inserts ZWSP/ZWNJ/ZWJ/BOM next to the separator; the
    text should be normalised before splitting.
    """
    zwsp_zwnj = "第一段\u200b[SPLIT]\u200c第二段"
    assert segments_from_llm_response(zwsp_zwnj, max_segments=3) == ["第一段", "第二段"]

    bom_zwj = "first\ufeff[ SPLIT ]\u200dsecond"
    assert segments_from_llm_response(bom_zwj, max_segments=3) == ["first", "second"]
def test_split_marker_trailing_only_returns_single_segment():
    """A marker at the very end leaves exactly one non-empty segment.

    No empty trailing segment should be left behind to pollute the client-side
    splitting.
    """
    trailing_marker_replies = ("hello[SPLIT]", "hello [ SPLIT ]", "hello【SPLIT】")
    for reply in trailing_marker_replies:
        assert segments_from_llm_response(reply, max_segments=3) == ["hello"]
def test_split_marker_combined_variants():
    """Mixed case, full-width brackets and inner spaces combined still split.

    As long as normalisation matches the client's, these variants should
    split normally.
    """
    lenticular = segments_from_llm_response("a【 split 】b", max_segments=3)
    assert lenticular == ["a", "b"]

    # The first marker uses full-width square brackets (U+FF3B / U+FF3D) with
    # inner spaces, spelled as explicit escapes: the brackets had gone missing
    # from this literal, which left the case testing a bare " SPLIT " word.
    # NOTE(review): reconstructed from the docstring's "full-width" wording —
    # confirm against the marker pattern the backend actually accepts.
    mixed = "a\uff3b SPLIT \uff3db[Split]c"
    assert segments_from_llm_response(mixed, max_segments=3) == ["a", "b", "c"]