- Add users.language_preference (Alembic 0018, default zh); capture at signup/SMS only; expose on auth and profile APIs - Lite English prompts for chat and memoir; localized stage labels and agent names (Life Echo / 岁月知己) - Tencent TTS: language-aware synthesis, ModelType=1 for 501004, English chunking - WebSocket pipeline: emit all AGENT_RESPONSE segments when TTS cancels; INFO logs for tts_this_turn and TTS decisions; on-demand TTS logging - Expo: device language on auth, i18n tiers/agent name, [SPLIT] streaming UX fixes - Tests for migration, prompts, pipeline, router tts_this_turn, reply segments Co-authored-by: Cursor <cursoragent@cursor.com>
135 lines
5.7 KiB
Python
135 lines
5.7 KiB
Python
"""segments_from_llm_response:与客户端 split 规则对齐的单元校验。"""
|
||
|
||
from app.agents.chat.reply_limits import (
|
||
nonempty_segments_or_fallback,
|
||
segments_from_llm_response,
|
||
strip_leading_en_period_ack_for_chat,
|
||
strip_markdown_for_chat,
|
||
strip_parenthetical_asides_for_chat,
|
||
)
|
||
|
||
|
||
def test_split_marker():
    """A plain `[SPLIT]` marker divides the reply into two segments."""
    raw = "a[SPLIT]b"
    segments = segments_from_llm_response(raw, max_segments=3)
    assert segments == ["a", "b"]
|
||
|
||
|
||
def test_paragraph_fallback_when_no_marker():
    """Without a split marker, two blank-line-separated paragraphs become two segments."""
    a = "太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。"
    b = "说到舞台,我忽然想起你黄浦江边的童年。从看着江水流淌,到在舞台上演绎别人的悲欢。"
    reply = f"{a}\n\n{b}"
    segments = segments_from_llm_response(reply, max_segments=3)
    assert segments == [a, b]
|
||
|
||
|
||
def test_short_paragraphs_not_split():
    """Very short paragraphs separated by a blank line come back as one unsplit segment."""
    t = "a\n\nb"
    segments = segments_from_llm_response(t, max_segments=3)
    assert segments == [t]
|
||
|
||
|
||
def test_nonempty_fallback_when_all_blank():
    """When every segment is empty or whitespace, the fallback text is returned alone."""
    blank_segments = ["", " "]
    result = nonempty_segments_or_fallback(blank_segments, fallback="ok")
    assert result == ["ok"]
|
||
|
||
|
||
def test_split_marker_strips_markdown():
    """Bold and italic markers are removed from each marker-delimited segment."""
    raw = "**A**[SPLIT]_B_"
    segments = segments_from_llm_response(raw, max_segments=3)
    assert segments == ["A", "B"]
|
||
|
||
|
||
def test_paragraph_split_strips_markdown():
    """Paragraph-based splitting also strips bold markers and link syntax."""
    a = "**太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。**"
    b = "[详情](https://e.com)说到舞台,我忽然想起你黄浦江边的童年。"
    expected = [
        "太为你高兴了!在上海大剧院的舞台绽放,聚光灯下的你。",
        "详情说到舞台,我忽然想起你黄浦江边的童年。",
    ]
    segments = segments_from_llm_response(f"{a}\n\n{b}", max_segments=3)
    assert segments == expected
|
||
|
||
|
||
def test_strip_markdown_for_chat_preserves_split_token():
    """Markdown stripping must leave the literal `[SPLIT]` token in the text."""
    cleaned = strip_markdown_for_chat("a **b** [SPLIT] c")
    assert "[SPLIT]" in cleaned
|
||
|
||
|
||
def test_strip_parenthetical_removes_stage_directions():
    """Parenthesised asides are removed, including leading and adjacent ones."""
    cases = [
        ("你好(轻轻笑) lately", "你好 lately"),
        ("(sigh) okay", "okay"),
        ("a(一)(二)b", "ab"),
    ]
    # Cases run in their original order so the first failure is the same one.
    for raw, expected in cases:
        assert strip_parenthetical_asides_for_chat(raw) == expected
|
||
|
||
|
||
def test_segments_strip_parentheticals_before_split():
    """Segments returned for a marker-split reply contain no parenthesised aside."""
    raw = "先说(轻轻笑)承接[SPLIT]再问一句"
    segments = segments_from_llm_response(raw, max_segments=3)
    assert segments == ["先说承接", "再问一句"]
|
||
|
||
|
||
def test_strip_parenthetical_multiple_passes():
    """Several separate parenthesised asides in one string are all removed."""
    cleaned = strip_parenthetical_asides_for_chat("a(一)b(二)c")
    assert cleaned == "abc"
|
||
|
||
|
||
def test_strip_leading_en_period_ack():
    """A leading "嗯。" acknowledgement is dropped; the same text mid-sentence is kept."""
    cases = [
        ("嗯。后面正文", "后面正文"),
        ("嗯嗯。后面", "后面"),
        (" 嗯。 第二句", "第二句"),
        # Not at the start of the text, so the input must come back unchanged.
        ("句中嗯。不打头", "句中嗯。不打头"),
    ]
    for raw, expected in cases:
        assert strip_leading_en_period_ack_for_chat(raw) == expected
|
||
|
||
|
||
def test_segments_strip_leading_en_ack():
    """The leading "嗯。" acknowledgement is removed from every returned segment."""
    cases = [
        ("嗯。只有一句", ["只有一句"]),
        ("嗯。A[SPLIT]嗯。B", ["A", "B"]),
    ]
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected
|
||
|
||
|
||
# ── 与客户端 MESSAGE_SPLIT_REGEX 对齐的容错拆段 ───────────────
|
||
# 防回归:避免后端只发 1 条 AGENT_RESPONSE,文本里残留 `[ SPLIT ]` / `[split]` 等字面量,
|
||
# 导致前端用容错正则拆出空尾段后渲染出「假装在回复」的空气泡。
|
||
|
||
|
||
def test_split_marker_with_inner_whitespace():
    """The LLM occasionally writes `[ SPLIT ]` with inner spaces; the backend
    must split on it as a separator rather than leave it in the text."""
    cases = [
        ("第一段[ SPLIT ]第二段", ["第一段", "第二段"]),
        ("a [SPLIT ] b", ["a", "b"]),
        ("a [ SPLIT] b", ["a", "b"]),
    ]
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected
|
||
|
||
|
||
def test_split_marker_case_insensitive():
    """Aligned with the client regex `/i` flag: `[split]` / `[Split]` are
    treated as separators too."""
    for marker in ("[split]", "[Split]", "[SpLiT]"):
        raw = "a" + marker + "b"
        assert segments_from_llm_response(raw, max_segments=3) == ["a", "b"]
|
||
|
||
|
||
def test_split_marker_fullwidth_brackets():
    """In Chinese contexts the model occasionally emits full-width bracket
    variants of the marker; these should split the text normally."""
    expected = ["第一段", "第二段"]
    # NOTE(review): the second input uses ASCII brackets as it appears in this
    # file; confirm whether full-width square brackets were intended here.
    inputs = (
        "第一段【SPLIT】第二段",
        "第一段[SPLIT]第二段",
    )
    for raw in inputs:
        assert segments_from_llm_response(raw, max_segments=3) == expected
|
||
|
||
|
||
def test_split_marker_with_zero_width_chars():
    """The LLM occasionally inserts ZWSP/ZWNJ/ZWJ/BOM around the separator;
    the text should be normalized first and then split."""
    cases = [
        ("第一段\u200b[SPLIT]\u200c第二段", ["第一段", "第二段"]),
        ("first\ufeff[ SPLIT ]\u200dsecond", ["first", "second"]),
    ]
    for raw, expected in cases:
        assert segments_from_llm_response(raw, max_segments=3) == expected
|
||
|
||
|
||
def test_split_marker_trailing_only_returns_single_segment():
    """When the marker sits at the very end only one non-empty segment remains;
    no empty trailing segment may be left to pollute client-side splitting."""
    trailing_inputs = (
        "hello[SPLIT]",
        "hello [ SPLIT ]",
        "hello【SPLIT】",
    )
    for raw in trailing_inputs:
        assert segments_from_llm_response(raw, max_segments=3) == ["hello"]
|
||
|
||
|
||
def test_split_marker_combined_variants():
    """Mixed case + full-width brackets + spaces: the text splits normally as
    long as it matches the client-side normalization."""
    mixed = segments_from_llm_response("a【 split 】b", max_segments=3)
    assert mixed == ["a", "b"]

    chained = segments_from_llm_response("a[ SPLIT ]b[Split]c", max_segments=3)
    assert chained == ["a", "b", "c"]
|