feat: 语音确认、联调与运维增强

- 语音：序数解析（第一个/第二个等）、解析失败计数与 API detail.retry_remaining；百度 ASR 固定 dev_pid 为普通话；SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo：demo 路由与假 RTSP、客户端 index 与 README；BackendResolver 与配置调整。
- 可观测：消耗 TSV 日志、语音文件日志、终端 Markdown 辅助；相关测试与依赖更新。
- 注意：.env 仍被 gitignore，本地密钥不会进入本提交。

Made-with: Cursor
2026-04-23 14:24:20 +08:00
parent 42720f81cf
commit 0c05463617
39 changed files with 3030 additions and 143 deletions
--- a/app/services/voice_confirm.py
+++ b/app/services/voice_confirm.py
@@ -32,12 +32,100 @@ _CN_DIGITS = {
 }


+def _parse_ordinal_index_1based(token: str) -> int | None:
+    """将「1」「3」「一」「三」「十一」等解析为 1-based 序数，失败返回 None。"""
+    t = (token or "").strip()
+    if not t:
+        return None
+    if t.isdigit():
+        v = int(t)
+        return v if 1 <= v <= 99 else None
+    if t in _CN_DIGITS and t != "零" and t != "十":
+        return int(_CN_DIGITS[t])
+    if t == "十":
+        return 10
+    if len(t) == 2 and t[0] == "十" and t[1] in _CN_DIGITS and t[1] not in ("零", "十"):
+        return 10 + int(_CN_DIGITS[t[1]])
+    if len(t) == 2 and t[1] == "十" and t[0] in _CN_DIGITS and t[0] != "零":
+        return int(_CN_DIGITS[t[0]]) * 10
+    if len(t) == 3 and t[0] in _CN_DIGITS and t[1] == "十" and t[2] in _CN_DIGITS:
+        return int(_CN_DIGITS[t[0]]) * 10 + int(_CN_DIGITS[t[2]])
+    return None
+
+
+def _label_from_ordinal_1based(n1: int, options: list[str]) -> str | None:
+    if n1 < 1:
+        return None
+    idx = n1 - 1
+    if 0 <= idx < len(options):
+        return options[idx]
+    return None
+
+
+def _choose_from_ordinal_text(raw: str, options: list[str]) -> str | None:
+    """从「第一个」「第2个」「选3」「1号」等表述解析选项。返回 None 表示本函数未识别。"""
+    n_opt = len(options)
+    if n_opt < 1:
+        return None
+
+    # 1) 显式「第N个/项/款/…」，允许夹带后噪声，如「第一个对」
+    for m in re.finditer(
+        r"第([0-9]+|[一二两三四五六七八九十百]+)(?:个|项|款|的|种|名)?", raw
+    ):
+        n1 = _parse_ordinal_index_1based(m.group(1))
+        if n1 is not None:
+            ch = _label_from_ordinal_1based(n1, options)
+            if ch is not None:
+                return ch
+    m_pick = re.search(
+        r"(?:^|[\s,，;；:：])(?:选|要|就)\s*0*([1-9]\d?)(?:\s*号|个|项|款)?",
+        raw,
+    )
+    if m_pick:
+        n1 = int(m_pick.group(1))
+        ch = _label_from_ordinal_1based(n1, options)
+        if ch is not None:
+            return ch
+    norm_for_opt = raw.replace(" ", "").lower()
+    m_op = re.search(r"(?:option|选项)\s*[:：]?\s*(\d+)", norm_for_opt, re.IGNORECASE)
+    if m_op:
+        n1 = int(m_op.group(1))
+        ch = _label_from_ordinal_1based(n1, options)
+        if ch is not None:
+            return ch
+
+    # 2) 行首/句末「一」「二」单字，仅当候选项数较少时
+    s = raw.replace(" ", "")
+    if n_opt <= 3:
+        m_one = re.match(r"^([一二两三四])$", s)
+        if m_one:
+            tok = m_one.group(1)
+            if tok in _CN_DIGITS and tok not in ("零", "十"):
+                n1 = int(_CN_DIGITS[tok])
+                ch = _label_from_ordinal_1based(n1, options)
+                if ch is not None:
+                    return ch
+    m_tail = re.search(r"([0-9一二两三四五六七八九十]+)\s*号$", s)
+    if m_tail:
+        n1 = _parse_ordinal_index_1based(m_tail.group(1))
+        if n1 is not None:
+            ch = _label_from_ordinal_1based(n1, options)
+            if ch is not None:
+                return ch
+
+    return None
+
+
 def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    """
    从识别文本中解析医生选择的耗材名称。
    支持：完全匹配、子串匹配、第 N 个（1/一/第一个）。
    """
-    raw = (asr_text or "").strip()
+    raw = re.sub(
+        r"^[。，、；：！？\s]+|[。，、；：！？\s]+$",
+        "",
+        (asr_text or "").strip(),
+    )
    if not raw:
        return None
    normalized = raw.replace(" ", "").lower()
@@ -46,6 +134,10 @@ def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
        if opt and opt in raw:
            return opt

+    chosen_ord = _choose_from_ordinal_text(raw, options)
+    if chosen_ord is not None:
+        return chosen_ord
+
    m_num = re.search(r"(\d+)", raw)
    if m_num:
        idx = int(m_num.group(1)) - 1
@@ -55,14 +147,11 @@ def parse_voice_choice(asr_text: str, options: list[str]) -> str | None:
    m_cn = re.search(r"第([一二两三四五六七八九十\d]+)个", raw)
    if m_cn:
        token = m_cn.group(1)
-        if token.isdigit():
-            idx = int(token) - 1
-        elif token in _CN_DIGITS:
-            idx = _CN_DIGITS[token] - 1
-        else:
-            idx = -1
-        if 0 <= idx < len(options):
-            return options[idx]
+        n1 = int(token) if token.isdigit() else _parse_ordinal_index_1based(token)
+        if n1 is not None:
+            ch = _label_from_ordinal_1based(n1, options)
+            if ch is not None:
+                return ch

    for i, opt in enumerate(options):
        if not opt:
@@ -107,13 +196,9 @@ def is_rejection_phrase(asr_text: str) -> bool:


 def build_prompt_text(options: list[tuple[str, float]]) -> str:
-    parts = [
-        "请确认刚才使用的耗材是下面哪一项，可以说序号或名称；"
-        "若是清单内其它耗材，也可以直接说该耗材名称。"
-    ]
+    parts = ["请确认刚才使用的耗材是下面哪一项。"]
    for i, (name, _conf) in enumerate(options, start=1):
        parts.append(f"第{i}个，{name}。")
-    parts.append("若都不是请说不是。")
    return "".join(parts)


@@ -228,6 +313,32 @@ class VoiceConfirmationOrchestrator:
            tmp.close()
        return path, None

+    async def speak_prompt(self, text: str) -> None:
+        """仅百度 TTS + ffplay 播报，不录音。供待确认入队时提示手术室。"""
+        if not (text or "").strip():
+            return
+        if not self._s.voice_tts_on_pending_enqueued:
+            return
+        if not self._s.voice_confirmation_enabled:
+            return
+        if not self._baidu.configured:
+            logger.debug("speak_prompt skipped: baidu_speech not configured")
+            return
+        async with self._lock:
+            mp3_path, err = await run_in_threadpool(self._synthesize_to_temp_mp3, text)
+            if err or not mp3_path:
+                logger.warning("TTS synthesis failed: {}", err)
+                return
+            try:
+                play_err = await run_in_threadpool(self._play_mp3_file, mp3_path)
+                if play_err:
+                    logger.warning("TTS play failed: {}", play_err)
+            finally:
+                try:
+                    os.unlink(mp3_path)
+                except OSError:
+                    pass
+
    async def run_confirmation(
        self,
        *,