feat: 语音确认、联调与运维增强

- 语音:序数解析(第一个/第二个等)、解析失败计数与 API detail.retry_remaining;
  百度 ASR 固定 dev_pid 为普通话;SurgeryPipelineError 支持 extra 并入 HTTP detail。
- Demo:demo 路由与假 RTSP、客户端 index 与 README;BackendResolver 与配置调整。
- 可观测:消耗 TSV 日志、语音文件日志、终端 Markdown 辅助;相关测试与依赖更新。
- 注意:.env 仍被 gitignore,本地密钥不会进入本提交。

Made-with: Cursor
This commit is contained in:
Kevin
2026-04-23 14:24:20 +08:00
parent 42720f81cf
commit 0c05463617
39 changed files with 3030 additions and 143 deletions

View File

@@ -21,7 +21,6 @@ class BackendResolver:
) -> None:
self._s = settings
self._hik = hikvision_runtime
self._rtsp_urls_map = settings.video_rtsp_url_map()
def _parse_json_object(self, raw: str) -> dict[str, Any]:
raw = (raw or "").strip()
@@ -55,8 +54,10 @@ class BackendResolver:
return VideoBackendKind.RTSP
def rtsp_url_for_camera(self, camera_id: str) -> str:
if camera_id in self._rtsp_urls_map:
return self._rtsp_urls_map[camera_id]
# Re-read on each use so VIDEO_RTSP_URLS_JSON_FILE can be hot-updated (e.g. dev orchestrator).
m = self._s.video_rtsp_url_map()
if camera_id in m:
return m[camera_id]
tpl = (self._s.video_rtsp_url_template or "").strip()
if tpl:
try:

View File

@@ -26,6 +26,8 @@ from app.services.video.backend_resolver import BackendResolver
from app.services.video.hikvision_runtime import HikvisionInitRefCount, HikvisionRuntime
from app.services.video.rtsp_capture import RtspCapture
from app.services.video.types import VideoBackendKind
from app.services.consumption_tsv_log import append_consumption_window, init_consumption_log_file
from app.services.voice_file_log import init_voice_log_file
from app.services.voice_confirm import build_prompt_text
from app.surgery_errors import SurgeryPipelineError
@@ -41,6 +43,8 @@ class PendingConsumableConfirmation:
created_at: datetime
model_top1_label: str
model_top1_confidence: float
#: 本轮待确认在解析失败时累计次数(首败 + 重试),供 API 计算 retry_remaining。
voice_parse_failures: int = 0
@dataclass
@@ -49,6 +53,8 @@ class CameraStreamInferState:
votes: list[tuple[float, str, ClsTop3]] = field(default_factory=list)
stream_t0: float | None = None
#: 与 `stream_t0` 同一次初始化时的 `time.time()`,与 monotonic 流逝秒相加得到墙钟时间戳
stream_wall_start: float | None = None
next_bucket: int = 0
@@ -258,6 +264,8 @@ class CameraSessionManager:
)
run = RunningSurgery(stop_event=stop_event, state=state, tasks=tasks)
init_consumption_log_file(surgery_id)
init_voice_log_file(surgery_id, self._s)
async with self._manager_lock:
self._active[surgery_id] = run
@@ -408,6 +416,22 @@ class CameraSessionManager:
return []
return list(self._active[surgery_id].state.candidate_consumables)
async def record_voice_parse_failure(
self, surgery_id: str, confirmation_id: str
) -> tuple[int, int]:
"""Count one voice-parse failure for a pending confirmation.

Returns a ``(failures, remaining)`` tuple: the failure count after this
increment, and how many "retry chances" are left before the configured
limit (``voice_confirm_max_failed_parse_rounds``) is reached.

Returns ``(0, 0)`` without counting anything when ``surgery_id`` is not
in the active map, or when ``confirmation_id`` is unknown or its status
is no longer ``"pending"``.
"""
# Surgery is not in the active map: nothing to count, report zeros.
if surgery_id not in self._active:
return 0, 0
st = self._active[surgery_id].state
# Configured cap on failed parse rounds, coerced to int.
max_r = int(self._s.voice_confirm_max_failed_parse_rounds)
# NOTE(review): indentation is not visible in this diff view; the
# statements below are presumably all inside this lock -- confirm
# against the real file.
async with st.lock:
p = st.pending_by_id.get(confirmation_id)
# Only confirmations that exist and are still "pending" are counted.
if p is None or p.status != "pending":
return 0, 0
p.voice_parse_failures += 1
# Clamp at zero so the remaining count never goes negative.
remaining = max(0, max_r - p.voice_parse_failures)
return p.voice_parse_failures, remaining
def next_pending_confirmation(
self, surgery_id: str
) -> PendingConsumableConfirmation | None:
@@ -622,6 +646,19 @@ class CameraSessionManager:
if snap is None:
continue
if self._s.video_log_inference_results:
logger.info(
"Vision result surgery={} camera={} top1={}({:.3f}) top2={}({:.3f}) top3={}({:.3f})",
surgery_id,
camera_id,
snap.t1_name,
snap.t1_conf,
snap.t2_name,
snap.t2_conf,
snap.t3_name,
snap.t3_conf,
)
wsec = self._s.consumable_vision_window_sec
pending_preds: list[PredictionResult] = []
async with state.lock:
@@ -630,6 +667,7 @@ class CameraSessionManager:
)
if cis.stream_t0 is None:
cis.stream_t0 = time.monotonic()
cis.stream_wall_start = time.time()
t_rel = time.monotonic() - cis.stream_t0
cis.votes.append((t_rel, snap.t1_name, snap))
current_b = int(t_rel // wsec)
@@ -648,7 +686,19 @@ class CameraSessionManager:
if not bucket_pts:
continue
best = window_bucket_to_best_snap(bucket_pts)
if best is not None:
if best is not None and cis.stream_wall_start is not None:
if self._s.consumption_tsv_log_enabled or self._s.consumption_log_markdown_terminal:
wall_lo = cis.stream_wall_start + lo
wall_hi = cis.stream_wall_start + hi
append_consumption_window(
surgery_id=surgery_id,
name_to_code=state.name_to_code,
best=best,
doctor_id=self._s.video_result_doctor_id,
camera_id=camera_id,
wall_start_epoch=wall_lo,
wall_end_epoch=wall_hi,
)
pending_preds.append(
cls_top3_to_prediction_result(best)
)