# operating-room-monitor-server/backend/app/algo_host/result_adapter.py

"""Map algorithm_subprocesses/5.15 TSV output to domain objects (orchestration adapter only)."""

from __future__ import annotations

import csv
import hashlib
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path

from app.baked import pipeline as bp
from app.consumable_catalog import (
    effective_candidate_consumables,
    normalize_candidate_consumables_raw,
)
from app.domain.consumption import SurgeryConsumptionStored


@dataclass(frozen=True)
class ReferenceDoctorInfo:
    """Doctor identification parsed from the "医生信息" line of a reference result file."""

    # Identifier parsed from the line, or bp.VIDEO_RESULT_DOCTOR_ID when none could be parsed.
    doctor_id: str
    # Doctor name when the line has the "<name> (id=...)" form; otherwise None.
    doctor_name: str | None
    # Human-readable text: "<name> (<id>)", the bare id, or the unparsed line body.
    display: str
    # The stripped "医生信息" line exactly as it was found in the file.
    raw_line: str


# Matches a doctor-info body of the form "<name> (id=<id>, conf=<digits/dots>)".
# The ", conf=..." part and a trailing "[低置信度]" (low-confidence) tag are optional.
_DOCTOR_NAME_ID_RE = re.compile(
    r"^(?P<name>.+?)\s*\(id=(?P<id>[^,\s)]+)(?:,\s*conf=[\d.]+)?\)\s*(?:\[低置信度\])?\s*$"
)
# Matches a doctor-info body that carries only an id: "doctor_id=<id> (conf=<digits/dots>)".
# The "(conf=...)" part and a trailing "[低置信度]" (low-confidence) tag are optional.
_DOCTOR_ID_ONLY_RE = re.compile(
    r"^doctor_id=(?P<id>[^\s(]+)(?:\s*\(conf=[\d.]+\))?\s*(?:\[低置信度\])?\s*$"
)


def sha256_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces so its full content is never held in
    memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def candidate_cache_key(candidate_consumables: list[str]) -> str:
    """Return a 12-character cache key derived from the candidate names.

    The names are joined with newlines, encoded as UTF-8 and hashed with
    SHA-256; the key is the first 12 hex digits of that digest. The result
    depends on the order of the names.
    """
    joined = "\n".join(candidate_consumables)
    full_digest = hashlib.sha256(joined.encode("utf-8")).hexdigest()
    return full_digest[:12]


def resolve_reference_candidates(candidate_consumables: list[str] | None) -> list[str]:
    """Turn the caller-supplied candidate list into the list actually used.

    ``None`` or an empty list is treated as "no candidates requested". A copy
    of the input is passed through ``normalize_candidate_consumables_raw`` and
    then ``effective_candidate_consumables``; the latter's result is returned
    unchanged.
    """
    if candidate_consumables:
        raw_names = list(candidate_consumables)
    else:
        raw_names = []
    normalized = normalize_candidate_consumables_raw(raw_names)
    return effective_candidate_consumables(normalized)


def parse_reference_doctor_info(path: Path) -> ReferenceDoctorInfo | None:
    """Parse the "医生信息" (doctor info) line of a reference result file.

    The first line that starts with "医生信息：" or "医生信息:" (after
    stripping whitespace) is used; everything after that prefix is the body.

    Args:
        path: Result file to read as UTF-8 text.

    Returns:
        ``None`` when *path* is not a file or contains no doctor-info line.
        Otherwise a ``ReferenceDoctorInfo``:

        * body empty or "未启用": fallback id, display "未启用" (or the body);
        * body starting with "识别失败": fallback id, display is the body;
        * body "<name> (id=<id>...)": parsed name and id;
        * body "doctor_id=<id>...": parsed id only;
        * anything else: fallback id, display is the body.

        The fallback id is ``bp.VIDEO_RESULT_DOCTOR_ID``.
    """
    if not path.is_file():
        return None

    prefixes = ("医生信息：", "医生信息:")
    raw_line = ""
    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if stripped.startswith(prefixes):
            raw_line = stripped
            break
    if not raw_line:
        return None

    # Both prefixes have the same length, so slice the prefix off instead of
    # splitting on colons: a body that itself contains ":" or "：" (for
    # example "识别失败: timeout") must be kept whole, not truncated.
    body = raw_line[len(prefixes[0]):].strip()

    def _unidentified(display: str) -> ReferenceDoctorInfo:
        # No usable doctor id: fall back to the pipeline's placeholder id.
        return ReferenceDoctorInfo(
            doctor_id=bp.VIDEO_RESULT_DOCTOR_ID,
            doctor_name=None,
            display=display,
            raw_line=raw_line,
        )

    if not body or body == "未启用":
        return _unidentified(body or "未启用")
    # Checked before the regexes so a failure message is never read as a name.
    if body.startswith("识别失败"):
        return _unidentified(body)

    match = _DOCTOR_NAME_ID_RE.match(body)
    if match:
        name = match.group("name").strip()
        did = match.group("id").strip()
        return ReferenceDoctorInfo(
            doctor_id=did,
            doctor_name=name,
            display=f"{name} ({did})",
            raw_line=raw_line,
        )

    match = _DOCTOR_ID_ONLY_RE.match(body)
    if match:
        did = match.group("id").strip()
        return ReferenceDoctorInfo(
            doctor_id=did,
            doctor_name=None,
            display=did,
            raw_line=raw_line,
        )

    return _unidentified(body)


def is_reference_result_complete(path: Path) -> bool:
    """Report whether a reference result file looks fully written.

    A file counts as complete when it exists, is non-empty, and its
    non-blank lines (stripped) contain all of:

    * a header line starting with "rank" + TAB (case-insensitive);
    * a doctor-info line starting with "医生信息：" or "医生信息:";
    * at least one data row with five or more tab-separated fields whose
      first field is all digits.
    """
    if not path.is_file():
        return False
    if path.stat().st_size <= 0:
        return False

    content = path.read_text(encoding="utf-8")
    rows = [text for text in map(str.strip, content.splitlines()) if text]

    def is_header(row: str) -> bool:
        return row.lower().startswith("rank\t")

    if not any(map(is_header, rows)):
        return False
    if not any(row.startswith(("医生信息：", "医生信息:")) for row in rows):
        return False

    for row in rows:
        # Header and doctor-info lines never count as segment rows.
        if is_header(row) or row.startswith("医生信息"):
            continue
        cells = row.split("\t")
        if len(cells) >= 5 and cells[0].strip().isdigit():
            return True
    return False


def doctor_id_for_consumption_rows(doctor: ReferenceDoctorInfo | None) -> str:
    """Choose the doctor identifier written onto each consumption row.

    Returns "<name> (<id>)" when a name is known, the bare id when only a
    non-placeholder id is known, and ``bp.VIDEO_RESULT_DOCTOR_ID`` otherwise
    (including when *doctor* is ``None``).
    """
    if doctor is not None:
        if doctor.doctor_name:
            return f"{doctor.doctor_name} ({doctor.doctor_id})"
        parsed_id = doctor.doctor_id
        if parsed_id and parsed_id != bp.VIDEO_RESULT_DOCTOR_ID:
            return parsed_id
    return bp.VIDEO_RESULT_DOCTOR_ID


def parse_reference_tsv(
    path: Path,
    *,
    base_timestamp: datetime | None = None,
    doctor: ReferenceDoctorInfo | None = None,
) -> list[SurgeryConsumptionStored]:
    """Convert the rows of a reference TSV file into consumption records.

    Args:
        path: Tab-separated result file with a header row. The columns read
            are ``top1_name``, ``product_id_top1`` and ``start_sec``.
        base_timestamp: Instant that ``start_sec`` offsets are added to.
            Defaults to the current UTC time.
        doctor: Doctor info to attribute rows to. When ``None`` it is parsed
            from the same file with ``parse_reference_doctor_info``.

    Returns:
        One ``SurgeryConsumptionStored`` (``qty=1``, ``source="video_batch"``)
        per row with a usable ``top1_name``. Rows are skipped when the name
        is empty or starts with "（" or "医生信息". ``item_id`` falls back to
        the name when ``product_id_top1`` is empty. A missing, unparsable,
        negative or out-of-range ``start_sec`` yields ``base_timestamp``.
    """
    if base_timestamp is None:
        base_timestamp = datetime.now(timezone.utc)
    if doctor is None:
        doctor = parse_reference_doctor_info(path)
    row_doctor_id = doctor_id_for_consumption_rows(doctor)

    out: list[SurgeryConsumptionStored] = []
    with path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            name = (row.get("top1_name") or "").strip()
            if not name or name.startswith(("（", "医生信息")):
                continue
            item_id = (row.get("product_id_top1") or "").strip() or name
            try:
                start_sec = float((row.get("start_sec") or "0").strip() or 0.0)
            except ValueError:
                start_sec = 0.0
            try:
                timestamp = base_timestamp + timedelta(seconds=max(0.0, start_sec))
            except OverflowError:
                # "inf" or an absurdly large offset cannot be represented;
                # treat it like any other malformed value instead of
                # aborting the whole parse.
                timestamp = base_timestamp
            out.append(
                SurgeryConsumptionStored(
                    item_id=item_id,
                    item_name=name,
                    qty=1,
                    doctor_id=row_doctor_id,
                    timestamp=timestamp,
                    source="video_batch",
                )
            )
    return out