179 lines
5.6 KiB
Python
179 lines
5.6 KiB
Python
"""Map algorithm_subprocesses/5.15 TSV output to domain objects (orchestration adapter only)."""
|
||
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import hashlib
|
||
import re
|
||
from dataclasses import dataclass
|
||
from datetime import datetime, timedelta, timezone
|
||
from pathlib import Path
|
||
|
||
from app.baked import pipeline as bp
|
||
from app.consumable_catalog import (
|
||
effective_candidate_consumables,
|
||
normalize_candidate_consumables_raw,
|
||
)
|
||
from app.domain.consumption import SurgeryConsumptionStored
|
||
|
||
|
||
@dataclass(frozen=True)
class ReferenceDoctorInfo:
    """Doctor attribution read from the "医生信息" footer line of a reference result file.

    Instances are immutable (frozen dataclass) and are produced by
    ``parse_reference_doctor_info``.
    """

    # Identifier parsed from the footer; the parser substitutes
    # bp.VIDEO_RESULT_DOCTOR_ID when the footer carries no usable id.
    doctor_id: str
    # Doctor name when the footer has the "NAME (id=...)" form, otherwise None.
    doctor_name: str | None
    # Human-readable text for the doctor: "NAME (ID)", the bare id, or the
    # footer body itself when it could not be parsed into an id.
    display: str
    # The stripped footer line exactly as it was read from the file.
    raw_line: str
|
||
|
||
|
||
# Footer body that carries both a name and an id, e.g.
#   "NAME (id=ID, conf=0.93) [低置信度]"
# The ", conf=<number>" part and the trailing "[低置信度]" (low-confidence)
# tag are both optional. Named groups: "name" and "id".
_DOCTOR_NAME_ID_RE = re.compile(
    r"^(?P<name>.+?)\s*\(id=(?P<id>[^,\s)]+)(?:,\s*conf=[\d.]+)?\)\s*(?:\[低置信度\])?\s*$"
)
# Footer body that carries only an id, e.g.
#   "doctor_id=ID (conf=0.93) [低置信度]"
# The "(conf=<number>)" part and the "[低置信度]" tag are optional.
# Named group: "id".
_DOCTOR_ID_ONLY_RE = re.compile(
    r"^doctor_id=(?P<id>[^\s(]+)(?:\s*\(conf=[\d.]+\))?\s*(?:\[低置信度\])?\s*$"
)
|
||
|
||
|
||
def sha256_file(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is read in binary mode in 1 MiB pieces, so the whole content is
    never held in memory at once.
    """
    piece_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
|
||
|
||
|
||
def candidate_cache_key(candidate_consumables: list[str]) -> str:
    """Return a 12-character cache key for a list of candidate consumables.

    The names are joined with newlines, encoded as UTF-8 and hashed with
    SHA-256; the key is the first 12 hex characters of that digest. The key
    depends on the order of the list.
    """
    joined = "\n".join(candidate_consumables)
    full_digest = hashlib.sha256(joined.encode("utf-8")).hexdigest()
    return full_digest[:12]
|
||
|
||
|
||
def resolve_reference_candidates(candidate_consumables: list[str] | None) -> list[str]:
    """Turn a caller-supplied candidate list into the effective candidate list.

    ``None`` (or any empty value) is treated as an empty list. A copy of the
    input is passed through ``normalize_candidate_consumables_raw`` and the
    result is handed to ``effective_candidate_consumables``.
    """
    raw_items = list(candidate_consumables) if candidate_consumables else []
    normalized = normalize_candidate_consumables_raw(raw_items)
    return effective_candidate_consumables(normalized)
|
||
|
||
|
||
def parse_reference_doctor_info(path: Path) -> ReferenceDoctorInfo | None:
    """Parse the doctor footer ("医生信息" line) of a reference result file.

    The first line that starts with the label "医生信息" followed by a colon
    (ASCII ":" or full-width U+FF1A) is taken as the footer. Everything after
    that single separator is the footer body.

    Args:
        path: Result file to scan; read as UTF-8.

    Returns:
        ``None`` when *path* is not a file or has no footer line. Otherwise a
        ``ReferenceDoctorInfo``:

        * body empty, "未启用", or starting with "识别失败": the id is
          ``bp.VIDEO_RESULT_DOCTOR_ID`` and ``display`` is the body
          ("未启用" when the body is empty);
        * body of the form "NAME (id=ID ...)": name and id are filled and
          ``display`` is "NAME (ID)";
        * body of the form "doctor_id=ID ...": only the id is filled;
        * anything else: the id is ``bp.VIDEO_RESULT_DOCTOR_ID`` and
          ``display`` is the raw body.
    """
    if not path.is_file():
        return None

    label = "医生信息"
    # Accept both the ASCII colon and the full-width colon (U+FF1A, written
    # as an escape so the two variants cannot be confused in the source).
    prefixes = (label + ":", label + "\uff1a")

    raw_line = ""
    for line in path.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        if stripped.startswith(prefixes):
            raw_line = stripped
            break
    if not raw_line:
        return None

    # Remove only the label and its one separator character. Splitting on
    # every colon would discard the start of a body that itself contains a
    # colon (for example a failure message with a reason after it).
    body = raw_line[len(label) + 1:].strip()

    # These states must be checked before the regexes: a failure message that
    # happens to contain "(id=...)" must not be read as a doctor name.
    if not body or body == "未启用" or body.startswith("识别失败"):
        return ReferenceDoctorInfo(
            doctor_id=bp.VIDEO_RESULT_DOCTOR_ID,
            doctor_name=None,
            display=body or "未启用",
            raw_line=raw_line,
        )

    match = _DOCTOR_NAME_ID_RE.match(body)
    if match:
        name = match.group("name").strip()
        did = match.group("id").strip()
        return ReferenceDoctorInfo(
            doctor_id=did,
            doctor_name=name,
            display=f"{name} ({did})",
            raw_line=raw_line,
        )

    match = _DOCTOR_ID_ONLY_RE.match(body)
    if match:
        did = match.group("id").strip()
        return ReferenceDoctorInfo(
            doctor_id=did,
            doctor_name=None,
            display=did,
            raw_line=raw_line,
        )

    # Unrecognised footer body: keep the text for display, use the default id.
    return ReferenceDoctorInfo(
        doctor_id=bp.VIDEO_RESULT_DOCTOR_ID,
        doctor_name=None,
        display=body,
        raw_line=raw_line,
    )
|
||
|
||
|
||
def is_reference_result_complete(path: Path) -> bool:
    """Report whether *path* looks like a fully written reference result.

    The file counts as complete when all of the following hold:

    * it exists, is non-empty and decodes as UTF-8;
    * some line starts with "rank<TAB>" (case-insensitive) - the header;
    * some line starts with "医生信息" followed by a colon (ASCII ":" or
      full-width U+FF1A) - the doctor footer;
    * some other line has at least five tab-separated fields, the first of
      which is all digits - a segment row.

    Blank lines are ignored and every line is stripped before it is checked.

    Args:
        path: Result file to inspect.

    Returns:
        True when the file is complete, False otherwise. A file that cannot
        be decoded (for example one cut off in the middle of a multi-byte
        character while still being written) is reported as incomplete.
    """
    if not path.is_file() or path.stat().st_size <= 0:
        return False
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # A partially written file can end mid-character; that is exactly the
        # "not complete yet" case, not an error for the caller.
        return False

    # Full-width colon written as an escape so both variants stay visible.
    footer_prefixes = ("医生信息:", "医生信息\uff1a")

    has_header = False
    has_doctor_footer = False
    has_segment_row = False
    for raw in text.splitlines():
        line = raw.strip()
        if not line:
            continue
        if line.lower().startswith("rank\t"):
            has_header = True
        elif line.startswith("医生信息"):
            # Any label line is excluded from the segment-row check, but only
            # one followed by a colon counts as the footer.
            has_doctor_footer = has_doctor_footer or line.startswith(footer_prefixes)
        elif not has_segment_row:
            parts = line.split("\t")
            has_segment_row = len(parts) >= 5 and parts[0].strip().isdigit()
    return has_header and has_doctor_footer and has_segment_row
|
||
|
||
|
||
def doctor_id_for_consumption_rows(doctor: ReferenceDoctorInfo | None) -> str:
    """Choose the doctor id string to put on consumption rows.

    Returns "NAME (ID)" when the doctor has a name, the bare id when there is
    a non-empty id other than ``bp.VIDEO_RESULT_DOCTOR_ID``, and
    ``bp.VIDEO_RESULT_DOCTOR_ID`` in every other case (including ``None``).
    """
    if doctor is not None:
        if doctor.doctor_name:
            return f"{doctor.doctor_name} ({doctor.doctor_id})"
        identifier = doctor.doctor_id
        if identifier and identifier != bp.VIDEO_RESULT_DOCTOR_ID:
            return identifier
    return bp.VIDEO_RESULT_DOCTOR_ID
|
||
|
||
|
||
def parse_reference_tsv(
    path: Path,
    *,
    base_timestamp: datetime | None = None,
    doctor: ReferenceDoctorInfo | None = None,
) -> list[SurgeryConsumptionStored]:
    """Convert the rows of a reference TSV file into consumption records.

    The file is read as tab-separated UTF-8 with a header row. One record is
    produced per data row that has a usable ``top1_name``; each record has
    ``qty=1`` and ``source="video_batch"``.

    Args:
        path: TSV file to read.
        base_timestamp: Time that corresponds to ``start_sec == 0``. Defaults
            to the current UTC time.
        doctor: Doctor attribution for every row. When omitted it is parsed
            from *path* with ``parse_reference_doctor_info``.

    Returns:
        The records in file order. ``item_id`` is ``product_id_top1`` or, when
        that is blank, the item name. ``timestamp`` is *base_timestamp* plus
        ``start_sec`` seconds (negative values are clamped to 0). A
        ``start_sec`` that is missing, not a number, or too large to represent
        falls back to *base_timestamp* itself.
    """
    if base_timestamp is None:
        base_timestamp = datetime.now(timezone.utc)
    if doctor is None:
        doctor = parse_reference_doctor_info(path)
    row_doctor_id = doctor_id_for_consumption_rows(doctor)

    out: list[SurgeryConsumptionStored] = []
    with path.open("r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            name = (row.get("top1_name") or "").strip()
            # Skip rows without a name, names beginning with "(" (presumably
            # placeholder entries - confirm against the TSV producer), and
            # the doctor footer line.
            if not name or name.startswith(("(", "医生信息")):
                continue
            item_id = (row.get("product_id_top1") or "").strip() or name
            raw_start = (row.get("start_sec") or "").strip()
            try:
                offset = timedelta(seconds=max(0.0, float(raw_start or 0.0)))
                timestamp = base_timestamp + offset
            except (ValueError, OverflowError):
                # ValueError: not a number. OverflowError: "inf" or a value
                # outside the timedelta/datetime range. Either way one bad
                # cell must not abort the whole file.
                timestamp = base_timestamp
            out.append(
                SurgeryConsumptionStored(
                    item_id=item_id,
                    item_name=name,
                    qty=1,
                    doctor_id=row_doctor_id,
                    timestamp=timestamp,
                    source="video_batch",
                )
            )
    return out
|