# life-echo/api/app/core/business_telemetry.py

"""
OpenTelemetry spans for business flows (memoir stages, WebSocket, external dependencies, etc.).
"""

from __future__ import annotations

import time
from contextlib import contextmanager
from typing import Any, Iterator

from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode

from app.core.config import settings
from app.core.telemetry import get_meter, get_tracer

_meter = None
_duration_hist = None

# 仅低基数字段进入 span attribute（禁止 user_id / conversation_id 等）
_ALLOWED_SPAN_ATTRS = frozenset(
    {"provider", "chapter_category", "segment_count", "batch_size", "hours"}
)


def _ensure_instruments() -> None:
    """Create the shared meter and duration histogram on first use.

    Nothing happens when the meter has already been created or when
    ``settings.otel_enabled`` is falsy, so the function can be called
    before every measurement. On the creating call the module globals
    ``_meter`` and ``_duration_hist`` are populated.
    """
    global _meter, _duration_hist
    # Same short-circuit as "already created or disabled -> return":
    # settings is only consulted while the meter is still missing.
    if _meter is None and settings.otel_enabled:
        # The meter is stored before the histogram is requested from it.
        _meter = get_meter("app.business")
        _duration_hist = _meter.create_histogram(
            "business.operation.duration",
            unit="ms",
            description="Business operation wall time",
        )


def _normalize_attr_value(value: Any) -> str | int | float | bool:
    if isinstance(value, (str, int, float, bool)):
        return value
    return str(value)


@contextmanager
def business_span(
    name: str,
    /,
    **attributes: Any,
) -> Iterator[trace.Span]:
    """Run the ``with`` body inside a business-level span and time it.

    When ``settings.otel_enabled`` is falsy, ``trace.INVALID_SPAN`` is
    yielded and nothing is recorded. Otherwise a span called *name* is
    started as the current span, the elapsed wall time is stored on it as
    ``business.duration_ms`` and, if the histogram is available, recorded
    with ``operation`` and ``outcome`` attributes.

    Args:
        name: Span name (positional-only); also used as the ``operation``
            attribute of the histogram measurement.
        **attributes: Candidate span attributes. Only keys listed in
            ``_ALLOWED_SPAN_ATTRS`` whose value is neither ``None`` nor
            ``""`` are kept; each kept key is prefixed with ``business.``.

    Yields:
        The span returned by the tracer, or ``trace.INVALID_SPAN`` when
        telemetry is disabled.

    Raises:
        Whatever the ``with`` body raises; exceptions are never swallowed.

    The ``outcome`` attribute is one of:
        * ``"ok"`` - the body finished normally (span status OK),
        * ``"error"`` - the body raised an ``Exception`` (span status ERROR),
        * ``"aborted"`` - the body was left through a ``BaseException`` that
          is not an ``Exception`` (for example ``asyncio.CancelledError``,
          ``KeyboardInterrupt`` or ``GeneratorExit``); the span status is
          left unset in that case.
    """
    if not settings.otel_enabled:
        yield trace.INVALID_SPAN
        return

    tracer = get_tracer("app.business")
    otel_attrs = {
        f"business.{k}": _normalize_attr_value(v)
        for k, v in attributes.items()
        if k in _ALLOWED_SPAN_ATTRS and v is not None and v != ""
    }
    t0 = time.perf_counter()
    # Start pessimistic: unless the body demonstrably completed ("ok") or
    # failed with an Exception ("error"), it was interrupted. Previously a
    # cancelled task fell through to ``finally`` still labelled "ok" and was
    # explicitly marked StatusCode.OK.
    outcome = "aborted"
    with tracer.start_as_current_span(name, attributes=otel_attrs) as span:
        try:
            yield span
        except Exception:
            outcome = "error"
            if span.is_recording():
                span.set_status(Status(StatusCode.ERROR))
            raise
        else:
            # Only a body that returned normally earns an explicit OK status.
            outcome = "ok"
            if span.is_recording():
                span.set_status(Status(StatusCode.OK))
        finally:
            duration_ms = (time.perf_counter() - t0) * 1000
            if span.is_recording():
                span.set_attribute("business.duration_ms", round(duration_ms, 2))
            _ensure_instruments()
            if _duration_hist is not None:
                _duration_hist.record(
                    duration_ms,
                    {"operation": name, "outcome": outcome},
                )