refactor(api): TOML 配置 SSOT、统一错误契约、Auth/事务加固与可观测性 (#33)
配置 SSOT(TOML + .env);统一错误契约;Auth 与事务边界;Redis / Celery 可靠性:业务 Redis(DB/0)与 Celery broker/backend(DB/1)显式拆分,连接池、sync client;可观测性(OpenTelemetry + LGTM)
This commit is contained in:
@@ -31,12 +31,16 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.runtime_constants import otel_defaults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fastapi import FastAPI
|
||||
|
||||
_initialized = False
|
||||
_otel_logging_handler: LoggingHandler | None = None
|
||||
_tracer_provider: TracerProvider | None = None
|
||||
_meter_provider: MeterProvider | None = None
|
||||
_log_provider: LoggerProvider | None = None
|
||||
|
||||
|
||||
def _build_resource(service_name: str) -> Resource:
|
||||
@@ -56,8 +60,8 @@ def _build_sampler():
|
||||
TraceIdRatioBased,
|
||||
)
|
||||
|
||||
name = (settings.otel_traces_sampler or "always_on").strip().lower()
|
||||
arg = settings.otel_traces_sampler_arg
|
||||
name = (otel_defaults.traces_sampler(settings.app_environment) or "always_on").strip().lower()
|
||||
arg = otel_defaults.traces_sampler_arg(settings.app_environment)
|
||||
if name in ("always_on", "alwayson"):
|
||||
return ALWAYS_ON
|
||||
if name in ("always_off", "alwaysoff"):
|
||||
@@ -68,39 +72,58 @@ def _build_sampler():
|
||||
return ParentBasedTraceIdRatio(ratio)
|
||||
|
||||
|
||||
def _otlp_timeout_seconds() -> int | None:
    """Return the OTLP timeout, in seconds, for the current app environment.

    The value of ``settings.app_environment`` is trimmed and lower-cased
    before comparison (a missing/empty value is treated as ``""``).

    Returns:
        3 when the normalized environment is ``"development"``; 10 otherwise.
    """
    environment = settings.app_environment
    normalized = environment.strip().lower() if environment else ""
    return 3 if normalized == "development" else 10
|
||||
|
||||
|
||||
def setup_telemetry(*, service_name: str) -> None:
|
||||
"""配置 OTLP exporter 与自动 instrumentation(幂等)。"""
|
||||
global _initialized, _otel_logging_handler
|
||||
global _tracer_provider, _meter_provider, _log_provider
|
||||
if _initialized or not settings.otel_enabled:
|
||||
return
|
||||
|
||||
endpoint = settings.otel_exporter_otlp_endpoint.rstrip("/")
|
||||
insecure = settings.otel_exporter_otlp_insecure
|
||||
insecure = otel_defaults.exporter_insecure
|
||||
timeout = _otlp_timeout_seconds()
|
||||
|
||||
resource = _build_resource(service_name)
|
||||
|
||||
span_exporter = OTLPSpanExporter(endpoint=endpoint, insecure=insecure)
|
||||
tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler())
|
||||
tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
|
||||
trace.set_tracer_provider(tracer_provider)
|
||||
span_exporter = OTLPSpanExporter(
|
||||
endpoint=endpoint, insecure=insecure, timeout=timeout
|
||||
)
|
||||
_tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler())
|
||||
_tracer_provider.add_span_processor(
|
||||
BatchSpanProcessor(span_exporter, export_timeout_millis=(timeout or 10) * 1000)
|
||||
)
|
||||
trace.set_tracer_provider(_tracer_provider)
|
||||
|
||||
metric_exporter = OTLPMetricExporter(endpoint=endpoint, insecure=insecure)
|
||||
metric_exporter = OTLPMetricExporter(
|
||||
endpoint=endpoint, insecure=insecure, timeout=timeout
|
||||
)
|
||||
metric_reader = PeriodicExportingMetricReader(
|
||||
metric_exporter,
|
||||
export_interval_millis=settings.otel_metric_export_interval_ms,
|
||||
export_interval_millis=otel_defaults.metric_export_interval_ms,
|
||||
)
|
||||
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
_meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
|
||||
metrics.set_meter_provider(_meter_provider)
|
||||
|
||||
log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure)
|
||||
log_provider = LoggerProvider(resource=resource)
|
||||
log_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter))
|
||||
set_logger_provider(log_provider)
|
||||
log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure, timeout=timeout)
|
||||
_log_provider = LoggerProvider(resource=resource)
|
||||
_log_provider.add_log_record_processor(
|
||||
BatchLogRecordProcessor(
|
||||
log_exporter, export_timeout_millis=(timeout or 10) * 1000
|
||||
)
|
||||
)
|
||||
set_logger_provider(_log_provider)
|
||||
|
||||
LoggingInstrumentor().instrument(set_logging_format=True)
|
||||
_otel_logging_handler = LoggingHandler(
|
||||
level=logging.NOTSET,
|
||||
logger_provider=log_provider,
|
||||
logger_provider=_log_provider,
|
||||
)
|
||||
logging.getLogger().addHandler(_otel_logging_handler)
|
||||
|
||||
@@ -111,6 +134,56 @@ def setup_telemetry(*, service_name: str) -> None:
|
||||
_initialized = True
|
||||
|
||||
|
||||
def shutdown_telemetry() -> None:
    """Stop the OTLP export threads and remove instrumentation.

    Meant to be called before a test process exits, on hot reload, or ahead
    of Ctrl+C handling. Does nothing when telemetry is not initialized.
    Every uninstrument/shutdown step is best-effort: an ``Exception`` raised
    by one step is swallowed so the remaining steps still run.
    """
    global _initialized, _otel_logging_handler
    global _tracer_provider, _meter_provider, _log_provider
    if not _initialized:
        return

    # Raise the OpenTelemetry loggers to CRITICAL before tearing things down.
    otel_logger_names = (
        "opentelemetry",
        "opentelemetry.sdk",
        "opentelemetry.exporter",
        "opentelemetry.exporter.otlp",
    )
    for logger_name in otel_logger_names:
        logging.getLogger(logger_name).setLevel(logging.CRITICAL)

    # Detach the OTLP logging handler from the root logger, if one is attached.
    attached_handler = _otel_logging_handler
    if attached_handler is not None:
        logging.getLogger().removeHandler(attached_handler)
        _otel_logging_handler = None

    # FastAPI is handled on its own, with construction inside the try block.
    try:
        FastAPIInstrumentor().uninstrument()
    except Exception:
        pass

    remaining_instrumentors = (
        LoggingInstrumentor(),
        HTTPXClientInstrumentor(),
        RedisInstrumentor(),
        SQLAlchemyInstrumentor(),
        CeleryInstrumentor(),
    )
    for current in remaining_instrumentors:
        try:
            current.uninstrument()
        except Exception:
            pass

    # Providers are shut down in the order: logs, metrics, traces.
    for active_provider in (_log_provider, _meter_provider, _tracer_provider):
        if active_provider is not None:
            try:
                active_provider.shutdown()
            except Exception:
                pass

    # Clear module state so setup_telemetry() can run again afterwards.
    _tracer_provider = _meter_provider = _log_provider = None
    _initialized = False
|
||||
|
||||
|
||||
def instrument_fastapi_app(app: FastAPI) -> None:
|
||||
if not settings.otel_enabled:
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user