refactor(api): TOML 配置 SSOT、统一错误契约、Auth/事务加固与可观测性 (#33)

配置 SSOT(TOML + .env)
统一错误契约
Auth 与事务边界
Redis / Celery 可靠性:业务 Redis(DB/0)与 Celery broker/backend(DB/1)显式拆分;连接池、sync client
可观测性(OpenTelemetry + LGTM)
This commit is contained in:
Sully
2026-05-22 13:44:50 +08:00
committed by GitHub
parent f09ae248f9
commit 53e0065e3e
298 changed files with 15247 additions and 4344 deletions

View File

@@ -31,12 +31,16 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
from app.core.config import settings
from app.core.runtime_constants import otel_defaults
if TYPE_CHECKING:
from fastapi import FastAPI
_initialized = False
_otel_logging_handler: LoggingHandler | None = None
_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None
_log_provider: LoggerProvider | None = None
def _build_resource(service_name: str) -> Resource:
@@ -56,8 +60,8 @@ def _build_sampler():
TraceIdRatioBased,
)
name = (settings.otel_traces_sampler or "always_on").strip().lower()
arg = settings.otel_traces_sampler_arg
name = (otel_defaults.traces_sampler(settings.app_environment) or "always_on").strip().lower()
arg = otel_defaults.traces_sampler_arg(settings.app_environment)
if name in ("always_on", "alwayson"):
return ALWAYS_ON
if name in ("always_off", "alwaysoff"):
@@ -68,39 +72,58 @@ def _build_sampler():
return ParentBasedTraceIdRatio(ratio)
def _otlp_timeout_seconds() -> int | None:
    """Return the OTLP timeout, in seconds, for the current environment.

    ``settings.app_environment`` is compared after stripping whitespace and
    lower-casing; a missing/empty value is treated as "not development".

    Returns:
        3 when the environment is ``"development"``, otherwise 10.
        The annotation allows ``None``, but no path here returns it.
    """
    environment = settings.app_environment or ""
    is_development = environment.strip().lower() == "development"
    return 3 if is_development else 10
def setup_telemetry(*, service_name: str) -> None:
"""配置 OTLP exporter 与自动 instrumentation幂等"""
global _initialized, _otel_logging_handler
global _tracer_provider, _meter_provider, _log_provider
if _initialized or not settings.otel_enabled:
return
endpoint = settings.otel_exporter_otlp_endpoint.rstrip("/")
insecure = settings.otel_exporter_otlp_insecure
insecure = otel_defaults.exporter_insecure
timeout = _otlp_timeout_seconds()
resource = _build_resource(service_name)
span_exporter = OTLPSpanExporter(endpoint=endpoint, insecure=insecure)
tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler())
tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
trace.set_tracer_provider(tracer_provider)
span_exporter = OTLPSpanExporter(
endpoint=endpoint, insecure=insecure, timeout=timeout
)
_tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler())
_tracer_provider.add_span_processor(
BatchSpanProcessor(span_exporter, export_timeout_millis=(timeout or 10) * 1000)
)
trace.set_tracer_provider(_tracer_provider)
metric_exporter = OTLPMetricExporter(endpoint=endpoint, insecure=insecure)
metric_exporter = OTLPMetricExporter(
endpoint=endpoint, insecure=insecure, timeout=timeout
)
metric_reader = PeriodicExportingMetricReader(
metric_exporter,
export_interval_millis=settings.otel_metric_export_interval_ms,
export_interval_millis=otel_defaults.metric_export_interval_ms,
)
meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
_meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(_meter_provider)
log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure)
log_provider = LoggerProvider(resource=resource)
log_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter))
set_logger_provider(log_provider)
log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure, timeout=timeout)
_log_provider = LoggerProvider(resource=resource)
_log_provider.add_log_record_processor(
BatchLogRecordProcessor(
log_exporter, export_timeout_millis=(timeout or 10) * 1000
)
)
set_logger_provider(_log_provider)
LoggingInstrumentor().instrument(set_logging_format=True)
_otel_logging_handler = LoggingHandler(
level=logging.NOTSET,
logger_provider=log_provider,
logger_provider=_log_provider,
)
logging.getLogger().addHandler(_otel_logging_handler)
@@ -111,6 +134,56 @@ def setup_telemetry(*, service_name: str) -> None:
_initialized = True
def shutdown_telemetry() -> None:
    """Stop the OTLP providers and remove the automatic instrumentation.

    Meant to be called before a test process exits, on hot reload, or on
    Ctrl+C. It is a no-op unless telemetry is currently marked as
    initialized; on completion the module-level providers are cleared and
    the initialized flag is reset.

    Every uninstrument/shutdown step is best-effort: an exception raised by
    one step is swallowed so the remaining steps still run.
    """
    global _initialized, _otel_logging_handler
    global _tracer_provider, _meter_provider, _log_provider

    if not _initialized:
        return

    # Raise the OpenTelemetry loggers to CRITICAL before tearing anything
    # down. The levels are not restored by this function.
    # NOTE(review): presumably this hides export errors logged while the
    # exporters flush during shutdown - confirm.
    otel_logger_names = (
        "opentelemetry",
        "opentelemetry.sdk",
        "opentelemetry.exporter",
        "opentelemetry.exporter.otlp",
    )
    for logger_name in otel_logger_names:
        logging.getLogger(logger_name).setLevel(logging.CRITICAL)

    # Detach the OTLP logging handler from the root logger.
    if _otel_logging_handler is not None:
        logging.getLogger().removeHandler(_otel_logging_handler)
        _otel_logging_handler = None

    # FastAPI is handled on its own so that even constructing the
    # instrumentor happens inside the guarded region.
    try:
        FastAPIInstrumentor().uninstrument()
    except Exception:
        pass

    remaining_instrumentors = (
        LoggingInstrumentor(),
        HTTPXClientInstrumentor(),
        RedisInstrumentor(),
        SQLAlchemyInstrumentor(),
        CeleryInstrumentor(),
    )
    for active in remaining_instrumentors:
        try:
            active.uninstrument()
        except Exception:
            pass

    # Shut providers down in the order logs -> metrics -> traces, skipping
    # any that were never created.
    live_providers = [
        candidate
        for candidate in (_log_provider, _meter_provider, _tracer_provider)
        if candidate is not None
    ]
    for live in live_providers:
        try:
            live.shutdown()
        except Exception:
            pass

    # Reset module state so a later setup call can initialize again.
    _tracer_provider = _meter_provider = _log_provider = None
    _initialized = False
def instrument_fastapi_app(app: FastAPI) -> None:
if not settings.otel_enabled:
return