diff --git a/.cursor/settings.json b/.cursor/settings.json index 9c2ee48..331de8f 100644 --- a/.cursor/settings.json +++ b/.cursor/settings.json @@ -5,6 +5,9 @@ }, "postman": { "enabled": true + }, + "grafana-assistant": { + "enabled": true } } } diff --git a/api/.env.example b/api/.env.example index 35cd342..ed543ba 100644 --- a/api/.env.example +++ b/api/.env.example @@ -16,6 +16,37 @@ # LIFE_ECHO_API_HOST_PORT=8000 # 若 Caddy 跑在独立容器且非 host 网络,不要用 127.0.0.1,应把 Caddy 加入与本 compose 相同的 Docker 网络,并对 http://life-echo-api-prod:8000 做 reverse_proxy。 +# ============================================================================= +# OpenTelemetry(见 docs/observability.md;Settings 只读 .env,勿 shell export) +# ============================================================================= +# docker-compose.observability.yml 宿主机端口(高位口,避免 3000/9090/4317 冲突) +# GRAFANA_HOST_PORT=48300 +# PROMETHEUS_HOST_PORT=49090 +# OTEL_GRPC_HOST_PORT=48317 +# OTEL_HTTP_HOST_PORT=48318 +# OTEL_COLLECTOR_HEALTH_HOST_PORT=48333 +# TEMPO_HTTP_HOST_PORT=43200 +# LOKI_HTTP_HOST_PORT=43100 +# +# --- development(.env.development):本机 uvicorn/celery --- +# OTEL_ENABLED=true +# OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317 +# OTEL_TRACES_SAMPLER=always_on +# +# --- staging / production(.env.staging / .env.production):容器内 compose --- +# OTEL_ENABLED=false +# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +# OTEL_TRACES_SAMPLER=parentbased_traceidratio +# OTEL_TRACES_SAMPLER_ARG=0.1 +# +OTEL_ENABLED=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317 +OTEL_EXPORTER_OTLP_INSECURE=true +OTEL_SERVICE_NAME=life-echo-api +OTEL_TRACES_SAMPLER=always_on +# OTEL_TRACES_SAMPLER_ARG=0.1 +# OTEL_METRIC_EXPORT_INTERVAL_MS=10000 + # ============================================================================= # Logging(loguru sink 最低级别:TRACE / DEBUG / INFO / WARNING / ERROR / CRITICAL) # ============================================================================= @@ -140,7 +171,7 @@ 
REDIS_SESSION_TTL=86400 # CELERY_MEMORY_ENRICHMENT_QUEUE=memory_idle # ============================================================================= -# Internal evaluation API(internal_main / internal-eval.sh;与主 API 进程隔离) +# Internal evaluation API(internal_main;development.sh 默认一并启动;与主 API 进程隔离) # ============================================================================= # 本地:`openssl rand -hex 32`;不用 internal eval 时可留空 INTERNAL_EVAL_API_KEY= diff --git a/api/.env.production b/api/.env.production index c059d7b..6650d28 100644 --- a/api/.env.production +++ b/api/.env.production @@ -33,6 +33,18 @@ LOG_LEVEL=INFO # CELERY_LOG_LEVEL= # HTTPX_LOG_LEVEL= +# ============================================================================= +# OpenTelemetry(生产;第二阶段 compose profile 接入后设 OTEL_ENABLED=true,见 docs/observability.md) +# 容器内 API/Celery → http://otel-collector:4317;勿用 localhost +# ============================================================================= +OTEL_ENABLED=false +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +OTEL_EXPORTER_OTLP_INSECURE=true +OTEL_SERVICE_NAME=life-echo-api +OTEL_TRACES_SAMPLER=parentbased_traceidratio +OTEL_TRACES_SAMPLER_ARG=0.1 +# OTEL_METRIC_EXPORT_INTERVAL_MS=10000 + # ============================================================================= # LLM / DeepSeek # ============================================================================= diff --git a/api/.env.staging b/api/.env.staging index 7a6dcc6..fa1ed8b 100644 --- a/api/.env.staging +++ b/api/.env.staging @@ -32,6 +32,18 @@ LOG_LEVEL=INFO # CELERY_LOG_LEVEL= # HTTPX_LOG_LEVEL= +# ============================================================================= +# OpenTelemetry(预发;compose 接入 LGTM 后设 OTEL_ENABLED=true,见 docs/observability.md) +# API/Celery 容器内 endpoint 用服务名;Grafana 宿主机端口见 observability compose(默认 48300 等) +# ============================================================================= +OTEL_ENABLED=false 
+OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 +OTEL_EXPORTER_OTLP_INSECURE=true +OTEL_SERVICE_NAME=life-echo-api +OTEL_TRACES_SAMPLER=parentbased_traceidratio +OTEL_TRACES_SAMPLER_ARG=0.1 +# OTEL_METRIC_EXPORT_INTERVAL_MS=10000 + # ============================================================================= # LLM / DeepSeek # ============================================================================= diff --git a/api/README.md b/api/README.md index fa26cd1..715351f 100644 --- a/api/README.md +++ b/api/README.md @@ -32,6 +32,16 @@ Life Echo API 是一个智能对话系统,通过 WebSocket 实时连接,使 - **ASR/TTS**: OpenAI Whisper API - **认证**: JWT (python-jose) + bcrypt - **其他**: Pydantic, python-dotenv +- **可观测性**: OpenTelemetry → Grafana LGTM(Tempo / Prometheus / Loki),见 [`docs/observability.md`](docs/observability.md) + +## 可观测性(本地) + +```bash +docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d +# Grafana: http://127.0.0.1:48300 +``` + +在 `.env` 中配置 `OTEL_*`(见 [`.env.example`](.env.example)),与 Postgres/Redis 一样由 Settings 加载,无需 shell export。详见 [`docs/observability.md`](docs/observability.md)。 ## 项目结构 diff --git a/api/app/adapters/asr/tencent_asr.py b/api/app/adapters/asr/tencent_asr.py index 0620910..d80fccf 100644 --- a/api/app/adapters/asr/tencent_asr.py +++ b/api/app/adapters/asr/tencent_asr.py @@ -3,6 +3,7 @@ import asyncio import base64 +from app.core.business_telemetry import business_span from app.core.logging import get_logger from app.ports.asr import ASRTranscriptionError @@ -39,6 +40,10 @@ class TencentASRProvider: return bool(self._secret_id and self._secret_key and self._get_client()) async def transcribe(self, audio: bytes, format: str = "m4a") -> str: + with business_span("asr.transcribe", provider="tencent"): + return await self._transcribe_inner(audio, format) + + async def _transcribe_inner(self, audio: bytes, format: str) -> str: client = self._get_client() if not client: raise ASRTranscriptionError( diff --git 
a/api/app/adapters/asr/whisper_local.py b/api/app/adapters/asr/whisper_local.py index a64a0a5..dfe6bd9 100644 --- a/api/app/adapters/asr/whisper_local.py +++ b/api/app/adapters/asr/whisper_local.py @@ -8,6 +8,7 @@ import re import tempfile from typing import Any, Iterable +from app.core.business_telemetry import business_span from app.core.logging import get_logger from app.ports.asr import ASRTranscriptionError @@ -102,6 +103,10 @@ class WhisperASRProvider: return self._load_model() async def transcribe(self, audio: bytes, format: str = "m4a") -> str: + with business_span("asr.transcribe", provider="whisper"): + return await self._transcribe_inner(audio, format) + + async def _transcribe_inner(self, audio: bytes, format: str) -> str: # 与 v1.1.0 相同的单次 transcribe;推理放线程池,避免阻塞 asyncio(tag 上为同步调用)。 self._load_model() if not self._model: diff --git a/api/app/adapters/embedding/zhipu.py b/api/app/adapters/embedding/zhipu.py index cb018ad..249d415 100644 --- a/api/app/adapters/embedding/zhipu.py +++ b/api/app/adapters/embedding/zhipu.py @@ -6,6 +6,7 @@ import asyncio from zai import ZhipuAiClient +from app.core.business_telemetry import business_span from app.core.embedding import MEMORY_EMBEDDING_DIMENSION from app.core.logging import get_logger @@ -57,12 +58,13 @@ class ZhipuEmbeddingProvider: async def embed_texts(self, texts: list[str]) -> list[list[float]]: if not self._client or not texts: return [] - out: list[list[float]] = [] - for i in range(0, len(texts), _EMBED_BATCH_SIZE): - batch = texts[i : i + _EMBED_BATCH_SIZE] - part = await asyncio.to_thread(self._create_vectors_sync, batch) - out.extend(part) - return out + with business_span("embedding.zhipu.embed", batch_size=len(texts)): + out: list[list[float]] = [] + for i in range(0, len(texts), _EMBED_BATCH_SIZE): + batch = texts[i : i + _EMBED_BATCH_SIZE] + part = await asyncio.to_thread(self._create_vectors_sync, batch) + out.extend(part) + return out def embed_text_sync(self, text: str) -> list[float]: vecs = 
self.embed_texts_sync([text]) @@ -71,8 +73,9 @@ class ZhipuEmbeddingProvider: def embed_texts_sync(self, texts: list[str]) -> list[list[float]]: if not self._client or not texts: return [] - out: list[list[float]] = [] - for i in range(0, len(texts), _EMBED_BATCH_SIZE): - batch = texts[i : i + _EMBED_BATCH_SIZE] - out.extend(self._create_vectors_sync(batch)) - return out + with business_span("embedding.zhipu.embed", batch_size=len(texts)): + out: list[list[float]] = [] + for i in range(0, len(texts), _EMBED_BATCH_SIZE): + batch = texts[i : i + _EMBED_BATCH_SIZE] + out.extend(self._create_vectors_sync(batch)) + return out diff --git a/api/app/adapters/llm/deepseek.py b/api/app/adapters/llm/deepseek.py index 244574c..0d6805d 100644 --- a/api/app/adapters/llm/deepseek.py +++ b/api/app/adapters/llm/deepseek.py @@ -4,6 +4,8 @@ from collections.abc import AsyncIterator from langchain_openai import ChatOpenAI +from app.core.llm_telemetry import langchain_invoke_span, observe_astream + class DeepSeekLLMProvider: """LangChain-based LLM adapter for DeepSeek and OpenAI-compatible APIs. 
@@ -56,7 +58,15 @@ class DeepSeekLLMProvider: ) -> str: llm = self._get_llm(temperature, model, max_tokens) lc_messages = _to_langchain_messages(messages) - result = await llm.ainvoke(lc_messages) + resolved_model = model or self._default_model + with langchain_invoke_span( + agent="deepseek.complete", + provider="deepseek", + model=resolved_model, + call_type="chat", + ) as tel: + result = await llm.ainvoke(lc_messages) + tel["response"] = result return str(result.content) async def stream( @@ -69,7 +79,14 @@ class DeepSeekLLMProvider: ) -> AsyncIterator[str]: llm = self._get_llm(temperature, model, max_tokens) lc_messages = _to_langchain_messages(messages) - async for chunk in llm.astream(lc_messages): + resolved_model = model or self._default_model + async for chunk in observe_astream( + llm, + lc_messages, + agent="deepseek.stream", + provider="deepseek", + model=resolved_model, + ): if chunk.content: yield str(chunk.content) diff --git a/api/app/adapters/sms/tencent.py b/api/app/adapters/sms/tencent.py index 357988b..e3f736d 100644 --- a/api/app/adapters/sms/tencent.py +++ b/api/app/adapters/sms/tencent.py @@ -7,6 +7,7 @@ from tencentcloud.common.exception.tencent_cloud_sdk_exception import ( from tencentcloud.sms.v20210111 import models as sms_models from tencentcloud.sms.v20210111 import sms_client +from app.core.business_telemetry import business_span from app.core.logging import get_logger logger = get_logger(__name__) @@ -32,6 +33,10 @@ class TencentSmsSender: self._template_param_count = template_param_count def send_verification_code(self, phone: str, code: str) -> bool: + with business_span("sms.tencent.send"): + return self._send_verification_code_inner(phone, code) + + def _send_verification_code_inner(self, phone: str, code: str) -> bool: if not self._secret_id or not self._secret_key: logger.error("Tencent SMS credentials not configured") return False diff --git a/api/app/adapters/tts/openai_tts.py b/api/app/adapters/tts/openai_tts.py index 
6c2553a..55eaf64 100644 --- a/api/app/adapters/tts/openai_tts.py +++ b/api/app/adapters/tts/openai_tts.py @@ -5,6 +5,7 @@ from io import BytesIO from openai import OpenAI +from app.core.business_telemetry import business_span from app.core.logging import get_logger logger = get_logger(__name__) @@ -35,6 +36,10 @@ class OpenAITTSProvider: *, language: str = "zh", # noqa: ARG002 — OpenAI TTS auto-detects language ) -> bytes: + with business_span("tts.synthesize", provider="openai"): + return await self._synthesize_api(text, voice) + + async def _synthesize_api(self, text: str, voice: str) -> bytes: if not self._client: return b"" try: diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py index 2377fa3..c00fa15 100644 --- a/api/app/adapters/tts/tencent_tts.py +++ b/api/app/adapters/tts/tencent_tts.py @@ -5,6 +5,7 @@ import base64 import re import uuid +from app.core.business_telemetry import business_span from app.core.logging import get_logger logger = get_logger(__name__) @@ -180,6 +181,16 @@ class TencentTTSProvider: voice: str = "alloy", *, language: str = "zh", + ) -> bytes: + with business_span("tts.synthesize", provider="tencent"): + return await self._synthesize_inner(text, voice, language=language) + + async def _synthesize_inner( + self, + text: str, + voice: str = "alloy", + *, + language: str = "zh", ) -> bytes: if not self._secret_id or not self._secret_key: logger.error( diff --git a/api/app/agents/chat/interview_agent.py b/api/app/agents/chat/interview_agent.py index 105598a..15293c8 100644 --- a/api/app/agents/chat/interview_agent.py +++ b/api/app/agents/chat/interview_agent.py @@ -38,6 +38,7 @@ from app.agents.state_schema import ( interview_control_state, narrative_coverage_state, ) +from app.core.llm_telemetry import infer_provider_model, observe_ainvoke from app.core.agent_logging import ( agent_span, log_agent_payload, @@ -331,7 +332,15 @@ class InterviewAgent: conversation_turn_total, history_pairs_windowed, ) - 
response = await chat_llm.ainvoke(messages) + provider, model = infer_provider_model(chat_llm) + response = await observe_ainvoke( + chat_llm, + messages, + agent="InterviewAgent.generate_response", + provider=provider, + model=model, + call_type="chat", + ) response_ms = (time.perf_counter() - llm_t0) * 1000 logger.info( "event=chat_llm_done agent=InterviewAgent.generate_response_with_state " @@ -384,7 +393,15 @@ class InterviewAgent: _message_contents_char_count(retry_messages), conversation_id, ) - response_retry = await chat_llm.ainvoke(retry_messages) + provider, model = infer_provider_model(chat_llm) + response_retry = await observe_ainvoke( + chat_llm, + retry_messages, + agent="InterviewAgent.duplicate_guard_retry", + provider=provider, + model=model, + call_type="chat", + ) logger.info( "event=chat_llm_done agent=InterviewAgent.duplicate_guard_retry " "response_latency_ms={:.2f}", @@ -524,7 +541,15 @@ class InterviewAgent: hw.turn_total, len(hw.window) // 2, ) - response = await opening_llm.ainvoke(messages) + provider, model = infer_provider_model(opening_llm) + response = await observe_ainvoke( + opening_llm, + messages, + agent="InterviewAgent.opening", + provider=provider, + model=model, + call_type="chat", + ) logger.info( "event=chat_llm_done agent=InterviewAgent.generate_opening_message " "response_latency_ms={:.2f}", @@ -643,7 +668,15 @@ class InterviewAgent: len(hw.window) // 2, idle_hours, ) - response = await re_greet_llm.ainvoke(messages) + provider, model = infer_provider_model(re_greet_llm) + response = await observe_ainvoke( + re_greet_llm, + messages, + agent="InterviewAgent.re_greeting", + provider=provider, + model=model, + call_type="chat", + ) logger.info( "event=chat_llm_done agent=InterviewAgent.generate_re_greeting_message " "response_latency_ms={:.2f}", diff --git a/api/app/core/agent_logging.py b/api/app/core/agent_logging.py index e2021e0..b468a99 100644 --- a/api/app/core/agent_logging.py +++ b/api/app/core/agent_logging.py @@ 
-24,7 +24,11 @@ import time from contextlib import contextmanager from typing import Any, Iterator +from opentelemetry import trace +from opentelemetry.trace import Status, StatusCode + from app.core.config import settings +from app.core.telemetry import get_tracer _dedup_lock = threading.Lock() _last_prompt_sha256_by_label: dict[str, str] = {} @@ -97,15 +101,41 @@ def agent_span( ctx = " ".join(f"{k}={v!r}" for k, v in context.items()) if agent_verbose_enabled(): logger.debug("agent_span_start {} {}", operation, ctx) - try: - yield - finally: - ms = (time.perf_counter() - t0) * 1000 + + def _log_end(ms: float) -> None: if agent_verbose_enabled(): logger.debug("agent_span_end {} duration_ms={:.2f} {}", operation, ms, ctx) elif settings.log_agent_verbose: logger.info("agent_span {} duration_ms={:.2f} {}", operation, ms, ctx) + if settings.otel_enabled: + tracer = get_tracer("app.agent") + with tracer.start_as_current_span( + "agent.operation", + attributes={"agent.operation": operation}, + ) as span: + failed = False + try: + yield + except Exception: + failed = True + if span.is_recording(): + span.set_status(Status(StatusCode.ERROR)) + raise + finally: + ms = (time.perf_counter() - t0) * 1000 + if span.is_recording(): + span.set_attribute("agent.duration_ms", round(ms, 2)) + if not failed: + span.set_status(Status(StatusCode.OK)) + _log_end(ms) + return + + try: + yield + finally: + _log_end((time.perf_counter() - t0) * 1000) + def log_agent_payload( logger: Any, diff --git a/api/app/core/alembic_revision_repair.py b/api/app/core/alembic_revision_repair.py index 69a3e87..2426348 100644 --- a/api/app/core/alembic_revision_repair.py +++ b/api/app/core/alembic_revision_repair.py @@ -9,6 +9,8 @@ _WITHDRAWN_0020_REVISIONS = frozenset( "0020_add_tts_audio_urls_column", "0020_backfill_missing_schema", "0020_backfill_all_missing_columns", + # 曾本地试运行后从仓库撤回,仅 dev 库可能残留 stamp + "0020_chapters_book_id", } ) _REPAIR_TARGET_REVISION = "0018_users_language_preference" diff --git 
a/api/app/core/business_telemetry.py b/api/app/core/business_telemetry.py new file mode 100644 index 0000000..0a0488d --- /dev/null +++ b/api/app/core/business_telemetry.py @@ -0,0 +1,81 @@ +""" +业务链路 OpenTelemetry span(回忆录阶段、WS、外部依赖等)。 +""" + +from __future__ import annotations + +import time +from contextlib import contextmanager +from typing import Any, Iterator + +from opentelemetry import trace +from opentelemetry.trace import Status, StatusCode + +from app.core.config import settings +from app.core.telemetry import get_meter, get_tracer + +_meter = None +_duration_hist = None + +# 仅低基数字段进入 span attribute(禁止 user_id / conversation_id 等) +_ALLOWED_SPAN_ATTRS = frozenset( + {"provider", "chapter_category", "segment_count", "batch_size", "hours"} +) + + +def _ensure_instruments() -> None: + global _meter, _duration_hist + if _meter is not None or not settings.otel_enabled: + return + _meter = get_meter("app.business") + _duration_hist = _meter.create_histogram( + "business.operation.duration", + unit="ms", + description="Business operation wall time", + ) + + +def _normalize_attr_value(value: Any) -> str | int | float | bool: + if isinstance(value, (str, int, float, bool)): + return value + return str(value) + + +@contextmanager +def business_span( + name: str, + /, + **attributes: Any, +) -> Iterator[trace.Span]: + if not settings.otel_enabled: + yield trace.INVALID_SPAN + return + + tracer = get_tracer("app.business") + otel_attrs = { + f"business.{k}": _normalize_attr_value(v) + for k, v in attributes.items() + if k in _ALLOWED_SPAN_ATTRS and v is not None and v != "" + } + t0 = time.perf_counter() + outcome = "ok" + with tracer.start_as_current_span(name, attributes=otel_attrs) as span: + try: + yield span + except Exception: + outcome = "error" + if span.is_recording(): + span.set_status(Status(StatusCode.ERROR)) + raise + finally: + duration_ms = (time.perf_counter() - t0) * 1000 + if span.is_recording(): + span.set_attribute("business.duration_ms", 
round(duration_ms, 2)) + if outcome == "ok": + span.set_status(Status(StatusCode.OK)) + _ensure_instruments() + if _duration_hist is not None: + _duration_hist.record( + duration_ms, + {"operation": name, "outcome": outcome}, + ) diff --git a/api/app/core/config.py b/api/app/core/config.py index 6341f26..03a172a 100644 --- a/api/app/core/config.py +++ b/api/app/core/config.py @@ -223,6 +223,36 @@ class Settings(BaseSettings): # 非空时额外写入 JSONL(serialize=True),便于 Loki/ELK;与 stderr 彩色控制台并存 log_json_file: str = "" + # ── OpenTelemetry ───────────────────────────────────────── + otel_enabled: bool = False + otel_exporter_otlp_endpoint: str = "http://localhost:48317" + otel_exporter_otlp_insecure: bool = True + otel_service_name: str = "" + otel_traces_sampler: str = Field( + default="always_on", + description="always_on | parentbased_traceidratio | always_off", + ) + otel_traces_sampler_arg: float | None = Field(default=None, ge=0.0, le=1.0) + otel_metric_export_interval_ms: int = Field(default=10_000, ge=1000, le=300_000) + + @field_validator("otel_enabled", mode="before") + @classmethod + def _coerce_otel_enabled(cls, v: object) -> bool: + if isinstance(v, bool): + return v + if v is None: + return False + return str(v).strip().lower() in ("1", "true", "yes", "on") + + @field_validator("otel_exporter_otlp_insecure", mode="before") + @classmethod + def _coerce_otel_exporter_otlp_insecure(cls, v: object) -> bool: + if isinstance(v, bool): + return v + if v is None: + return True + return str(v).strip().lower() in ("1", "true", "yes", "on") + @field_validator("celery_purge_broker_on_startup", mode="before") @classmethod def _coerce_celery_purge_broker_on_startup(cls, v: object) -> bool: diff --git a/api/app/core/langchain_llm.py b/api/app/core/langchain_llm.py index 6c09ba7..b8fd33a 100644 --- a/api/app/core/langchain_llm.py +++ b/api/app/core/langchain_llm.py @@ -16,6 +16,7 @@ from app.core.agent_logging import ( agent_verbose_enabled, log_agent_payload, ) +from 
app.core.llm_telemetry import infer_provider_model, langchain_invoke_span from app.core.logging import get_logger logger = get_logger(__name__) @@ -68,29 +69,41 @@ def invoke_json_object( sha = _prompt_sha12(prompt_for_api) attempts = 2 if retry_empty else 1 t0 = time.perf_counter() + provider, model = infer_provider_model(llm) last_content = "" - for attempt in range(attempts): - response = bound.invoke(prompt_for_api) - content = (getattr(response, "content", None) or "").strip() - last_content = content - if content: - if attempt > 0: - logger.info( - "json_object 空内容重试成功 agent={} prompt_sha12={}", + with langchain_invoke_span( + agent=tag, + provider=provider, + model=model, + call_type="json", + prompt_sha12=sha, + max_tokens=max_tokens, + ) as tel: + for attempt in range(attempts): + response = bound.invoke(prompt_for_api) + tel["response"] = response + content = (getattr(response, "content", None) or "").strip() + last_content = content + if content: + if attempt > 0: + logger.info( + "json_object 空内容重试成功 agent={} prompt_sha12={}", + tag, + sha, + ) + tel["outcome"] = "ok" + _log_json_object_done( + tag, sha, prompt_for_api, content, attempt + 1, t0, success=True + ) + return content + if attempt == 0 and retry_empty: + logger.warning( + "json_object 返回空 content,将重试 agent={} attempt={} prompt_sha12={}", tag, + attempt, sha, ) - _log_json_object_done( - tag, sha, prompt_for_api, content, attempt + 1, t0, success=True - ) - return content - if attempt == 0 and retry_empty: - logger.warning( - "json_object 返回空 content,将重试 agent={} attempt={} prompt_sha12={}", - tag, - attempt, - sha, - ) + tel["outcome"] = "error" logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha) _log_json_object_done( tag, sha, prompt_for_api, last_content, attempts, t0, success=False @@ -113,29 +126,41 @@ async def ainvoke_json_object( sha = _prompt_sha12(prompt_for_api) attempts = 2 if retry_empty else 1 t0 = time.perf_counter() + provider, model = 
infer_provider_model(llm) last_content = "" - for attempt in range(attempts): - response = await bound.ainvoke(prompt_for_api) - content = (getattr(response, "content", None) or "").strip() - last_content = content - if content: - if attempt > 0: - logger.info( - "json_object 空内容重试成功 agent={} prompt_sha12={}", + with langchain_invoke_span( + agent=tag, + provider=provider, + model=model, + call_type="json", + prompt_sha12=sha, + max_tokens=max_tokens, + ) as tel: + for attempt in range(attempts): + response = await bound.ainvoke(prompt_for_api) + tel["response"] = response + content = (getattr(response, "content", None) or "").strip() + last_content = content + if content: + if attempt > 0: + logger.info( + "json_object 空内容重试成功 agent={} prompt_sha12={}", + tag, + sha, + ) + tel["outcome"] = "ok" + _log_json_object_done( + tag, sha, prompt_for_api, content, attempt + 1, t0, success=True + ) + return content + if attempt == 0 and retry_empty: + logger.warning( + "json_object 返回空 content,将重试 agent={} attempt={} prompt_sha12={}", tag, + attempt, sha, ) - _log_json_object_done( - tag, sha, prompt_for_api, content, attempt + 1, t0, success=True - ) - return content - if attempt == 0 and retry_empty: - logger.warning( - "json_object 返回空 content,将重试 agent={} attempt={} prompt_sha12={}", - tag, - attempt, - sha, - ) + tel["outcome"] = "error" logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha) _log_json_object_done( tag, sha, prompt_for_api, last_content, attempts, t0, success=False diff --git a/api/app/core/llm_call.py b/api/app/core/llm_call.py index 4b4ae7b..52cf767 100644 --- a/api/app/core/llm_call.py +++ b/api/app/core/llm_call.py @@ -30,6 +30,12 @@ from app.core.langchain_llm import ( ) from app.core.llm_errors import LlmHttpErrorVendor, format_llm_http_error_message from app.core.llm_http_openai_chat_errors import should_log_openai_error_as_warning +from app.core.llm_telemetry import ( + extract_token_usage, + infer_provider_model, + llm_call_span, 
+ record_llm_call, +) from app.core.logging import get_logger logger = get_logger(__name__) @@ -138,14 +144,16 @@ def _invoke_raw_sync( max_tokens: int, agent: str, retry_empty: bool, -) -> tuple[str, int]: +) -> tuple[str, int, int, int]: prompt_for_api = ensure_json_object_prompt_has_json_keyword(prompt) bound = bind_json_object_mode(llm, max_tokens=max_tokens) tag = agent or "json_object" sha = _prompt_sha12(prompt_for_api) attempts = 2 if retry_empty else 1 + last_in, last_out = 0, 0 for attempt in range(attempts): response = bound.invoke(prompt_for_api) + last_in, last_out = extract_token_usage(response) content = (getattr(response, "content", None) or "").strip() if content: if attempt > 0: @@ -154,7 +162,7 @@ def _invoke_raw_sync( tag, sha, ) - return content, attempt + 1 + return content, attempt + 1, last_in, last_out if attempt == 0 and retry_empty: logger.warning( "json_object 返回空 content,将重试 agent={} attempt={} prompt_sha12={}", @@ -163,7 +171,7 @@ def _invoke_raw_sync( sha, ) logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha) - return "", attempts + return "", attempts, last_in, last_out async def _invoke_raw_async( @@ -173,14 +181,16 @@ async def _invoke_raw_async( max_tokens: int, agent: str, retry_empty: bool, -) -> tuple[str, int]: +) -> tuple[str, int, int, int]: prompt_for_api = ensure_json_object_prompt_has_json_keyword(prompt) bound = bind_json_object_mode(llm, max_tokens=max_tokens) tag = agent or "json_object" sha = _prompt_sha12(prompt_for_api) attempts = 2 if retry_empty else 1 + last_in, last_out = 0, 0 for attempt in range(attempts): response = await bound.ainvoke(prompt_for_api) + last_in, last_out = extract_token_usage(response) content = (getattr(response, "content", None) or "").strip() if content: if attempt > 0: @@ -189,7 +199,7 @@ async def _invoke_raw_async( tag, sha, ) - return content, attempt + 1 + return content, attempt + 1, last_in, last_out if attempt == 0 and retry_empty: logger.warning( "json_object 返回空 
content,将重试 agent={} attempt={} prompt_sha12={}", @@ -198,7 +208,7 @@ async def _invoke_raw_async( sha, ) logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha) - return "", attempts + return "", attempts, last_in, last_out def _parse_and_validate( @@ -252,6 +262,12 @@ def _emit_meta( parse_ok: bool, used_fallback: bool, error_kind: str | None, + provider: str, + model: str, + prompt_sha12: str, + input_tokens: int = 0, + output_tokens: int = 0, + span: Any | None = None, ) -> None: meta = LLMCallMeta( agent=agent, @@ -263,17 +279,35 @@ def _emit_meta( used_fallback=used_fallback, error_kind=error_kind, ) - logger.bind( - event="llm_json_call", + bind = { + "event": "llm_json_call", + "agent": meta.agent, + "schema": meta.schema_name, + "max_tokens": meta.max_tokens, + "duration_ms": round(meta.duration_ms, 2), + "attempts": meta.attempts, + "parse_ok": meta.parse_ok, + "used_fallback": meta.used_fallback, + "error_kind": meta.error_kind, + "provider": provider, + "prompt_sha12": prompt_sha12, + } + logger.bind(**bind).info("llm_json_call_done") + record_llm_call( agent=meta.agent, - schema=meta.schema_name, - max_tokens=meta.max_tokens, - duration_ms=round(meta.duration_ms, 2), + schema_name=meta.schema_name, + provider=provider, + model=model, + duration_ms=meta.duration_ms, attempts=meta.attempts, parse_ok=meta.parse_ok, used_fallback=meta.used_fallback, error_kind=meta.error_kind, - ).info("llm_json_call_done") + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, + ) def llm_json_call( @@ -288,13 +322,59 @@ def llm_json_call( http_error_vendor: LlmHttpErrorVendor = "deepseek", ) -> T: """同步:invoke → 解析 JSON → `schema.model_validate`;失败时 `fallback_factory` 或 `LLMCallError`。""" - t0 = time.perf_counter() schema_name = getattr(schema, "__name__", str(schema)) + provider, model = infer_provider_model(llm, http_error_vendor=http_error_vendor) + prompt_sha12 = _prompt_sha12(prompt) + + with llm_call_span( 
+ agent=agent, + schema_name=schema_name, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + max_tokens=max_tokens, + ) as span: + return _llm_json_call_sync_body( + llm, + prompt, + schema, + max_tokens=max_tokens, + agent=agent, + fallback_factory=fallback_factory, + retry_empty=retry_empty, + http_error_vendor=http_error_vendor, + schema_name=schema_name, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + span=span, + ) + + +def _llm_json_call_sync_body( + llm: Any, + prompt: str, + schema: type[T], + *, + max_tokens: int, + agent: str, + fallback_factory: Callable[[], T] | None, + retry_empty: bool, + http_error_vendor: LlmHttpErrorVendor, + schema_name: str, + provider: str, + model: str, + prompt_sha12: str, + span: Any, +) -> T: + t0 = time.perf_counter() attempts_used = 0 + input_tokens = 0 + output_tokens = 0 raw = "" try: - raw, attempts_used = _invoke_raw_sync( + raw, attempts_used, input_tokens, output_tokens = _invoke_raw_sync( llm, prompt, max_tokens=max_tokens, @@ -311,6 +391,12 @@ def llm_json_call( parse_ok=True, used_fallback=False, error_kind=None, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( @@ -331,6 +417,12 @@ def llm_json_call( parse_ok=False, used_fallback=used_fb, error_kind=e.kind, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( @@ -354,6 +446,12 @@ def llm_json_call( parse_ok=False, used_fallback=used_fb, error_kind="invoke", + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( @@ -383,13 +481,59 @@ async def allm_json_call( http_error_vendor: LlmHttpErrorVendor = "deepseek", ) -> T: """异步版,语义与 
`llm_json_call` 一致。""" - t0 = time.perf_counter() schema_name = getattr(schema, "__name__", str(schema)) + provider, model = infer_provider_model(llm, http_error_vendor=http_error_vendor) + prompt_sha12 = _prompt_sha12(prompt) + + with llm_call_span( + agent=agent, + schema_name=schema_name, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + max_tokens=max_tokens, + ) as span: + return await _allm_json_call_async_body( + llm, + prompt, + schema, + max_tokens=max_tokens, + agent=agent, + fallback_factory=fallback_factory, + retry_empty=retry_empty, + http_error_vendor=http_error_vendor, + schema_name=schema_name, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + span=span, + ) + + +async def _allm_json_call_async_body( + llm: Any, + prompt: str, + schema: type[T], + *, + max_tokens: int, + agent: str, + fallback_factory: Callable[[], T] | None, + retry_empty: bool, + http_error_vendor: LlmHttpErrorVendor, + schema_name: str, + provider: str, + model: str, + prompt_sha12: str, + span: Any, +) -> T: + t0 = time.perf_counter() attempts_used = 0 + input_tokens = 0 + output_tokens = 0 raw = "" try: - raw, attempts_used = await _invoke_raw_async( + raw, attempts_used, input_tokens, output_tokens = await _invoke_raw_async( llm, prompt, max_tokens=max_tokens, @@ -406,6 +550,12 @@ async def allm_json_call( parse_ok=True, used_fallback=False, error_kind=None, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( @@ -426,6 +576,12 @@ async def allm_json_call( parse_ok=False, used_fallback=used_fb, error_kind=e.kind, + provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( @@ -449,6 +605,12 @@ async def allm_json_call( parse_ok=False, used_fallback=used_fb, error_kind="invoke", + 
provider=provider, + model=model, + prompt_sha12=prompt_sha12, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, ) if agent_verbose_enabled(): log_agent_payload( diff --git a/api/app/core/llm_gateway.py b/api/app/core/llm_gateway.py index 3e7c16d..29eadf1 100644 --- a/api/app/core/llm_gateway.py +++ b/api/app/core/llm_gateway.py @@ -14,6 +14,7 @@ from pydantic import BaseModel from app.core.dependencies import get_llm_provider, get_llm_provider_fast from app.core.llm_call import allm_json_call, llm_json_call +from app.core.llm_telemetry import langchain_invoke_span T = TypeVar("T", bound=BaseModel) @@ -58,16 +59,32 @@ class LlmGateway: else 0.7 ) ) - return await provider.complete( - messages, + resolved_model = model if model is not None else (use_case.model if use_case else None) + agent_name = use_case.name if use_case else "llm_gateway.chat" + kwargs = dict( + messages=messages, temperature=resolved_temperature, - model=model if model is not None else (use_case.model if use_case else None), + model=resolved_model, max_tokens=( max_tokens if max_tokens is not None else (use_case.max_tokens if use_case else None) ), ) + # DeepSeekProvider.complete 已包 langchain_invoke_span,避免双层 span + from app.adapters.llm.deepseek import DeepSeekLLMProvider + + if isinstance(provider, DeepSeekLLMProvider): + return await provider.complete(**kwargs) + + provider_label = type(provider).__name__.replace("Provider", "").lower() or "unknown" + with langchain_invoke_span( + agent=agent_name, + provider=provider_label, + model=resolved_model or "unknown", + call_type="chat", + ): + return await provider.complete(**kwargs) async def json_object( self, diff --git a/api/app/core/llm_telemetry.py b/api/app/core/llm_telemetry.py new file mode 100644 index 0000000..6d52d55 --- /dev/null +++ b/api/app/core/llm_telemetry.py @@ -0,0 +1,384 @@ +""" +LLM 调用 OpenTelemetry span 与 metrics(低基数 attributes,不含 prompt/response 正文)。 +""" + +from __future__ import annotations + 
+import time +from contextlib import contextmanager +from typing import Any, Iterator, Literal + +from opentelemetry import trace +from opentelemetry.trace import Status, StatusCode + +from app.core.config import settings +from app.core.telemetry import get_meter, get_tracer + +CallType = Literal["json", "chat", "stream"] + +_meter = None +_duration_hist = None +_call_counter = None +_tokens_in_counter = None +_tokens_out_counter = None + + +def _ensure_instruments() -> None: + global _meter, _duration_hist, _call_counter, _tokens_in_counter, _tokens_out_counter + if _meter is not None or not settings.otel_enabled: + return + _meter = get_meter("app.llm") + _duration_hist = _meter.create_histogram( + "llm.call.duration", + unit="ms", + description="LLM call wall time", + ) + _call_counter = _meter.create_counter( + "llm.call.total", + description="LLM call count by outcome", + ) + _tokens_in_counter = _meter.create_counter( + "llm.tokens.input", + description="LLM input tokens when reported by provider", + ) + _tokens_out_counter = _meter.create_counter( + "llm.tokens.output", + description="LLM output tokens when reported by provider", + ) + + +def infer_provider_model( + llm: Any, + *, + http_error_vendor: str = "deepseek", +) -> tuple[str, str]: + model = "" + for attr in ("model_name", "model"): + v = getattr(llm, attr, None) + if v: + model = str(v) + break + provider = (http_error_vendor or "unknown").strip().lower() + return provider, model + + +def _outcome_label(*, parse_ok: bool, used_fallback: bool, error_kind: str | None) -> str: + if parse_ok and not used_fallback: + return "ok" + if used_fallback: + return "fallback" + return error_kind or "error" + + +def extract_token_usage(response: Any) -> tuple[int, int]: + """从 LangChain AIMessage / chunk 解析 token 用量。""" + usage = getattr(response, "usage_metadata", None) + if usage is None and hasattr(response, "response_metadata"): + meta = getattr(response, "response_metadata", None) or {} + if 
isinstance(meta, dict): + usage = meta.get("token_usage") or meta.get("usage") + if usage is None: + return 0, 0 + if isinstance(usage, dict): + inp = usage.get("input_tokens") or usage.get("prompt_tokens") or 0 + out = usage.get("output_tokens") or usage.get("completion_tokens") or 0 + return int(inp or 0), int(out or 0) + inp = getattr(usage, "input_tokens", None) or getattr(usage, "prompt_tokens", None) or 0 + out = ( + getattr(usage, "output_tokens", None) + or getattr(usage, "completion_tokens", None) + or 0 + ) + return int(inp or 0), int(out or 0) + + +def record_llm_completion( + *, + agent: str, + provider: str, + model: str, + duration_ms: float, + call_type: CallType = "chat", + outcome: str = "ok", + input_tokens: int = 0, + output_tokens: int = 0, + span: trace.Span | None = None, + extra_span_attributes: dict[str, Any] | None = None, +) -> None: + if not settings.otel_enabled: + return + + _ensure_instruments() + attrs = { + "agent": agent, + "provider": provider, + "call_type": call_type, + "outcome": outcome, + } + if _duration_hist is not None: + _duration_hist.record(duration_ms, attrs) + if _call_counter is not None: + _call_counter.add(1, attrs) + if input_tokens > 0 and _tokens_in_counter is not None: + _tokens_in_counter.add(input_tokens, {"provider": provider, "agent": agent}) + if output_tokens > 0 and _tokens_out_counter is not None: + _tokens_out_counter.add(output_tokens, {"provider": provider, "agent": agent}) + + if span is not None and span.is_recording(): + span.set_attribute("llm.duration_ms", round(duration_ms, 2)) + span.set_attribute("llm.call_type", call_type) + span.set_attribute("llm.outcome", outcome) + if input_tokens: + span.set_attribute("llm.tokens.input", input_tokens) + if output_tokens: + span.set_attribute("llm.tokens.output", output_tokens) + if extra_span_attributes: + for k, v in extra_span_attributes.items(): + span.set_attribute(k, v) + if outcome == "ok": + span.set_status(Status(StatusCode.OK)) + elif outcome == 
"fallback": + span.set_status(Status(StatusCode.OK, "fallback")) + else: + span.set_status(Status(StatusCode.ERROR, outcome)) + + +@contextmanager +def langchain_invoke_span( + *, + agent: str, + provider: str, + model: str, + call_type: CallType, + prompt_sha12: str = "", + max_tokens: int | None = None, +) -> Iterator[dict[str, Any]]: + """ + 包住 LangChain invoke/ainvoke;yield 可变 dict 供调用方写入 response 后触发 record。 + keys: response, outcome, input_tokens, output_tokens, error_kind + """ + ctx: dict[str, Any] = { + "response": None, + "outcome": "ok", + "input_tokens": 0, + "output_tokens": 0, + } + if not settings.otel_enabled: + yield ctx + return + + tracer = get_tracer("app.llm") + span_name = { + "json": "llm.json_invoke", + "chat": "llm.chat_invoke", + "stream": "llm.stream_invoke", + }.get(call_type, "llm.invoke") + attrs: dict[str, Any] = { + "llm.agent": agent, + "llm.provider": provider, + "llm.model": model or "unknown", + "llm.call_type": call_type, + } + if prompt_sha12: + attrs["llm.prompt_sha12"] = prompt_sha12 + if max_tokens is not None: + attrs["llm.max_tokens"] = max_tokens + + t0 = time.perf_counter() + with tracer.start_as_current_span(span_name, attributes=attrs) as span: + try: + yield ctx + except Exception: + ctx["outcome"] = "error" + raise + finally: + duration_ms = (time.perf_counter() - t0) * 1000 + resp = ctx.get("response") + if resp is not None and not ctx.get("input_tokens") and not ctx.get("output_tokens"): + inp, out = extract_token_usage(resp) + ctx["input_tokens"] = inp + ctx["output_tokens"] = out + record_llm_completion( + agent=agent, + provider=provider, + model=model, + duration_ms=duration_ms, + call_type=call_type, + outcome=str(ctx.get("outcome") or "ok"), + input_tokens=int(ctx.get("input_tokens") or 0), + output_tokens=int(ctx.get("output_tokens") or 0), + span=span, + ) + + +@contextmanager +def llm_call_span( + *, + agent: str, + schema_name: str, + provider: str, + model: str, + prompt_sha12: str, + max_tokens: int, +) 
-> Iterator[trace.Span]: + if not settings.otel_enabled: + yield trace.INVALID_SPAN + return + tracer = get_tracer("app.llm") + with tracer.start_as_current_span( + "llm.json_call", + attributes={ + "llm.agent": agent, + "llm.schema_name": schema_name, + "llm.provider": provider, + "llm.model": model or "unknown", + "llm.prompt_sha12": prompt_sha12, + "llm.max_tokens": max_tokens, + "llm.call_type": "json", + }, + ) as span: + yield span + + +async def observe_ainvoke( + llm: Any, + messages: Any, + *, + agent: str, + provider: str = "deepseek", + model: str = "", + call_type: CallType = "chat", + extra_span_attributes: dict[str, Any] | None = None, + record_response_latency_ms: bool = True, +) -> Any: + """包装 ``ainvoke``,统一 span + metrics。""" + t0 = time.perf_counter() + with langchain_invoke_span( + agent=agent, + provider=provider, + model=model, + call_type=call_type, + ) as tel: + result = await llm.ainvoke(messages) + tel["response"] = result + span = trace.get_current_span() + if span.is_recording(): + if record_response_latency_ms: + span.set_attribute( + "llm.response_latency_ms", + round((time.perf_counter() - t0) * 1000, 2), + ) + if extra_span_attributes: + for key, value in extra_span_attributes.items(): + if value is not None: + span.set_attribute(key, value) + return result + + +async def observe_astream( + llm: Any, + prompt: Any, + *, + agent: str, + provider: str = "deepseek", + model: str = "", +): + """包装 ``astream``,记录 wall time 与可选 TTFT。""" + if not settings.otel_enabled: + async for chunk in llm.astream(prompt): + yield chunk + return + + tracer = get_tracer("app.llm") + t0 = time.perf_counter() + ttft_ms: float | None = None + last_chunk: Any = None + with tracer.start_as_current_span( + "llm.stream_invoke", + attributes={ + "llm.agent": agent, + "llm.provider": provider, + "llm.model": model or "unknown", + "llm.call_type": "stream", + }, + ) as span: + try: + async for chunk in llm.astream(prompt): + if ttft_ms is None and getattr(chunk, 
"content", None): + ttft_ms = (time.perf_counter() - t0) * 1000 + last_chunk = chunk + yield chunk + except Exception: + duration_ms = (time.perf_counter() - t0) * 1000 + record_llm_completion( + agent=agent, + provider=provider, + model=model, + duration_ms=duration_ms, + call_type="stream", + outcome="error", + span=span, + extra_span_attributes=( + {"llm.ttft_ms": round(ttft_ms, 2)} if ttft_ms is not None else None + ), + ) + raise + duration_ms = (time.perf_counter() - t0) * 1000 + inp, out = extract_token_usage(last_chunk) if last_chunk else (0, 0) + extra: dict[str, Any] = {} + if ttft_ms is not None: + extra["llm.ttft_ms"] = round(ttft_ms, 2) + record_llm_completion( + agent=agent, + provider=provider, + model=model, + duration_ms=duration_ms, + call_type="stream", + outcome="ok", + input_tokens=inp, + output_tokens=out, + span=span, + extra_span_attributes=extra or None, + ) + + +def record_llm_call( + *, + agent: str, + schema_name: str, + provider: str, + model: str, + duration_ms: float, + attempts: int, + parse_ok: bool, + used_fallback: bool, + error_kind: str | None, + prompt_sha12: str, + input_tokens: int = 0, + output_tokens: int = 0, + span: trace.Span | None = None, +) -> None: + outcome = _outcome_label( + parse_ok=parse_ok, + used_fallback=used_fallback, + error_kind=error_kind, + ) + record_llm_completion( + agent=agent, + provider=provider, + model=model, + duration_ms=duration_ms, + call_type="json", + outcome=outcome, + input_tokens=input_tokens, + output_tokens=output_tokens, + span=span, + extra_span_attributes={ + "llm.schema_name": schema_name, + "llm.attempts": attempts, + "llm.parse_ok": parse_ok, + "llm.used_fallback": used_fallback, + **({"llm.error_kind": error_kind} if error_kind else {}), + **({"llm.prompt_sha12": prompt_sha12} if prompt_sha12 else {}), + }, + ) diff --git a/api/app/core/logging.py b/api/app/core/logging.py index 18175bd..b065ce9 100644 --- a/api/app/core/logging.py +++ b/api/app/core/logging.py @@ -108,10 
+108,32 @@ def _stdlib_emit_display(log_record: logging.LogRecord) -> tuple[str, int]: return fn, ln +def _merge_trace_context(record: Any) -> None: + """每条日志合并当前 OTel trace/span(覆盖 Celery/后台无 HTTP middleware 的场景)。""" + try: + from app.core.telemetry import current_trace_context + + ctx = current_trace_context() + if not ctx: + return + except Exception: + return + ex = record["extra"] + for k, v in ctx.items(): + if not v: + continue + cur = ex.get(k) + if cur is None or str(cur).strip() in ("", "-"): + ex[k] = v + + def _stderr_format(record: Any) -> str: - """控制台 sink:request_id / correlation_id / user_id 有值时才显示对应列。""" + """控制台 sink:request_id / correlation_id / user_id / trace_id 有值时才显示对应列。""" rid = str(record["extra"].get("request_id") or "").strip() rid_part = "rid={extra[request_id]} | " if rid and rid != "-" else "" + tid = str(record["extra"].get("trace_id") or "").strip() + tid_short = tid[:12] if len(tid) > 12 else tid + tid_part = f"tid={tid_short} | " if tid else "" cid = str(record["extra"].get("correlation_id") or "").strip() cid_part = "corr={extra[correlation_id]} | " if cid else "" uid = str(record["extra"].get("user_id") or "").strip() @@ -120,7 +142,7 @@ def _stderr_format(record: Any) -> str: "{time:YYYY-MM-DD HH:mm:ss.SSS} | " "{level.name: <8} | " "{extra[module]}:{function}:{line} | " - f"{rid_part}{cid_part}{uid_part}" + f"{rid_part}{tid_part}{cid_part}{uid_part}" "{message}\n{exception}" ) @@ -242,8 +264,8 @@ def setup_logging() -> None: enqueue=True, ) - logger.configure(extra={"request_id": "-", "module": "-"}) - logger = logger.patch(_merge_celery_worker_extra) + logger.configure(extra={"request_id": "-", "module": "-", "trace_id": "", "span_id": ""}) + logger = logger.patch(_merge_celery_worker_extra).patch(_merge_trace_context) # 仅 root 挂 InterceptHandler,避免子 logger 与 root 各处理一次导致重复行 root = logging.getLogger() diff --git a/api/app/core/middleware.py b/api/app/core/middleware.py index 9bdcdc0..c7a708d 100644 --- 
a/api/app/core/middleware.py +++ b/api/app/core/middleware.py @@ -8,6 +8,7 @@ from starlette.middleware.base import BaseHTTPMiddleware from starlette.requests import Request from app.core.logging import logger +from app.core.telemetry import current_trace_context class RequestIdMiddleware(BaseHTTPMiddleware): @@ -17,7 +18,8 @@ class RequestIdMiddleware(BaseHTTPMiddleware): request_id = request.headers.get("X-Request-ID") or str(uuid.uuid4()) request.state.request_id = request_id - with logger.contextualize(request_id=request_id): + bind = {"request_id": request_id, **current_trace_context()} + with logger.contextualize(**bind): response = await call_next(request) response.headers["X-Request-ID"] = request_id diff --git a/api/app/core/telemetry.py b/api/app/core/telemetry.py new file mode 100644 index 0000000..21b45a4 --- /dev/null +++ b/api/app/core/telemetry.py @@ -0,0 +1,146 @@ +""" +OpenTelemetry 初始化:traces / metrics / logs 导出至 OTLP Collector。 + +在 ``setup_logging()`` 之后、FastAPI / Celery 应用创建前调用 ``setup_telemetry(service_name=...)``。 +``OTEL_ENABLED=false`` 时无操作,便于测试与无 Collector 环境。 +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from opentelemetry import metrics, trace +from opentelemetry._logs import set_logger_provider +from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.celery import CeleryInstrumentor +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +from opentelemetry.instrumentation.logging import LoggingInstrumentor +from opentelemetry.instrumentation.redis import RedisInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor +from 
opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio + +from app.core.config import settings + +if TYPE_CHECKING: + from fastapi import FastAPI + +_initialized = False +_otel_logging_handler: LoggingHandler | None = None + + +def _build_resource(service_name: str) -> Resource: + return Resource.create( + { + "service.name": service_name, + "deployment.environment": settings.app_environment, + "service.version": "0.2.0", + } + ) + + +def _build_sampler(): + from opentelemetry.sdk.trace.sampling import ( + ALWAYS_OFF, + ALWAYS_ON, + TraceIdRatioBased, + ) + + name = (settings.otel_traces_sampler or "always_on").strip().lower() + arg = settings.otel_traces_sampler_arg + if name in ("always_on", "alwayson"): + return ALWAYS_ON + if name in ("always_off", "alwaysoff"): + return ALWAYS_OFF + ratio = 0.1 if arg is None else arg + if name == "traceidratio": + return TraceIdRatioBased(ratio) + return ParentBasedTraceIdRatio(ratio) + + +def setup_telemetry(*, service_name: str) -> None: + """配置 OTLP exporter 与自动 instrumentation(幂等)。""" + global _initialized, _otel_logging_handler + if _initialized or not settings.otel_enabled: + return + + endpoint = settings.otel_exporter_otlp_endpoint.rstrip("/") + insecure = settings.otel_exporter_otlp_insecure + + resource = _build_resource(service_name) + + span_exporter = OTLPSpanExporter(endpoint=endpoint, insecure=insecure) + tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler()) + tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter)) + 
trace.set_tracer_provider(tracer_provider) + + metric_exporter = OTLPMetricExporter(endpoint=endpoint, insecure=insecure) + metric_reader = PeriodicExportingMetricReader( + metric_exporter, + export_interval_millis=settings.otel_metric_export_interval_ms, + ) + meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) + metrics.set_meter_provider(meter_provider) + + log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure) + log_provider = LoggerProvider(resource=resource) + log_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter)) + set_logger_provider(log_provider) + + LoggingInstrumentor().instrument(set_logging_format=True) + _otel_logging_handler = LoggingHandler( + level=logging.NOTSET, + logger_provider=log_provider, + ) + logging.getLogger().addHandler(_otel_logging_handler) + + HTTPXClientInstrumentor().instrument() + RedisInstrumentor().instrument() + SQLAlchemyInstrumentor().instrument() + + _initialized = True + + +def instrument_fastapi_app(app: FastAPI) -> None: + if not settings.otel_enabled: + return + FastAPIInstrumentor.instrument_app( + app, + excluded_urls="/health", + ) + + +def instrument_celery() -> None: + if not settings.otel_enabled: + return + CeleryInstrumentor().instrument() + + +def get_tracer(name: str): + return trace.get_tracer(name) + + +def get_meter(name: str): + return metrics.get_meter(name) + + +def current_trace_context() -> dict[str, str]: + """返回当前 span 的 trace_id / span_id(十六进制),无活跃 span 时为空 dict。""" + span = trace.get_current_span() + ctx = span.get_span_context() + if not ctx.is_valid: + return {} + return { + "trace_id": format(ctx.trace_id, "032x"), + "span_id": format(ctx.span_id, "016x"), + } diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py index bfeeca6..0745491 100644 --- a/api/app/features/conversation/ws/pipeline.py +++ b/api/app/features/conversation/ws/pipeline.py @@ -20,6 +20,7 @@ from 
sqlalchemy.ext.asyncio import AsyncSession from app.agents.chat import ChatOrchestrator from app.agents.chat.reply_limits import segments_from_llm_response from app.core.agent_logging import agent_summary_enabled +from app.core.business_telemetry import business_span from app.core.config import settings from app.core.cos_url_keys import ( TTS_PRESIGNED_EXPIRES_SEC, @@ -634,6 +635,12 @@ def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]: async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str: """超过 55 s 的音频自动切片后并行 ASR;短音频直接转写。""" asr = get_asr_provider() + return await _transcribe_long_audio_inner(audio_bytes, fmt, asr) + + +async def _transcribe_long_audio_inner( + audio_bytes: bytes, fmt: str, asr: Any +) -> str: try: chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt) except Exception as exc: @@ -938,6 +945,32 @@ async def process_user_message( tts_this_turn: Optional[bool] = None, ) -> None: """处理用户消息,生成 Agent 回应。由 ChatOrchestrator 路由到 ProfileAgent 或 InterviewAgent。""" + with business_span("conversation.ws.process_turn"): + await _process_user_message_inner( + conversation_id, + user_message, + conversation, + segment, + db, + user, + user_message_timestamp, + force_skip_tts=force_skip_tts, + tts_this_turn=tts_this_turn, + ) + + +async def _process_user_message_inner( + conversation_id: str, + user_message: str, + conversation: Conversation, + segment: Segment, + db: AsyncSession, + user: User = None, + user_message_timestamp: Optional[datetime] = None, + *, + force_skip_tts: bool = False, + tts_this_turn: Optional[bool] = None, +) -> None: store = ConversationHistoryStore(db) tts_urls: list[str] = [] user_language = _resolve_user_language(user) diff --git a/api/app/features/evaluation/judge_service.py b/api/app/features/evaluation/judge_service.py index e4b89fe..d9f9929 100644 --- a/api/app/features/evaluation/judge_service.py +++ b/api/app/features/evaluation/judge_service.py @@ -445,7 +445,16 @@ class 
EvalJudgeService: if hasattr(llm, "bind"): llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX) try: - async for chunk in llm.astream(prompt): + from app.core.llm_telemetry import infer_provider_model, observe_astream + + provider, model = infer_provider_model(llm, http_error_vendor="zhipu") + async for chunk in observe_astream( + llm, + prompt, + agent="EvalJudge.stream_conversation_compare", + provider=provider, + model=model, + ): piece = getattr(chunk, "content", None) if piece: yield piece diff --git a/api/app/features/memoir/story_pipeline_sync.py b/api/app/features/memoir/story_pipeline_sync.py index f4cc3f6..dc127d6 100644 --- a/api/app/features/memoir/story_pipeline_sync.py +++ b/api/app/features/memoir/story_pipeline_sync.py @@ -27,6 +27,7 @@ from app.agents.memoir.story_route_agent import ( StoryRouteAgent, default_append_target_story_id, ) +from app.core.business_telemetry import business_span from app.agents.stage_constants import ( CATEGORY_TO_CHAT_STAGE, CHAPTER_CATEGORIES, @@ -996,6 +997,46 @@ def run_story_pipeline_for_category_batch( 返回 :class:`StoryPipelineResult`。低置信路由会被延迟而不创建 Story/Chapter。 """ + with business_span( + "memoir.story_pipeline.batch", + chapter_category=chapter_category, + segment_count=len(category_segments), + ): + return _run_story_pipeline_batch_inner( + session, + user_id=user_id, + chapter_category=chapter_category, + category_segments=category_segments, + state=state, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + background_voice=background_voice, + occupation=occupation, + memoir_correlation_id=memoir_correlation_id, + llm_fast=llm_fast, + memory_evidence=memory_evidence, + language=language, + ) + + +def _run_story_pipeline_batch_inner( + session: Session, + *, + user_id: str, + chapter_category: str, + category_segments: list, + state: MemoirStateSchema, + user_profile: str, + user_birth_year: int | None, + llm: Any, + background_voice: str = "default", + occupation: str = "", + 
memoir_correlation_id: str | None = None, + llm_fast: Any | None = None, + memory_evidence: dict | None = None, + language: str = "zh", +) -> StoryPipelineResult: pipeline_phase_timings: dict[str, float] = {} narrative_agent = NarrativeAgent() route_agent = StoryRouteAgent() @@ -1013,9 +1054,10 @@ def run_story_pipeline_for_category_batch( top_k = int(settings.evidence_top_k_large_batch) def _oral_job() -> tuple[str, float]: - t_oral = time.perf_counter() - out = normalize_oral_for_memoir(combined_text, llm=llm) - return out, time.perf_counter() - t_oral + with business_span("memoir.story_pipeline.oral_normalize"): + t_oral = time.perf_counter() + out = normalize_oral_for_memoir(combined_text, llm=llm) + return out, time.perf_counter() - t_oral _t_parallel = time.perf_counter() with ThreadPoolExecutor(max_workers=1) as pool: @@ -1045,7 +1087,8 @@ def run_story_pipeline_for_category_batch( top_k, ) - evidence_text = format_evidence_chunks_for_prompt(evidence) + with business_span("memoir.story_pipeline.evidence_prep", chapter_category=chapter_category): + evidence_text = format_evidence_chunks_for_prompt(evidence) ct_raw = (combined_text or "").strip() om_norm = (oral_for_memoir or "").strip() if ct_raw != om_norm: @@ -1099,35 +1142,36 @@ def run_story_pipeline_for_category_batch( calculated_order_index = STAGE_TO_ORDER.get(chapter_category, 999) _t0 = time.perf_counter() - use_batch_plan = ( - llm_route - and len(category_segments) >= 2 - and len(category_segments) <= PLAN_BATCH_MAX_SEGMENTS - ) - plan: StoryBatchPlan | None = None - if use_batch_plan: - segs = _route_segment_texts(category_segments) - plan = route_agent.plan_batch( - chapter_category=chapter_category, - chapter_title=title, - segments=segs, - candidate_stories=candidates, - llm=llm_route, - valid_story_ids=valid_ids, - story_meta=story_meta, + with business_span("memoir.story_pipeline.route", chapter_category=chapter_category): + use_batch_plan = ( + llm_route + and len(category_segments) >= 2 + 
and len(category_segments) <= PLAN_BATCH_MAX_SEGMENTS ) + plan: StoryBatchPlan | None = None + if use_batch_plan: + segs = _route_segment_texts(category_segments) + plan = route_agent.plan_batch( + chapter_category=chapter_category, + chapter_title=title, + segments=segs, + candidate_stories=candidates, + llm=llm_route, + valid_story_ids=valid_ids, + story_meta=story_meta, + ) - single_route: Any = None - if plan is None: - single_route = route_agent.decide( - chapter_category=chapter_category, - chapter_title=title, - batch_transcript=route_transcript, - candidate_stories=candidates, - llm=llm_route, - valid_story_ids=valid_ids, - story_meta=story_meta, - ) + single_route: Any = None + if plan is None: + single_route = route_agent.decide( + chapter_category=chapter_category, + chapter_title=title, + batch_transcript=route_transcript, + candidate_stories=candidates, + llm=llm_route, + valid_story_ids=valid_ids, + story_meta=story_meta, + ) pipeline_phase_timings["route"] = time.perf_counter() - _t0 if ( @@ -1166,89 +1210,91 @@ def run_story_pipeline_for_category_batch( ) _t0 = time.perf_counter() - if plan is not None: - dispatch_ids = _run_batch_plan_writes( - session, - plan=plan, - category_segments=category_segments, - chapter=chapter, - chapter_category=chapter_category, - evidence_text=evidence_text, - evidence=evidence, - evidence_top_k=top_k, - slot_snippets=slot_snippets, - user_id=user_id, - user_profile=user_profile, - user_birth_year=user_birth_year, - llm=llm, - narrative_agent=narrative_agent, - candidate_stories=candidates, - story_meta=story_meta, - background_voice=background_voice, - occupation=occupation, - memoir_correlation_id=memoir_correlation_id, - fidelity_llm=llm_fidelity, - language=language, - ) - else: - route = single_route - decision_source = ( - route.reason - if route.reason in FALLBACK_NEW_STORY_REASONS - else ("fallback_no_llm" if not llm_route else "single_decide") - ) - target_story_id, existing_for_narrative, decision_source = 
( - _resolve_append_target( + with business_span("memoir.story_pipeline.narrative_writes", chapter_category=chapter_category): + if plan is not None: + dispatch_ids = _run_batch_plan_writes( session, - route_decision=route.decision, - route_target_story_id=route.target_story_id, - user_id=user_id, + plan=plan, + category_segments=category_segments, + chapter=chapter, chapter_category=chapter_category, - oral_norm=om_norm, + evidence_text=evidence_text, + evidence=evidence, + evidence_top_k=top_k, + slot_snippets=slot_snippets, + user_id=user_id, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + narrative_agent=narrative_agent, candidate_stories=candidates, story_meta=story_meta, - decision_source=decision_source, + background_voice=background_voice, + occupation=occupation, memoir_correlation_id=memoir_correlation_id, + fidelity_llm=llm_fidelity, + language=language, + ) + else: + route = single_route + decision_source = ( + route.reason + if route.reason in FALLBACK_NEW_STORY_REASONS + else ("fallback_no_llm" if not llm_route else "single_decide") + ) + target_story_id, existing_for_narrative, decision_source = ( + _resolve_append_target( + session, + route_decision=route.decision, + route_target_story_id=route.target_story_id, + user_id=user_id, + chapter_category=chapter_category, + oral_norm=om_norm, + candidate_stories=candidates, + story_meta=story_meta, + decision_source=decision_source, + memoir_correlation_id=memoir_correlation_id, + ) ) - ) - sid, _ = _execute_narrative_unit( - session, - oral_text=oral_for_memoir, - evidence_text=evidence_text, - evidence=evidence, - evidence_top_k=top_k, - chapter=chapter, - chapter_category=chapter_category, - slot_snippets=slot_snippets, - user_id=user_id, - user_profile=user_profile, - user_birth_year=user_birth_year, - llm=llm, - narrative_agent=narrative_agent, - target_story_id=target_story_id, - existing_for_narrative=existing_for_narrative, - decision_source=decision_source, - 
route_decision=route.decision, - route_type="single", - segment_ids=[str(s.id) for s in category_segments], - category_segments=category_segments, - background_voice=background_voice, - occupation=occupation, - memoir_correlation_id=memoir_correlation_id, - fidelity_llm=llm_fidelity, - language=language, - ) - if sid: - dispatch_ids.add(sid) + sid, _ = _execute_narrative_unit( + session, + oral_text=oral_for_memoir, + evidence_text=evidence_text, + evidence=evidence, + evidence_top_k=top_k, + chapter=chapter, + chapter_category=chapter_category, + slot_snippets=slot_snippets, + user_id=user_id, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + narrative_agent=narrative_agent, + target_story_id=target_story_id, + existing_for_narrative=existing_for_narrative, + decision_source=decision_source, + route_decision=route.decision, + route_type="single", + segment_ids=[str(s.id) for s in category_segments], + category_segments=category_segments, + background_voice=background_voice, + occupation=occupation, + memoir_correlation_id=memoir_correlation_id, + fidelity_llm=llm_fidelity, + language=language, + ) + if sid: + dispatch_ids.add(sid) pipeline_phase_timings["narrative_writes"] = time.perf_counter() - _t0 _t0 = time.perf_counter() - reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id)) - mark_chapter_dirty_sync(session, str(chapter.id)) - session.flush() - refresh_chapter_evidence_snapshot_with_retry_sync(session, str(chapter.id)) + with business_span("memoir.story_pipeline.finalize", chapter_category=chapter_category): + reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id)) + mark_chapter_dirty_sync(session, str(chapter.id)) + session.flush() + refresh_chapter_evidence_snapshot_with_retry_sync(session, str(chapter.id)) pipeline_phase_timings["finalize"] = time.perf_counter() - _t0 image_settings = MemoirImageSettings.from_env() diff --git a/api/app/features/payment/alipay_client.py 
b/api/app/features/payment/alipay_client.py index 8f4b0f7..dad2b0a 100644 --- a/api/app/features/payment/alipay_client.py +++ b/api/app/features/payment/alipay_client.py @@ -4,6 +4,7 @@ from typing import Dict +from app.core.business_telemetry import business_span from app.core.logging import get_logger from app.features.payment.payment_config import AlipayConfig from app.features.payment.payment_exceptions import ( @@ -46,6 +47,15 @@ class AlipayClient: out_trade_no: str, total_amount: int, subject: str, + ) -> PaymentResult: + with business_span("payment.alipay.create_app_order"): + return self._create_app_order_inner(out_trade_no, total_amount, subject) + + def _create_app_order_inner( + self, + out_trade_no: str, + total_amount: int, + subject: str, ) -> PaymentResult: self._ensure_client() try: @@ -100,6 +110,10 @@ class AlipayClient: raise PaymentNotifyError(f"支付宝回调处理失败: {e}") def query_order(self, out_trade_no: str) -> PaymentStatus: + with business_span("payment.alipay.query_order"): + return self._query_order_inner(out_trade_no) + + def _query_order_inner(self, out_trade_no: str) -> PaymentStatus: self._ensure_client() try: result = self._client.api_alipay_trade_query(out_trade_no=out_trade_no) diff --git a/api/app/features/payment/wechat_client.py b/api/app/features/payment/wechat_client.py index 3268628..89b9ec6 100644 --- a/api/app/features/payment/wechat_client.py +++ b/api/app/features/payment/wechat_client.py @@ -7,6 +7,7 @@ import os import time from typing import Dict +from app.core.business_telemetry import business_span from app.core.logging import get_logger from app.features.payment.payment_config import WeChatPayConfig from app.features.payment.payment_exceptions import ( @@ -149,6 +150,15 @@ class WeChatPayClient: out_trade_no: str, total_amount: int, description: str, + ) -> PaymentResult: + with business_span("payment.wechat.create_app_order"): + return self._create_app_order_inner(out_trade_no, total_amount, description) + + def 
_create_app_order_inner( + self, + out_trade_no: str, + total_amount: int, + description: str, ) -> PaymentResult: self._ensure_client() try: @@ -217,6 +227,10 @@ class WeChatPayClient: raise PaymentNotifyError(f"微信支付回调处理失败: {e}") def query_order(self, out_trade_no: str) -> PaymentStatus: + with business_span("payment.wechat.query_order"): + return self._query_order_inner(out_trade_no) + + def _query_order_inner(self, out_trade_no: str) -> PaymentStatus: self._ensure_client() try: code, message = self._client.query(out_trade_no=out_trade_no) diff --git a/api/app/internal_main.py b/api/app/internal_main.py index 179b4fb..55e8354 100644 --- a/api/app/internal_main.py +++ b/api/app/internal_main.py @@ -14,12 +14,18 @@ from app.core.logging import get_logger, setup_logging setup_logging() +from app.core.config import settings +from app.core.telemetry import instrument_fastapi_app, setup_telemetry + +setup_telemetry( + service_name=settings.otel_service_name or "life-echo-internal-api", +) + from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import HTMLResponse from fastapi.staticfiles import StaticFiles -from app.core.config import settings from app.core.errors import register_exception_handlers from app.core.middleware import RequestIdMiddleware from app.features.evaluation import models as _eval_models # noqa: F401 @@ -35,6 +41,8 @@ internal_app = FastAPI( openapi_url="/openapi.json" if settings.internal_eval_enable_docs else None, ) +instrument_fastapi_app(internal_app) + internal_app.add_middleware(RequestIdMiddleware) _origins = [ o.strip() @@ -66,7 +74,7 @@ async def internal_eval_landing():

Life Echo · 内部回归评测 API

这里是 HTTP API(端口由启动命令决定),没有内置网页。 -浏览「回归评测台」请在仓库执行 ./internal-eval.shcd app-eval-web && npm run dev, +浏览「回归评测台」请在仓库执行 ./development.shcd app-eval-web && npm run dev, 在终端里打开 Vite 给出的地址(一般为 http://127.0.0.1:5174/)。

健康检查:/health

{docs_hint} diff --git a/api/app/main.py b/api/app/main.py index a9ae9dd..18a7b65 100644 --- a/api/app/main.py +++ b/api/app/main.py @@ -8,11 +8,17 @@ from app.core.logging import get_logger, setup_logging setup_logging() +from app.core.config import settings +from app.core.telemetry import instrument_fastapi_app, setup_telemetry + +setup_telemetry( + service_name=settings.otel_service_name or "life-echo-api", +) + from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles -from app.core.config import settings from app.core.errors import register_exception_handlers from app.core.middleware import RequestIdMiddleware from app.core.openapi import custom_openapi @@ -46,6 +52,8 @@ app = FastAPI( openapi_url="/openapi.json" if settings.enable_docs else None, ) +instrument_fastapi_app(app) + # OpenAPI 全局增强 app.openapi = lambda: custom_openapi(app) # type: ignore[assignment] diff --git a/api/app/tasks/celery_app.py b/api/app/tasks/celery_app.py index a8a8ad3..62a439c 100644 --- a/api/app/tasks/celery_app.py +++ b/api/app/tasks/celery_app.py @@ -14,11 +14,17 @@ from app.core.logging import get_logger, setup_logging # 与 app.main 一致:先配置 loguru + InterceptHandler,再加载会打日志的依赖 setup_logging() +from app.core.config import settings +from app.core.telemetry import instrument_celery, setup_telemetry + +# Worker 与 API 共用 .env,固定 service.name,勿读 OTEL_SERVICE_NAME(留给主站 / internal) +setup_telemetry(service_name="life-echo-celery-worker") +instrument_celery() + from celery import Celery from celery.signals import task_failure, task_postrun, task_prerun, task_success from app.core.celery_log_context import clear_celery_log_extras, set_celery_log_extras -from app.core.config import settings from app.core.log_events import celery_prerun_extras from app.features.asset import models as _asset_models # noqa: F401 - register Asset from app.features.auth import models as _auth_models # noqa: F401 @@ -123,9 +129,12 @@ def 
_log_task_prerun( **_: object, ) -> None: name = getattr(task, "name", None) or "?" + from app.core.telemetry import current_trace_context + extras = celery_prerun_extras(name, tuple(args or ()), dict(kwargs or {})) if task_id: extras["task_id"] = str(task_id).strip() + extras.update(current_trace_context()) set_celery_log_extras(extras if extras else None) _celery_lifecycle_log.info( "event=celery_task_start task={} task_id={} msg=Celery 任务已开始", diff --git a/api/app/tasks/memoir_tasks.py b/api/app/tasks/memoir_tasks.py index 90f9963..0d6b6da 100644 --- a/api/app/tasks/memoir_tasks.py +++ b/api/app/tasks/memoir_tasks.py @@ -26,6 +26,7 @@ from app.core.chapter_pipeline_lock import ( from app.core.chapter_pipeline_lock import ( release_chapter_pipeline_lock as _release_chapter_lock, ) +from app.core.business_telemetry import business_span from app.core.config import settings from app.core.db import AsyncSessionLocal, get_sync_db from app.core.dependencies import get_embedding_provider @@ -614,7 +615,10 @@ def process_memoir_phase2( }, ) try: - with get_sync_db() as db: + with business_span( + "memoir.phase2", + chapter_category=chapter_category, + ), get_sync_db() as db: user_convs = select(Conversation.id).where( Conversation.user_id == user_id, Conversation.deleted_at.is_(None), @@ -691,9 +695,13 @@ def process_memoir_phase2( affected_chapter_ids: Set[str] = set() lock_t0 = time.perf_counter() - lock_handle = _acquire_chapter_lock( - user_id, chapter_category, ttl_seconds=_chapter_lock_ttl() - ) + with business_span( + "memoir.phase2.lock", + chapter_category=chapter_category, + ): + lock_handle = _acquire_chapter_lock( + user_id, chapter_category, ttl_seconds=_chapter_lock_ttl() + ) lock_elapsed = time.perf_counter() - lock_t0 if lock_handle is None: logger.warning( @@ -746,22 +754,26 @@ def process_memoir_phase2( "relevant_stories": [], } pipeline_t0 = time.perf_counter() - pipeline_result = run_story_pipeline_for_category_batch( - db, - user_id=user_id, + with 
business_span( + "memoir.phase2.story_pipeline", chapter_category=chapter_category, - category_segments=category_segments, - state=state, - user_profile=user_profile, - user_birth_year=user_birth_year, - llm=llm, - background_voice=background_voice, - occupation=user_occupation, - memoir_correlation_id=cid, - llm_fast=llm_fast, - memory_evidence=memory_evidence, - language=user_language, - ) + ): + pipeline_result = run_story_pipeline_for_category_batch( + db, + user_id=user_id, + chapter_category=chapter_category, + category_segments=category_segments, + state=state, + user_profile=user_profile, + user_birth_year=user_birth_year, + llm=llm, + background_voice=background_voice, + occupation=user_occupation, + memoir_correlation_id=cid, + llm_fast=llm_fast, + memory_evidence=memory_evidence, + language=user_language, + ) pipeline_elapsed = time.perf_counter() - pipeline_t0 if pipeline_result.deferred: @@ -939,7 +951,10 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): phase1_t0 = time.perf_counter() try: - with get_sync_db() as db: + with business_span( + "memoir.phase1", + segment_count=len(segment_ids), + ), get_sync_db() as db: user_obj_for_lang = db.get(User, user_id) user_language = ( "en" @@ -986,47 +1001,48 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): }, ) ingest_t0 = time.perf_counter() - ingest_items: list[tuple[str, str, dict | None]] = [] - non_empty_segments: list = [] - for seg in segments: - text = (seg.user_input_text or "").strip() - if not text: - continue - conv_id = getattr(seg, "conversation_id", None) or "" - ln = getattr(seg, "lineage_json", None) - lineage_payload = ln if isinstance(ln, dict) else None - ingest_items.append((conv_id, text, lineage_payload)) - non_empty_segments.append(seg) + with business_span("memoir.phase1.ingest"): + ingest_items: list[tuple[str, str, dict | None]] = [] + non_empty_segments: list = [] + for seg in segments: + text = (seg.user_input_text or "").strip() + 
if not text: + continue + conv_id = getattr(seg, "conversation_id", None) or "" + ln = getattr(seg, "lineage_json", None) + lineage_payload = ln if isinstance(ln, dict) else None + ingest_items.append((conv_id, text, lineage_payload)) + non_empty_segments.append(seg) - ingested_source_ids: list[str] = [] - if ingest_items: - try: - ingested_source_ids = asyncio.run( - _memory_ingest_transcripts_batch( - user_id, - ingest_items, - memoir_correlation_id=memoir_correlation_id, + ingested_source_ids: list[str] = [] + if ingest_items: + try: + ingested_source_ids = asyncio.run( + _memory_ingest_transcripts_batch( + user_id, + ingest_items, + memoir_correlation_id=memoir_correlation_id, + ) ) - ) - for seg, sid in zip( - non_empty_segments, ingested_source_ids, strict=True - ): - logger.info( - "event=memory_transcript_ingested user_id={} task_id={} " - "source_id={} conversation_id={} segment_id={} transcript_chars={}", - user_id, - task_id, - sid, - getattr(seg, "conversation_id", None) or "", - seg.id, - len((seg.user_input_text or "").strip()), + for seg, sid in zip( + non_empty_segments, ingested_source_ids, strict=True + ): + logger.info( + "event=memory_transcript_ingested user_id={} task_id={} " + "source_id={} conversation_id={} segment_id={} transcript_chars={}", + user_id, + task_id, + sid, + getattr(seg, "conversation_id", None) or "", + seg.id, + len((seg.user_input_text or "").strip()), + ) + except Exception as e: + logger.warning( + "Memory batch ingest 失败: {} exc_type={}", + e, + type(e).__name__, ) - except Exception as e: - logger.warning( - "Memory batch ingest 失败: {} exc_type={}", - e, - type(e).__name__, - ) ingest_elapsed = time.perf_counter() - ingest_t0 merge_pipeline_run( memoir_correlation_id, @@ -1050,31 +1066,32 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]): ) prep_t0 = time.perf_counter() - memoir_orchestrator = MemoirOrchestrator() + with business_span("memoir.phase1.prepare_batches"): + memoir_orchestrator = 
MemoirOrchestrator() - def _phase1_chunk_cb(idx: int, total: int) -> None: - merge_pipeline_run( - memoir_correlation_id, - {"phase1": {"detail": {"prepare_batches_chunk": [idx, total]}}}, + def _phase1_chunk_cb(idx: int, total: int) -> None: + merge_pipeline_run( + memoir_correlation_id, + {"phase1": {"detail": {"prepare_batches_chunk": [idx, total]}}}, + ) + + prepared = memoir_orchestrator.prepare_batches( + segments=list(segments), + llm=llm, + llm_fast=llm_fast, + get_or_create_state=lambda: get_or_create_state_sync(user_id, db), + update_slot=lambda stage, slot_name, snippet, seg_ids: update_slot_sync( + user_id, + stage, + slot_name, + snippet, + seg_ids, + db, + memoir_batch=True, + ), + on_phase1_chunk=_phase1_chunk_cb, + language=user_language, ) - - prepared = memoir_orchestrator.prepare_batches( - segments=list(segments), - llm=llm, - llm_fast=llm_fast, - get_or_create_state=lambda: get_or_create_state_sync(user_id, db), - update_slot=lambda stage, slot_name, snippet, seg_ids: update_slot_sync( - user_id, - stage, - slot_name, - snippet, - seg_ids, - db, - memoir_batch=True, - ), - on_phase1_chunk=_phase1_chunk_cb, - language=user_language, - ) prep_elapsed = time.perf_counter() - prep_t0 merge_pipeline_run( memoir_correlation_id, diff --git a/api/app/tasks/memory_compaction_tasks.py b/api/app/tasks/memory_compaction_tasks.py index d906f49..4002cf7 100644 --- a/api/app/tasks/memory_compaction_tasks.py +++ b/api/app/tasks/memory_compaction_tasks.py @@ -9,6 +9,7 @@ from typing import Any from celery import shared_task +from app.core.business_telemetry import business_span from app.core.config import settings from app.core.db import AsyncSessionLocal from app.core.logging import get_logger @@ -49,7 +50,8 @@ def memory_compaction_sweep() -> dict[str, Any]: if not settings.memory_compaction_enabled: return {"skipped": True, "reason": "disabled"} hours = int(settings.memory_compaction_sweep_recent_hours) - user_ids = 
asyncio.run(_list_users_with_recent_chunks_async(hours)) + with business_span("memory.compaction.sweep", hours=hours): + user_ids = asyncio.run(_list_users_with_recent_chunks_async(hours)) ctx_base: dict[str, Any] = {"trigger_source": "beat", "sweep_hours": hours} for uid in user_ids: schedule_memory_compaction_run(uid, dict(ctx_base)) @@ -100,7 +102,8 @@ def memory_compaction_run( return out try: - out = asyncio.run(_run_memory_compaction_async(user_id, ctx)) + with business_span("memory.compaction.run"): + out = asyncio.run(_run_memory_compaction_async(user_id, ctx)) if out.get("new_cursor_ts") and out.get("new_cursor_id") is not None: set_incremental_cursor_pair( diff --git a/api/app/tasks/memory_enrichment_tasks.py b/api/app/tasks/memory_enrichment_tasks.py index 54434ef..2a70136 100644 --- a/api/app/tasks/memory_enrichment_tasks.py +++ b/api/app/tasks/memory_enrichment_tasks.py @@ -11,6 +11,7 @@ from typing import Any, cast from celery import shared_task +from app.core.business_telemetry import business_span from app.core.config import settings from app.core.db import AsyncSessionLocal from app.core.dependencies import get_embedding_provider @@ -166,7 +167,8 @@ def embed_memory_source( status="running", ) try: - result = asyncio.run(_embed_memory_source_async(user_id, source_id)) + with business_span("memory.embed_source"): + result = asyncio.run(_embed_memory_source_async(user_id, source_id)) ms = (time.perf_counter() - t0) * 1000 logger.info( "event=memory_embedding_done user_id={} source_id={} duration_ms={:.1f} status={} vectors_written={} msg=记忆向量化完成", @@ -241,7 +243,8 @@ def enrich_memory_source( status="running", ) try: - asyncio.run(_enrich_memory_source_async(user_id, source_id)) + with business_span("memory.enrich_source"): + asyncio.run(_enrich_memory_source_async(user_id, source_id)) ms = (time.perf_counter() - t0) * 1000 logger.info( "event=memory_enrichment_done user_id={} source_id={} duration_ms={:.1f} " diff --git 
a/api/deploy/observability/grafana/dashboards/life-echo-business.json b/api/deploy/observability/grafana/dashboards/life-echo-business.json new file mode 100644 index 0000000..e14f545 --- /dev/null +++ b/api/deploy/observability/grafana/dashboards/life-echo-business.json @@ -0,0 +1,75 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation))", + "legendFormat": "{{operation}} p95", + "refId": "A" + } + ], + "title": "Business operation duration p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 2, + "targets": [ + { + "expr": "sum(rate(business_operation_duration_milliseconds_count[5m])) by (operation, outcome)", + "legendFormat": "{{operation}} / {{outcome}}", + "refId": "A" + } + ], + "title": "Business operations rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 3, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation)) and on(operation) (operation=~\"conversation\\\\.ws\\\\..*|asr\\\\.transcribe|tts\\\\.synthesize\")", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "WS / ASR / TTS p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 24, "x": 0, 
"y": 16 }, + "id": 4, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation)) and on(operation) (operation=~\"memoir\\\\..*\")", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Memoir pipeline phases p95", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["life-echo", "business"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "title": "Life Echo Business", + "uid": "life-echo-business", + "version": 1 +} diff --git a/api/deploy/observability/grafana/dashboards/life-echo-llm.json b/api/deploy/observability/grafana/dashboards/life-echo-llm.json new file mode 100644 index 0000000..3505ab2 --- /dev/null +++ b/api/deploy/observability/grafana/dashboards/life-echo-llm.json @@ -0,0 +1,79 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, call_type))", + "legendFormat": "{{agent}} / {{call_type}} p95", + "refId": "A" + } + ], + "title": "LLM duration p95 by agent / call_type", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, call_type))", + "legendFormat": "{{call_type}} p50", + "refId": "A" + } + ], + "title": "LLM duration p50 by call_type (json vs chat vs stream)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": 
"Prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "targets": [ + { + "expr": "sum(rate(llm_call_total[5m])) by (outcome, call_type)", + "legendFormat": "{{outcome}} / {{call_type}}", + "refId": "A" + } + ], + "title": "LLM calls by outcome", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 4, + "targets": [ + { + "expr": "sum(rate(llm_tokens_input_total[5m])) by (agent)", + "legendFormat": "in {{agent}}", + "refId": "A" + }, + { + "expr": "sum(rate(llm_tokens_output_total[5m])) by (agent)", + "legendFormat": "out {{agent}}", + "refId": "B" + } + ], + "title": "LLM tokens/min", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": ["life-echo", "llm"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Life Echo LLM", + "uid": "life-echo-llm", + "version": 1 +} diff --git a/api/deploy/observability/grafana/dashboards/life-echo-logs.json b/api/deploy/observability/grafana/dashboards/life-echo-logs.json new file mode 100644 index 0000000..3cd9ddc --- /dev/null +++ b/api/deploy/observability/grafana/dashboards/life-echo-logs.json @@ -0,0 +1,69 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "options": { "showTime": true, "sortOrder": "Descending" }, + "targets": [ + { + "expr": "{compose_service=~\".+\"} |= \"event=llm_json_call\"", + "refId": "A" + } + ], + "title": "LLM JSON calls (event=llm_json_call)", + "type": "logs" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 }, + "id": 2, + "options": { "showTime": true, "sortOrder": "Descending" }, + "targets": [ + { + "expr": "{compose_service=~\".+\"} |= \"event=celery_task_failed\"", + 
"refId": "A" + } + ], + "title": "Celery task failures", + "type": "logs" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 }, + "id": 3, + "options": { "showTime": true, "sortOrder": "Descending" }, + "targets": [ + { + "expr": "{trace_id=~\"$trace_id\"}", + "refId": "A" + } + ], + "title": "Logs by trace_id", + "type": "logs" + } + ], + "schemaVersion": 39, + "tags": ["life-echo", "logs"], + "templating": { + "list": [ + { + "current": { "text": "", "value": "" }, + "label": "trace_id", + "name": "trace_id", + "options": [], + "query": "", + "type": "textbox" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Life Echo Logs", + "uid": "life-echo-logs", + "version": 1 +} diff --git a/api/deploy/observability/grafana/dashboards/life-echo-overview.json b/api/deploy/observability/grafana/dashboards/life-echo-overview.json new file mode 100644 index 0000000..f43e4ba --- /dev/null +++ b/api/deploy/observability/grafana/dashboards/life-echo-overview.json @@ -0,0 +1,154 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 }, + "id": 1, + "options": { "legend": { "displayMode": "list", "placement": "bottom" } }, + "targets": [ + { + "expr": "sum(rate(http_server_request_duration_seconds_count[5m]))", + "legendFormat": "HTTP requests/s", + "refId": "A" + } + ], + "title": "API request rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, 
sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "API latency p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 }, + "id": 3, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, provider))", + "legendFormat": "{{agent}} / {{provider}}", + "refId": "A" + } + ], + "title": "LLM call duration p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 4, + "targets": [ + { + "expr": "sum(rate(llm_call_total[5m])) by (outcome)", + "legendFormat": "{{outcome}}", + "refId": "A" + } + ], + "title": "LLM calls by outcome", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 5, + "options": { "showTime": true, "sortOrder": "Descending" }, + "targets": [ + { + "expr": "{compose_service=~\".+\"} |= \"llm_json_call\"", + "refId": "A" + } + ], + "title": "LLM JSON call logs", + "type": "logs" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 6, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "DB p95", + "refId": "A" + } + ], + "title": "DB client latency p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 
12, "y": 16 }, + "id": 7, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_client_request_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "HTTP client p95", + "refId": "A" + } + ], + "title": "Outbound HTTP latency p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 8, + "targets": [ + { + "expr": "sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count[5m])), 1e-9)", + "legendFormat": "5xx rate", + "refId": "A" + } + ], + "title": "HTTP 5xx error rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "Prometheus" }, + "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 9, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(redis_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "Redis p95", + "refId": "A" + } + ], + "title": "Redis client latency p95", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["life-echo"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Life Echo Overview", + "uid": "life-echo-overview", + "version": 1 +} diff --git a/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml b/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml new file mode 100644 index 0000000..96dcc4f --- /dev/null +++ b/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml @@ -0,0 +1,4 @@ +apiVersion: 1 + +# 本地 dev 占位:不配置真实通知渠道。在 Grafana UI 中可绑定 Slack/Webhook。 +contactPoints: [] diff --git a/api/deploy/observability/grafana/provisioning/alerting/rules.yml 
b/api/deploy/observability/grafana/provisioning/alerting/rules.yml new file mode 100644 index 0000000..5228f33 --- /dev/null +++ b/api/deploy/observability/grafana/provisioning/alerting/rules.yml @@ -0,0 +1,147 @@ +apiVersion: 1 + +groups: + - orgId: 1 + name: life-echo-alerts + folder: Life Echo + interval: 1m + rules: + - uid: life_echo_api_p95_high + title: API latency p95 > 2s + condition: C + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: Prometheus + model: + expr: histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000 + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: reduce + expression: A + reducer: last + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: threshold + expression: B + conditions: + - evaluator: { type: gt, params: [2000] } + operator: { type: and } + reducer: { type: last } + refId: C + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: API p95 latency above 2s for 5 minutes + labels: + severity: warning + + - uid: life_echo_llm_error_rate + title: LLM error rate > 5% + condition: C + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: Prometheus + model: + expr: sum(rate(llm_call_total{outcome="error"}[5m])) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9) + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: reduce + expression: A + reducer: last + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: threshold + expression: B + conditions: + - evaluator: { type: gt, params: [0.05] } + operator: { type: and } + reducer: { type: last } + refId: C + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: LLM call error rate above 5% + labels: + severity: warning + + - uid: life_echo_otel_collector_down + title: OTel Collector scrape down + condition: C + data: + - refId: A + relativeTimeRange: { from: 120, to: 0 } + datasourceUid: 
Prometheus + model: + expr: up{job="otel-collector"} + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: reduce + expression: A + reducer: last + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: threshold + expression: B + conditions: + - evaluator: { type: lt, params: [1] } + operator: { type: and } + reducer: { type: last } + refId: C + noDataState: Alerting + execErrState: Error + for: 2m + annotations: + summary: Prometheus cannot scrape otel-collector + labels: + severity: critical + + - uid: life_echo_celery_task_failed + title: Celery task failures detected + condition: C + data: + - refId: A + relativeTimeRange: { from: 300, to: 0 } + datasourceUid: loki + model: + expr: sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m])) + refId: A + - refId: B + datasourceUid: __expr__ + model: + type: reduce + expression: A + reducer: last + refId: B + - refId: C + datasourceUid: __expr__ + model: + type: threshold + expression: B + conditions: + - evaluator: { type: gt, params: [0] } + operator: { type: and } + reducer: { type: last } + refId: C + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: Celery task failure logs in last 5 minutes + labels: + severity: warning diff --git a/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml b/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..c2f6cff --- /dev/null +++ b/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: Life Echo + orgId: 1 + folder: Life Echo + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/dashboards diff --git a/api/deploy/observability/grafana/provisioning/datasources/datasources.yml b/api/deploy/observability/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..89fd49e --- /dev/null +++ 
b/api/deploy/observability/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,43 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + + - name: Tempo + type: tempo + access: proxy + url: http://tempo:3200 + editable: false + jsonData: + httpMethod: GET + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -1m + spanEndTimeShift: 1m + filterByTraceID: true + filterBySpanID: false + customQuery: true + query: '{container=~".+"} | json | trace_id="$${__trace.traceId}"' + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki:3100 + editable: false + jsonData: + derivedFields: + - datasourceUid: tempo + matcherRegex: '"trace_id":"([a-f0-9]+)"' + name: TraceID + url: "$${__value.raw}" + urlDisplayLabel: View Trace diff --git a/api/deploy/observability/loki-config.yaml b/api/deploy/observability/loki-config.yaml new file mode 100644 index 0000000..4a09ace --- /dev/null +++ b/api/deploy/observability/loki-config.yaml @@ -0,0 +1,32 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h + +ruler: + alertmanager_url: http://localhost:9093 diff --git a/api/deploy/observability/otel-collector-config.yaml b/api/deploy/observability/otel-collector-config.yaml new file mode 100644 index 0000000..d8fcef7 --- /dev/null +++ b/api/deploy/observability/otel-collector-config.yaml @@ -0,0 +1,53 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 
0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 1024 + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + resource: + attributes: + - key: deployment.environment + value: development + action: upsert + +exporters: + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + prometheus: + endpoint: 0.0.0.0:8889 + loki: + endpoint: http://loki:3100/loki/api/v1/push + tls: + insecure: true + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + +service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/tempo] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [loki] diff --git a/api/deploy/observability/prometheus.yml b/api/deploy/observability/prometheus.yml new file mode 100644 index 0000000..ea02974 --- /dev/null +++ b/api/deploy/observability/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] + + - job_name: otel-collector + static_configs: + - targets: ["otel-collector:8889"] diff --git a/api/deploy/observability/promtail-config.yaml b/api/deploy/observability/promtail-config.yaml new file mode 100644 index 0000000..e09c808 --- /dev/null +++ b/api/deploy/observability/promtail-config.yaml @@ -0,0 +1,41 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: container + - source_labels: ["__meta_docker_container_log_stream"] + target_label: stream + - 
source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: compose_service + pipeline_stages: + - regex: + expression: '(?:tid=|trace_id=)(?P<trace_id>[0-9a-f]{12,32})' + - regex: + expression: 'event=(?P<event>[a-zA-Z0-9_.-]+)' + - regex: + expression: 'duration_ms=(?P<duration_ms>[0-9.]+)' + - json: + expressions: + trace_id: trace_id + span_id: span_id + request_id: request_id + event: event + - structured_metadata: + trace_id: + - labels: + request_id: + event: diff --git a/api/deploy/observability/tempo.yaml b/api/deploy/observability/tempo.yaml new file mode 100644 index 0000000..9c0d969 --- /dev/null +++ b/api/deploy/observability/tempo.yaml @@ -0,0 +1,29 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 48h + +storage: + trace: + backend: local + local: + path: /var/tempo/traces + wal: + path: /var/tempo/wal + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 diff --git a/api/development.sh b/api/development.sh index d64c9c9..ceb3576 100755 --- a/api/development.sh +++ b/api/development.sh @@ -25,10 +25,19 @@ API_PORT="${API_PORT:-8000}" CELERY_POOL="${CELERY_POOL:-solo}" SKIP_INSTALL="${SKIP_INSTALL:-0}" SKIP_INFRA="${SKIP_INFRA:-0}" +# 可观测性:空=若 .env 中 OTEL_ENABLED=true 则启动 compose;0=不启;1=强制启动 +START_OBSERVABILITY="${START_OBSERVABILITY:-}" SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}" -# 由 internal-eval.sh 开启:在 main:app + Celery 之外再启 internal_main(:8001) 与 app-eval-web -LIFE_ECHO_WITH_INTERNAL_EVAL="${LIFE_ECHO_WITH_INTERNAL_EVAL:-0}" +# 与 docker-compose.observability.yml / .env.example 默认宿主机端口一致 +OTEL_GRPC_HOST_PORT="${OTEL_GRPC_HOST_PORT:-48317}" +GRAFANA_HOST_PORT="${GRAFANA_HOST_PORT:-48300}" +PROMETHEUS_HOST_PORT="${PROMETHEUS_HOST_PORT:-49090}" + +# 默认一并启动 internal_main + app-eval-web(设 0 可仅主站) +LIFE_ECHO_WITH_INTERNAL_EVAL="${LIFE_ECHO_WITH_INTERNAL_EVAL:-1}" 
+# 自动用 Google Chrome 打开 Grafana / 评测 Web(勿用 Vite --open,避免落到 Safari) +OPEN_OBSERVABILITY_UI="${OPEN_OBSERVABILITY_UI:-1}" # 若 :8000 已由其他 development 实例占用,仅附加 :8001 + 前端(需自备同一份 Celery/主站) EVAL_ATTACH_ONLY="${EVAL_ATTACH_ONLY:-0}" INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}" @@ -43,6 +52,9 @@ INTERNAL_EVAL_PID="" EVAL_WEB_PID="" CLEANED_UP=0 INFRA_STARTED=0 +OBSERVABILITY_STARTED=0 +OBSERVABILITY_BROWSER_SCHEDULED=0 +EVAL_WEB_BROWSER_SCHEDULED=0 print_header() { echo -e "\n${BLUE}========================================${NC}" @@ -62,6 +74,64 @@ print_err() { echo -e "${RED}✗ $1${NC}" } +open_browser_url() { + local url="$1" + if command -v open >/dev/null 2>&1 && [[ "$(uname -s)" == "Darwin" ]]; then + if open -a "Google Chrome" "${url}" >/dev/null 2>&1; then + return 0 + fi + print_warn "未找到 Google Chrome,请手动打开: ${url}" + return 1 + fi + if command -v google-chrome >/dev/null 2>&1; then + google-chrome "${url}" >/dev/null 2>&1 & + return 0 + fi + if command -v chromium-browser >/dev/null 2>&1; then + chromium-browser "${url}" >/dev/null 2>&1 & + return 0 + fi + if command -v chromium >/dev/null 2>&1; then + chromium "${url}" >/dev/null 2>&1 & + return 0 + fi + print_warn "未找到 Chrome/Chromium,请手动打开: ${url}" + return 1 +} + +schedule_observability_browser() { + if [[ "${OPEN_OBSERVABILITY_UI}" != "1" ]] || [[ "${OBSERVABILITY_BROWSER_SCHEDULED}" == "1" ]]; then + return 0 + fi + OBSERVABILITY_BROWSER_SCHEDULED=1 + local grafana_url="http://127.0.0.1:${GRAFANA_HOST_PORT}" + ( + sleep 4 + open_browser_url "${grafana_url}" + ) & + print_ok "将自动打开 Grafana: ${grafana_url}" +} + +schedule_eval_web_browser() { + if [[ "${OPEN_EVAL_WEB}" != "1" ]] || [[ "${EVAL_WEB_BROWSER_SCHEDULED:-0}" == "1" ]]; then + return 0 + fi + EVAL_WEB_BROWSER_SCHEDULED=1 + local eval_url="http://127.0.0.1:${EVAL_WEB_PORT}/" + ( + local i=0 + while (( i < 30 )); do + if is_port_listening "${EVAL_WEB_PORT}"; then + break + fi + sleep 1 + i=$((i + 1)) + done + open_browser_url "${eval_url}" + 
) & + print_ok "将自动打开评测 Web (Chrome): ${eval_url}" +} + is_pid_alive() { local pid="$1" [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null @@ -147,11 +217,9 @@ cleanup() { fi if [[ "${INFRA_STARTED}" == "1" ]]; then - print_warn "正在停止 PostgreSQL / Redis 容器..." - ( - cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop - ) >/dev/null 2>&1 || true - print_ok "PostgreSQL/Redis 容器已停止" + print_warn "正在停止 Docker 基础设施..." + docker_compose_cmd stop >/dev/null 2>&1 || true + print_ok "Docker 容器已停止" fi } @@ -163,12 +231,107 @@ require_cmd() { fi } +read_env_bool() { + local key="$1" + local default="${2:-0}" + local line val + + if [[ -n "${!key:-}" ]]; then + val="${!key}" + case "${val}" in + 1 | true | TRUE | yes | YES | on | ON) return 0 ;; + *) return 1 ;; + esac + fi + + if [[ ! -f "${ROOT_DIR}/.env" ]]; then + [[ "${default}" == "1" ]] + return + fi + + line="$(grep -E "^${key}=" "${ROOT_DIR}/.env" | tail -1 | cut -d= -f2- | tr -d '\r' | sed 's/^"//;s/"$//')" + case "${line}" in + 1 | true | TRUE | yes | YES | on | ON) return 0 ;; + *) [[ -z "${line}" && "${default}" == "1" ]] ;; + esac +} + +should_start_observability() { + case "${START_OBSERVABILITY}" in + 0 | false | FALSE | no | NO | off | OFF) return 1 ;; + 1 | true | TRUE | yes | YES | on | ON) return 0 ;; + esac + read_env_bool "OTEL_ENABLED" "0" +} + +docker_compose_cmd() { + # 统一 compose -f,兼容 macOS 自带 bash 3.2(勿用 local -n / local arr=(-f …)) + if should_start_observability; then + (cd "${ROOT_DIR}" && docker compose \ + -f docker-compose.dev.yml \ + -f docker-compose.observability.yml \ + "$@") + return + fi + if [[ "$1" == "up" ]]; then + (cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml "$@" --remove-orphans) + else + (cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml "$@") + fi +} + +wait_otel_collector_ready() { + local retries="${1:-30}" + local i=0 + while (( i < retries )); do + if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then + return 0 + fi + sleep 1 + i=$((i + 1)) + done + 
return 1 +} + +check_otel_collector_ready() { + if ! read_env_bool "OTEL_ENABLED" "0"; then + return 0 + fi + if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then + print_ok "OTel Collector 端口已监听 (:${OTEL_GRPC_HOST_PORT})" + return 0 + fi + if [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then + print_warn "等待 OTel Collector 端口 :${OTEL_GRPC_HOST_PORT} …" + if wait_otel_collector_ready 45; then + print_ok "OTel Collector 端口已监听 (:${OTEL_GRPC_HOST_PORT})" + return 0 + fi + fi + print_warn "OTEL_ENABLED=true 但 :${OTEL_GRPC_HOST_PORT} 未监听" + print_warn "请确认本次启动日志中有「启动可观测性栈」;或手动执行:" + print_warn " docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d" + print_warn "不需要可观测性时在 .env.development 设 OTEL_ENABLED=false" + return 1 +} + start_infra() { - print_header "启动 PostgreSQL 和 Redis" - cd "${ROOT_DIR}" - docker compose -f docker-compose.dev.yml up -d + if should_start_observability; then + print_header "启动 PostgreSQL、Redis 与可观测性栈 (OTel / Grafana LGTM)" + OBSERVABILITY_STARTED=1 + else + print_header "启动 PostgreSQL 和 Redis" + fi + docker_compose_cmd up -d INFRA_STARTED=1 print_ok "PostgreSQL 127.0.0.1:48291,Redis 127.0.0.1:48307(见 docker-compose.dev.yml / .env.example)" + if [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then + print_ok "Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} (admin/admin)" + print_ok "Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}" + print_ok "OTLP gRPC 127.0.0.1:${OTEL_GRPC_HOST_PORT}(应用读 .env 中 OTEL_*,无需 export)" + print_ok "详见 docs/observability.md" + schedule_observability_browser + fi print_ok "基础设施已就绪" } @@ -467,19 +630,15 @@ start_eval_web() { exit 1 fi - local vite_extra=() - if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then - vite_extra+=(--open) - fi - ( cd "${EVAL_WEB_DIR}" VITE_EVAL_API_KEY="${api_key}" \ VITE_EVAL_PROXY_TARGET="http://127.0.0.1:${INTERNAL_EVAL_PORT}" \ - npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" "${vite_extra[@]}" + npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" ) & 
EVAL_WEB_PID=$! print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/" + schedule_eval_web_browser } start_internal_eval_http() { @@ -493,7 +652,8 @@ start_internal_eval_http() { exit 1 fi - "${UVICORN_BIN}" app.internal_main:internal_app --reload \ + OTEL_SERVICE_NAME="${INTERNAL_EVAL_OTEL_SERVICE_NAME:-life-echo-internal-api}" \ + "${UVICORN_BIN}" app.internal_main:internal_app --reload \ --reload-exclude 'alembic/**' \ --reload-exclude 'alembic.ini' \ --host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" & @@ -547,7 +707,7 @@ start_services() { fi if [[ "${skip_main}" == "1" ]] && [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" != "1" ]]; then - print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测;请使用 ./internal-eval.sh 或导出 LIFE_ECHO_WITH_INTERNAL_EVAL=1" + print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测;请设置 LIFE_ECHO_WITH_INTERNAL_EVAL=1" exit 1 fi @@ -601,14 +761,27 @@ start_services() { echo "主站文档: http://localhost:${API_PORT}/docs" echo "健康检查: http://localhost:${API_PORT}/health" fi + if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then + echo "评测 Web UI: http://127.0.0.1:${EVAL_WEB_PORT}/" + echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health" + fi + if read_env_bool "OTEL_ENABLED" "0"; then + echo "可观测性: Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} | Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}" + if is_port_listening "${GRAFANA_HOST_PORT}"; then + schedule_observability_browser + fi + fi + if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]] && is_pid_alive "${EVAL_WEB_PID}"; then + schedule_eval_web_browser + fi echo "按 Ctrl+C 停止所有进程" } main() { if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then - print_header "Life Echo 开发环境 + 内部评测(主站 + :${INTERNAL_EVAL_PORT} + Eval Web)" + print_header "Life Echo 开发环境(主站 + 内部评测 + 可观测性)" else - print_header "Life Echo 开发环境一键启动" + print_header "Life Echo 开发环境一键启动(无内部评测)" fi require_cmd "uv" @@ -618,16 +791,22 @@ main() { trap cleanup EXIT INT TERM + ensure_venv + # 必须在 start_infra 
之前同步,否则 should_start_observability 读不到 .env.development 里的 OTEL_ENABLED + ensure_dotenv_from_development + if [[ "${SKIP_INFRA}" != "1" ]]; then start_infra wait_postgres_ready || true else print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)" + if should_start_observability; then + print_warn "SKIP_INFRA=1 未自动启动 observability;若需 LGTM 请手动 docker compose up observability overlay" + fi fi - ensure_venv - ensure_dotenv_from_development check_env_file + check_otel_collector_ready || true wait_host_infra_ready run_migrations start_services diff --git a/api/docker-compose.observability.yml b/api/docker-compose.observability.yml new file mode 100644 index 0000000..a251fb2 --- /dev/null +++ b/api/docker-compose.observability.yml @@ -0,0 +1,122 @@ +# 本地可观测性栈 overlay(与 docker-compose.dev.yml 一起使用) +# +# docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d +# +# 宿主机端口刻意避开 3000/9090/4317 等常用口,与 .env.example 中 OTEL_* / *_HOST_PORT 对齐。 +# Grafana: http://127.0.0.1:${GRAFANA_HOST_PORT:-48300} (admin / admin) +# OTLP: 127.0.0.1:${OTEL_GRPC_HOST_PORT:-48317} (gRPC) :${OTEL_HTTP_HOST_PORT:-48318} (HTTP) + +services: + otel-collector: + image: otel/opentelemetry-collector-contrib:0.120.0 + container_name: life-echo-otel-collector + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./deploy/observability/otel-collector-config.yaml:/etc/otelcol/config.yaml:ro + ports: + - "127.0.0.1:${OTEL_GRPC_HOST_PORT:-48317}:4317" + - "127.0.0.1:${OTEL_HTTP_HOST_PORT:-48318}:4318" + - "127.0.0.1:${OTEL_COLLECTOR_HEALTH_HOST_PORT:-48333}:13133" + depends_on: + tempo: + condition: service_started + loki: + condition: service_started + networks: + - default + restart: unless-stopped + + tempo: + image: grafana/tempo:2.7.2 + container_name: life-echo-tempo + command: ["-config.file=/etc/tempo.yaml"] + volumes: + - ./deploy/observability/tempo.yaml:/etc/tempo.yaml:ro + - tempo_data:/var/tempo + ports: + - "127.0.0.1:${TEMPO_HTTP_HOST_PORT:-43200}:3200" + networks: + - 
default + restart: unless-stopped + + loki: + image: grafana/loki:3.4.2 + container_name: life-echo-loki + command: ["-config.file=/etc/loki/loki-config.yaml"] + volumes: + - ./deploy/observability/loki-config.yaml:/etc/loki/loki-config.yaml:ro + - loki_data:/loki + ports: + - "127.0.0.1:${LOKI_HTTP_HOST_PORT:-43100}:3100" + networks: + - default + restart: unless-stopped + + promtail: + image: grafana/promtail:3.4.2 + container_name: life-echo-promtail + command: ["-config.file=/etc/promtail/config.yml"] + volumes: + - ./deploy/observability/promtail-config.yaml:/etc/promtail/config.yml:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + depends_on: + loki: + condition: service_started + networks: + - default + restart: unless-stopped + + prometheus: + image: prom/prometheus:v3.2.1 + container_name: life-echo-prometheus + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --web.enable-lifecycle + volumes: + - ./deploy/observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + ports: + - "127.0.0.1:${PROMETHEUS_HOST_PORT:-49090}:9090" + depends_on: + otel-collector: + condition: service_started + networks: + - default + restart: unless-stopped + + grafana: + image: grafana/grafana:11.5.2 + container_name: life-echo-grafana + environment: + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD: admin + GF_USERS_ALLOW_SIGN_UP: "false" + GF_AUTH_ANONYMOUS_ENABLED: "false" + volumes: + - ./deploy/observability/grafana/provisioning:/etc/grafana/provisioning:ro + - ./deploy/observability/grafana/dashboards:/etc/grafana/dashboards:ro + - grafana_data:/var/lib/grafana + ports: + - "127.0.0.1:${GRAFANA_HOST_PORT:-48300}:3000" + depends_on: + prometheus: + condition: service_started + tempo: + condition: service_started + loki: + condition: service_started + networks: + - default + restart: unless-stopped + +volumes: + tempo_data: + loki_data: + prometheus_data: + grafana_data: + 
+networks: + default: + name: life-echo-dev + external: true diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md index 7b16fce..d6824e1 100644 --- a/api/docs/internal-eval.md +++ b/api/docs/internal-eval.md @@ -4,29 +4,30 @@ ## 启动 -**推荐一条命令**:`internal-eval.sh` 实际调用 `development.sh`,在同一进程树里启动主站 `main:app`(**8000**)、**一份** Celery、内部评测 `internal_app`(默认 **8001**)以及 `app-eval-web`(默认 **5174**)。不需要再并行执行两份启动脚本。 +**推荐一条命令**:`./development.sh` 默认启动主站(**8000**)、Celery、内部评测 API(默认 **7999**)、评测 Web(**5174**);`.env` 中 `OTEL_ENABLED=true` 时并起 Grafana 且自动打开浏览器。`./internal-eval.sh` 仅为兼容转发。 -| | 单一命令 `./internal-eval.sh` | +| | `./development.sh`(默认) | |---|-------------------------------| -| HTTP | 主站 **8000** + internal **8001** | -| Celery | 仅 **一个** worker(与主站共用队列) | -| 前端 | 默认启动 `app-eval-web`(`START_EVAL_WEB=0` 可关) | +| HTTP | 主站 **8000** + internal **7999** | +| Celery | 仅 **一个** worker | +| 评测 UI | `open` → http://127.0.0.1:5174/(`OPEN_EVAL_WEB=0` 可关) | +| 可观测性 | Grafana :48300(`OPEN_OBSERVABILITY_UI=0` 可关) | 若 **主站 + Celery 已在其他终端** 由 `./development.sh` 跑起来了,只在同一台机器上多开评测 HTTP 与前端、且 **不再起第二份 Worker**: ```bash cd api # 确保 .env.development / .env 含 INTERNAL_EVAL_API_KEY;:8000 已被主站监听 -SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./internal-eval.sh +SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./development.sh ``` 兼容旧写法:`SKIP_CELERY=1` 会映射为 `EVAL_ATTACH_ONLY=1`(仍要求 **8000 已在监听**)。 -仅主业务、不要评测台时照旧:`./development.sh`(不设置 `LIFE_ECHO_WITH_INTERNAL_EVAL`)。 +仅主业务、不要评测台:`LIFE_ECHO_WITH_INTERNAL_EVAL=0 ./development.sh`。 -若你只需要 **8001**、刻意不启主站 **8000**,请用下文「手动 uvicorn」配合既有 Celery,不要用 `./internal-eval.sh`(一键脚本会顺带拉起主站)。 +若只需 **7999**、不启主站 **8000**,见下文「手动 uvicorn」;不要用一键脚本。 -**默认会起 `app-eval-web`,并用 Vite `--open` 尝试打开浏览器**(`http://127.0.0.1:5174/`)。不要前端时设 `START_EVAL_WEB=0`;只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。 +**默认会起 `app-eval-web`,并用系统浏览器打开评测台**(`http://127.0.0.1:5174/`,与 Grafana 同为 `open`)。不要前端时设 `START_EVAL_WEB=0`;只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。 数据库与主服务共用;需配置环境变量后启动专用进程: 
diff --git a/api/docs/observability.md b/api/docs/observability.md new file mode 100644 index 0000000..4c67c29 --- /dev/null +++ b/api/docs/observability.md @@ -0,0 +1,139 @@ +# 可观测性(OpenTelemetry + Grafana LGTM) + +本地开发使用 **OpenTelemetry** 采集 traces / metrics / logs,经 **OTel Collector** 写入 **Tempo / Prometheus / Loki**,在 **Grafana** 统一查看。 + +配置写在 **`.env`**(由 `.env.development` 经 `development.sh` 同步,或从 [`.env.example`](../.env.example) 复制),`app.core.config.settings` 启动时自动读取,**无需**在 shell 里 `export OTEL_*`。 + +## 启动栈 + +在 `api/` 目录: + +```bash +# 1. 数据库与 Redis +docker compose -f docker-compose.dev.yml up -d + +# 2. 可观测性(需已存在 life-echo-dev 网络;端口来自 .env 或下列默认) +docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d +``` + +| 服务 | 默认宿主机地址 | compose 变量 | +|------|----------------|--------------| +| Grafana | http://127.0.0.1:48300 (admin / admin) | `GRAFANA_HOST_PORT` | +| Prometheus | http://127.0.0.1:49090 | `PROMETHEUS_HOST_PORT` | +| OTLP gRPC | http://127.0.0.1:48317 | `OTEL_GRPC_HOST_PORT` | +| OTLP HTTP | http://127.0.0.1:48318 | `OTEL_HTTP_HOST_PORT` | +| Collector health | http://127.0.0.1:48333 | `OTEL_COLLECTOR_HEALTH_HOST_PORT` | + +容器**内部**仍使用标准端口(如 Collector `4317`);仅宿主机映射使用 `48xxx` 段,与 Postgres `48291`、Redis `48307` 同一风格。 + +预置 Dashboard(**Life Echo** 文件夹): + +| Dashboard | 用途 | +|-----------|------| +| Life Echo Overview | API RED、LLM 摘要、依赖延迟 | +| Life Echo LLM | `call_type` / agent / tokens、outcome 分布 | +| Life Echo Business | 回忆录阶段、WS/ASR/TTS、Celery 业务 span | +| Life Echo Logs | Loki 按 `event` / `trace_id` 检索 | + +## 启用应用导出 + +在 [`.env.example`](../.env.example) 已给出本地默认值,同步到 `.env` 即可,例如: + +```env +OTEL_ENABLED=true +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317 +OTEL_TRACES_SAMPLER=always_on +OTEL_SERVICE_NAME=life-echo-api +``` + +推荐与全栈一并启动(`./development.sh` 在 `.env` 里 `OTEL_ENABLED=true` 时会起 observability compose,并默认打开 Grafana 浏览器标签): + +```bash +cd api +./development.sh +``` + +仅手动起 API(不自动开 Grafana): + +```bash 
+cd api +uv run uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +Celery worker 同一 `.env`;未设 `OTEL_SERVICE_NAME` 时 worker 默认为 `life-echo-celery-worker`。 + +若 API 跑在 **Docker compose** 里,应设 `OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317`(服务名 + 容器内端口),而不是 `localhost`。 + +不需要可观测性时:`.env` 中 `OTEL_ENABLED=false`(或未启动 observability compose)。 + +## 采集内容 + +| 类型 | 来源 | +|------|------| +| HTTP | FastAPI 自动 instrumentation(`/health` 排除) | +| DB | SQLAlchemy | +| Redis | redis-py | +| 出站 HTTP | httpx(DeepSeek 等) | +| Celery | 任务 span + W3C trace 传播 | +| LLM | `llm_telemetry`(LangChain / DeepSeek / `llm_call`)+ `llm.call.*` / `llm.tokens.*` metrics | +| 业务 | `business_telemetry`:WS 回合、回忆录 phase、ASR/TTS、支付等子 span | +| 日志 | loguru patcher 注入 `trace_id`;Promtail 解析 `event` / `tid=`;可选 `LOG_JSON_FILE` JSON sink | + +日志字段:`request_id`、`trace_id`、`span_id`。HTTP 由中间件 `contextualize`;**Celery / 后台**由 loguru **patcher** 从当前 OTel span 合并,无需经过 HTTP 中间件。 + +## 常用排查 + +1. **API 慢**:Grafana → Tempo,按 `service.name=life-echo-api` 查 trace;看 DB / httpx / `llm.*` / `conversation.ws.*` 子 span。 +2. **LLM 慢**:**Life Echo LLM** Dashboard,或 Loki:`{compose_service=~".+"} |= "event=llm_json_call"`。 +3. **回忆录卡阶段**:Tempo 搜 `memoir.phase1` / `memoir.phase2` / `memoir.story_pipeline.*`;**Life Echo Business** Dashboard 看 `business_operation_duration_milliseconds`。 +4. **日志 ↔ Trace**:在 Tempo 复制 `trace_id` → Loki:`{compose_service=~".+"} |= "tid=<前12位>"`(控制台短格式);Promtail 将 `trace_id` 写入 **structured metadata**(非高基数 label)。 +5. **Celery 堆积**:Tempo 过滤 `life-echo-celery-worker`;Loki `event=celery_task_failed`。 +6. 
**无数据**:`.env` 中 `OTEL_ENABLED=true`、`OTEL_EXPORTER_OTLP_ENDPOINT` 端口与 `OTEL_GRPC_HOST_PORT` 一致;Collector health `http://127.0.0.1:48333`;Prometheus target `otel-collector:8889` UP。 + +### LOG_JSON_FILE 与 Promtail + +- **默认**:loguru 人类可读行 → Docker stdout → Promtail **regex** 提取 `tid` / `event` / `duration_ms`;`trace_id` 进 structured metadata,**不作为 Loki label**。 +- **可选**:`LOG_JSON_FILE=/path/to/app.jsonl` 开启 JSON sink(`serialize=true`),便于与 OTLP logs 或自建采集对齐;与 Promtail 可**并存**(同一容器 stdout 仍走 regex)。 + +## 采样(staging/prod 第二阶段) + +| 环境 | 建议 | +|------|------| +| development | `OTEL_TRACES_SAMPLER=always_on` | +| staging/production | `OTEL_TRACES_SAMPLER=parentbased_traceidratio`,`OTEL_TRACES_SAMPLER_ARG=0.1` | + +关闭 telemetry:`OTEL_ENABLED=false`,无 exporter 开销。 + +## Prometheus 指标名(OTel → Prometheus) + +| OTel 仪器 | Prometheus 系列(histogram) | +|-----------|------------------------------| +| `llm.call.duration` (ms) | `llm_call_duration_milliseconds_bucket` | +| `business.operation.duration` (ms) | `business_operation_duration_milliseconds_bucket` | +| `http.server.request.duration` (s) | `http_server_request_duration_seconds_bucket` | +| `db.client.operation.duration` (s) | `db_client_operation_duration_seconds_bucket` | +| `http.client.request.duration` (s) | `http_client_request_duration_seconds_bucket` | + +Counter 示例:`llm_call_total`、`llm_tokens_input_total`。 + +校验脚本(需 observability compose + 有流量): + +```bash +chmod +x scripts/verify_observability_metrics.sh +./scripts/verify_observability_metrics.sh +``` + +## 验收清单(本地 E2E) + +- [ ] `OTEL_ENABLED=true`,启动 compose + API + Celery worker +- [ ] 跑一条 WS 对话;Tempo 可见 `conversation.ws.process_turn`、`llm.chat_invoke` +- [ ] 触发 memoir phase1;Tempo 可见 `memoir.phase1.*`、`memoir.story_pipeline.*` +- [ ] Prometheus:`call_type` label 存在;真实 LLM 后 `llm_tokens_input_total` > 0 +- [ ] Loki:`|= "tid="` 能查到同次请求日志 +- [ ] `./scripts/verify_observability_metrics.sh` 通过 +- [ ] Grafana Alerting 页无 provisioning 错误(通知渠道可空) + +## 配置目录 + +- 
[`deploy/observability/`](../deploy/observability/):Collector、Tempo、Loki、Prometheus、Grafana provisioning +- [`docker-compose.observability.yml`](../docker-compose.observability.yml):本地 overlay diff --git a/api/docs/部署指南.md b/api/docs/部署指南.md index dda4786..75f3a1d 100644 --- a/api/docs/部署指南.md +++ b/api/docs/部署指南.md @@ -305,11 +305,13 @@ sudo journalctl -u life-echo-api -f ### 8. 监控与告警 +本地开发与预发可观测性栈(OpenTelemetry + Grafana LGTM)见 **[可观测性指南](observability.md)**。staging/production 全量接入为第二阶段(`docker-compose` profile)。 + #### 8.1 配置日志监控 建议使用以下工具: +- **Grafana + Loki + Tempo + Prometheus**(仓库内 `deploy/observability/`,推荐) - ELK Stack (Elasticsearch + Logstash + Kibana) -- Grafana + Loki - 云服务商的日志服务 #### 8.2 配置性能监控 diff --git a/api/internal-eval.sh b/api/internal-eval.sh index 32130e2..3557df7 100755 --- a/api/internal-eval.sh +++ b/api/internal-eval.sh @@ -1,22 +1,18 @@ #!/usr/bin/env bash -# 在 development.sh 全栈之上附加 internal_main(默认 :8001)与 app-eval-web。 -# 只需一条命令,无需再并行跑两份脚本;共用同一份 Postgres/Redis/Celery(本脚本只起一个 Worker)。 +# 已合并入 development.sh(默认启动评测台 + 自动打开 Grafana / 评测 UI)。 +# 本脚本保留为兼容入口,行为与 ./development.sh 相同。 # -# 用法:cd api && ./internal-eval.sh +# 若主站已在其他终端占用 :8000,仅附加评测 HTTP + 前端(不再起 Celery): +# SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./development.sh # -# 若主站已在其他终端由 ./development.sh 占用 :8000,仅多开评测 HTTP + 前端(不再起第二份 Celery): -# SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./internal-eval.sh -# -# 兼容旧环境变量:SKIP_CELERY=1 等价于 EVAL_ATTACH_ONLY=1(仍要求 :8000 已有监听)。 -# -# 其他可选变量与 development.sh 一致,例如: -# SKIP_INFRA=1 SKIP_INSTALL=1 START_EVAL_WEB=0 OPEN_EVAL_WEB=0 -# INTERNAL_EVAL_PORT EVAL_WEB_PORT INTERNAL_EVAL_API_KEY +# 兼容旧变量:SKIP_CELERY=1 等价于 EVAL_ATTACH_ONLY=1 set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +echo -e "\033[1;33m⚠ internal-eval.sh 已并入 development.sh,正在转发…\033[0m" >&2 + export LIFE_ECHO_WITH_INTERNAL_EVAL=1 if [[ "${SKIP_CELERY:-}" == "1" ]]; then diff --git a/api/pyproject.toml b/api/pyproject.toml index 
cf056f0..95d2598 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -17,6 +17,15 @@ dependencies = [ "langchain-openai>=1.1.11", "loguru>=0.7.3", "openai>=2.26.0", + "opentelemetry-api>=1.42.0", + "opentelemetry-exporter-otlp-proto-grpc>=1.42.0", + "opentelemetry-instrumentation-celery>=0.63b0", + "opentelemetry-instrumentation-fastapi>=0.63b0", + "opentelemetry-instrumentation-httpx>=0.63b0", + "opentelemetry-instrumentation-logging>=0.63b0", + "opentelemetry-instrumentation-redis>=0.63b0", + "opentelemetry-instrumentation-sqlalchemy>=0.63b0", + "opentelemetry-sdk>=1.42.0", "pgvector>=0.4.2", "pillow>=12.1.1", "psycopg[binary]>=3.2.0", diff --git a/api/scripts/verify_observability_metrics.sh b/api/scripts/verify_observability_metrics.sh new file mode 100755 index 0000000..1ea22c5 --- /dev/null +++ b/api/scripts/verify_observability_metrics.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# 校验本地 Prometheus 是否已暴露 OTel 导出指标(需 observability compose 运行中)。 +set -euo pipefail + +PROM_URL="${PROMETHEUS_URL:-http://127.0.0.1:49090}" +QUERY_ENDPOINT="${PROM_URL}/api/v1/query" + +check_metric() { + local name="$1" + local result + result="$(curl -sf "${QUERY_ENDPOINT}?query=${name}" | python3 -c " +import json, sys +data = json.load(sys.stdin) +r = data.get('data', {}).get('result', []) +print('ok' if r else 'missing') +")" + if [[ "${result}" != "ok" ]]; then + echo "MISSING: ${name}" + return 1 + fi + echo "OK: ${name}" +} + +echo "Checking Prometheus at ${PROM_URL} ..." +fail=0 +for m in \ + "llm_call_duration_milliseconds_bucket" \ + "llm_call_total" \ + "business_operation_duration_milliseconds_bucket" \ + "http_server_request_duration_seconds_bucket" +do + check_metric "${m}" || fail=1 +done + +if [[ "${fail}" -ne 0 ]]; then + echo "" + echo "Some metrics missing. Ensure OTEL_ENABLED=true, API/worker running, and traffic generated." + exit 1 +fi +echo "All required metrics present." 
diff --git a/api/tests/core/test_business_telemetry.py b/api/tests/core/test_business_telemetry.py new file mode 100644 index 0000000..9ed81da --- /dev/null +++ b/api/tests/core/test_business_telemetry.py @@ -0,0 +1,64 @@ +"""Business telemetry helpers (no real Collector required).""" + +from __future__ import annotations + +import pytest +from opentelemetry import trace + +from app.core.business_telemetry import business_span +from app.core.config import settings + + +class TestBusinessSpan: + def test_disabled_is_noop(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", False) + with business_span("memoir.phase1", user_id="u1") as span: + assert span == trace.INVALID_SPAN + + def test_filters_high_cardinality_attrs(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", True) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + + exporter = InMemorySpanExporter() + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(exporter)) + monkeypatch.setattr( + "app.core.business_telemetry.get_tracer", + lambda _name: provider.get_tracer("test"), + ) + + with business_span( + "memoir.phase2", + user_id="user-123", + chapter_category="childhood", + ): + pass + + spans = exporter.get_finished_spans() + assert spans + attrs = dict(spans[0].attributes or {}) + assert attrs.get("business.chapter_category") == "childhood" + assert "business.user_id" not in attrs + + def test_enabled_yields_span(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", True) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + 
+ exporter = InMemorySpanExporter() + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(exporter)) + monkeypatch.setattr( + "app.core.business_telemetry.get_tracer", + lambda _name: provider.get_tracer("test"), + ) + + with business_span("conversation.ws.process_turn") as span: + assert span.is_recording() diff --git a/api/tests/core/test_llm_telemetry.py b/api/tests/core/test_llm_telemetry.py new file mode 100644 index 0000000..05a3658 --- /dev/null +++ b/api/tests/core/test_llm_telemetry.py @@ -0,0 +1,118 @@ +"""LLM telemetry helpers (no real Collector required).""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import patch + +import pytest + +from app.core import llm_telemetry +from app.core.config import settings + + +class TestExtractTokenUsage: + def test_usage_metadata_object(self) -> None: + msg = SimpleNamespace(usage_metadata=SimpleNamespace(input_tokens=10, output_tokens=4)) + assert llm_telemetry.extract_token_usage(msg) == (10, 4) + + def test_response_metadata_dict(self) -> None: + msg = SimpleNamespace( + usage_metadata=None, + response_metadata={"token_usage": {"prompt_tokens": 3, "completion_tokens": 7}}, + ) + assert llm_telemetry.extract_token_usage(msg) == (3, 7) + + def test_missing_usage_returns_zero(self) -> None: + assert llm_telemetry.extract_token_usage(SimpleNamespace()) == (0, 0) + + +class TestOtelDisabledNoOp: + def test_record_llm_completion_disabled(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", False) + llm_telemetry.record_llm_completion( + agent="Test", + provider="mock", + model="m", + duration_ms=1.0, + input_tokens=5, + output_tokens=2, + ) + + def test_langchain_invoke_span_disabled(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", False) + with llm_telemetry.langchain_invoke_span( + agent="Test", + provider="mock", + model="m", + call_type="chat", + ) as 
ctx: + ctx["response"] = SimpleNamespace( + usage_metadata=SimpleNamespace(input_tokens=1, output_tokens=1) + ) + assert ctx["outcome"] == "ok" + + +class TestLangchainInvokeSpanRecordsTokens: + def test_records_completion_with_tokens(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", True) + recorded: list[dict] = [] + + def _capture(**kwargs: object) -> None: + recorded.append(kwargs) + + with patch.object(llm_telemetry, "record_llm_completion", side_effect=_capture): + with llm_telemetry.langchain_invoke_span( + agent="TestAgent", + provider="mock", + model="m1", + call_type="chat", + ) as ctx: + ctx["response"] = SimpleNamespace( + usage_metadata=SimpleNamespace(input_tokens=11, output_tokens=5) + ) + + assert len(recorded) == 1 + assert recorded[0]["input_tokens"] == 11 + assert recorded[0]["output_tokens"] == 5 + assert recorded[0]["agent"] == "TestAgent" + + +class TestObserveAinvokeExtraAttributes: + @pytest.mark.asyncio + async def test_response_latency_on_span(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(settings, "otel_enabled", True) + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + from opentelemetry import trace + + exporter = InMemorySpanExporter() + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(exporter)) + monkeypatch.setattr( + "app.core.llm_telemetry.get_tracer", + lambda _name: provider.get_tracer("test"), + ) + + class _LLM: + async def ainvoke(self, messages: list) -> SimpleNamespace: + return SimpleNamespace( + usage_metadata=SimpleNamespace(input_tokens=1, output_tokens=1) + ) + + await llm_telemetry.observe_ainvoke( + _LLM(), + [], + agent="Test", + provider="mock", + model="m", + extra_span_attributes={"llm.custom": "x"}, + ) + spans = exporter.get_finished_spans() + assert 
spans + attrs = dict(spans[-1].attributes or {}) + assert "llm.response_latency_ms" in attrs + assert attrs.get("llm.custom") == "x" diff --git a/api/uv.lock b/api/uv.lock index 38daf95..1645a62 100644 --- a/api/uv.lock +++ b/api/uv.lock @@ -88,6 +88,15 @@ dependencies = [ { name = "langchain-openai" }, { name = "loguru" }, { name = "openai" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-instrumentation-celery" }, + { name = "opentelemetry-instrumentation-fastapi" }, + { name = "opentelemetry-instrumentation-httpx" }, + { name = "opentelemetry-instrumentation-logging" }, + { name = "opentelemetry-instrumentation-redis" }, + { name = "opentelemetry-instrumentation-sqlalchemy" }, + { name = "opentelemetry-sdk" }, { name = "pgvector" }, { name = "pillow" }, { name = "psycopg", extra = ["binary"] }, @@ -129,6 +138,15 @@ requires-dist = [ { name = "langchain-openai", specifier = ">=1.1.11" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "openai", specifier = ">=2.26.0" }, + { name = "opentelemetry-api", specifier = ">=1.42.0" }, + { name = "opentelemetry-exporter-otlp-proto-grpc", specifier = ">=1.42.0" }, + { name = "opentelemetry-instrumentation-celery", specifier = ">=0.63b0" }, + { name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.63b0" }, + { name = "opentelemetry-instrumentation-httpx", specifier = ">=0.63b0" }, + { name = "opentelemetry-instrumentation-logging", specifier = ">=0.63b0" }, + { name = "opentelemetry-instrumentation-redis", specifier = ">=0.63b0" }, + { name = "opentelemetry-instrumentation-sqlalchemy", specifier = ">=0.63b0" }, + { name = "opentelemetry-sdk", specifier = ">=1.42.0" }, { name = "pgvector", specifier = ">=0.4.2" }, { name = "pillow", specifier = ">=12.1.1" }, { name = "psycopg", extras = ["binary"], specifier = ">=3.2.0" }, @@ -156,6 +174,15 @@ dev = [ { name = "ruff", specifier = ">=0.15.6" }, ] +[[package]] +name = "asgiref" +version = 
"3.11.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/63/40/f03da1264ae8f7cfdbf9146542e5e7e8100a4c66ab48e791df9a03d3f6c0/asgiref-3.11.1.tar.gz", hash = "sha256:5f184dc43b7e763efe848065441eac62229c9f7b0475f41f80e207a114eda4ce", size = 38550, upload-time = "2026-02-03T13:30:14.33Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/0a/a72d10ed65068e115044937873362e6e32fab1b7dce0046aeb224682c989/asgiref-3.11.1-py3-none-any.whl", hash = "sha256:e8667a091e69529631969fd45dc268fa79b99c92c5fcdda727757e52146ec133", size = 24345, upload-time = "2026-02-03T13:30:13.039Z" }, +] + [[package]] name = "av" version = "16.1.0" @@ -920,6 +947,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, ] +[[package]] +name = "googleapis-common-protos" +version = "1.75.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/c8/f439cffde755cffa462bfbb156278fa6f9d09119719af9814b858fd4f81f/googleapis_common_protos-1.75.0.tar.gz", hash = "sha256:53a062ff3c32552fbd62c11fe23768b78e4ddf0494d5e5fd97d3f4689c75fbbd", size = 151035, upload-time = "2026-05-07T08:04:49.423Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/c8/e2645aa8ed02fd4c7a2f59d68783b65b1f3cbdfe39a6308e156509d1fee8/googleapis_common_protos-1.75.0-py3-none-any.whl", hash = "sha256:961ed60399c457ceb0ee8f285a84c870aabc9c6a832b9d37bb281b5bebde43ed", size = 300631, upload-time = "2026-05-07T08:03:30.345Z" }, +] + [[package]] name = "greenlet" version = "3.3.2" @@ -954,6 +993,37 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/29/4b/45d90626aef8e65336bed690106d1382f7a43665e2249017e9527df8823b/greenlet-3.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c04c5e06ec3e022cbfe2cd4a846e1d4e50087444f875ff6d2c2ad8445495cf1a", size = 237086, upload-time = "2026-02-20T20:20:45.786Z" }, ] +[[package]] +name = "grpcio" +version = "1.80.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2f/3a/7c3c25789e3f069e581dc342e03613c5b1cb012c4e8c7d9d5cf960a75856/grpcio-1.80.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:e9e408fc016dffd20661f0126c53d8a31c2821b5c13c5d67a0f5ed5de93319ad", size = 6017243, upload-time = "2026-03-30T08:47:40.075Z" }, + { url = "https://files.pythonhosted.org/packages/04/19/21a9806eb8240e174fd1ab0cd5b9aa948bb0e05c2f2f55f9d5d7405e6d08/grpcio-1.80.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:92d787312e613754d4d8b9ca6d3297e69994a7912a32fa38c4c4e01c272974b0", size = 12010840, upload-time = "2026-03-30T08:47:43.11Z" }, + { url = "https://files.pythonhosted.org/packages/18/3a/23347d35f76f639e807fb7a36fad3068aed100996849a33809591f26eca6/grpcio-1.80.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac393b58aa16991a2f1144ec578084d544038c12242da3a215966b512904d0f", size = 6567644, upload-time = "2026-03-30T08:47:46.806Z" }, + { url = "https://files.pythonhosted.org/packages/ff/40/96e07ecb604a6a67ae6ab151e3e35b132875d98bc68ec65f3e5ab3e781d7/grpcio-1.80.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:68e5851ac4b9afe07e7f84483803ad167852570d65326b34d54ca560bfa53fb6", size = 7277830, 
upload-time = "2026-03-30T08:47:49.643Z" }, + { url = "https://files.pythonhosted.org/packages/9b/e2/da1506ecea1f34a5e365964644b35edef53803052b763ca214ba3870c856/grpcio-1.80.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:873ff5d17d68992ef6605330127425d2fc4e77e612fa3c3e0ed4e668685e3140", size = 6783216, upload-time = "2026-03-30T08:47:52.817Z" }, + { url = "https://files.pythonhosted.org/packages/44/83/3b20ff58d0c3b7f6caaa3af9a4174d4023701df40a3f39f7f1c8e7c48f9d/grpcio-1.80.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2bea16af2750fd0a899bf1abd9022244418b55d1f37da2202249ba4ba673838d", size = 7385866, upload-time = "2026-03-30T08:47:55.687Z" }, + { url = "https://files.pythonhosted.org/packages/47/45/55c507599c5520416de5eefecc927d6a0d7af55e91cfffb2e410607e5744/grpcio-1.80.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba0db34f7e1d803a878284cd70e4c63cb6ae2510ba51937bf8f45ba997cefcf7", size = 8391602, upload-time = "2026-03-30T08:47:58.303Z" }, + { url = "https://files.pythonhosted.org/packages/10/bb/dd06f4c24c01db9cf11341b547d0a016b2c90ed7dbbb086a5710df7dd1d7/grpcio-1.80.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8eb613f02d34721f1acf3626dfdb3545bd3c8505b0e52bf8b5710a28d02e8aa7", size = 7826752, upload-time = "2026-03-30T08:48:01.311Z" }, + { url = "https://files.pythonhosted.org/packages/f9/1e/9d67992ba23371fd63d4527096eb8c6b76d74d52b500df992a3343fd7251/grpcio-1.80.0-cp313-cp313-win32.whl", hash = "sha256:93b6f823810720912fd131f561f91f5fed0fda372b6b7028a2681b8194d5d294", size = 4142310, upload-time = "2026-03-30T08:48:04.594Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e6/283326a27da9e2c3038bc93eeea36fb118ce0b2d03922a9cda6688f53c5b/grpcio-1.80.0-cp313-cp313-win_amd64.whl", hash = "sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50", size = 4882833, upload-time = "2026-03-30T08:48:07.363Z" }, + { url = 
"https://files.pythonhosted.org/packages/c5/6d/e65307ce20f5a09244ba9e9d8476e99fb039de7154f37fb85f26978b59c3/grpcio-1.80.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:3d4147a97c8344d065d01bbf8b6acec2cf86fb0400d40696c8bdad34a64ffc0e", size = 6017376, upload-time = "2026-03-30T08:48:10.005Z" }, + { url = "https://files.pythonhosted.org/packages/69/10/9cef5d9650c72625a699c549940f0abb3c4bfdb5ed45a5ce431f92f31806/grpcio-1.80.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8e11f167935b3eb089ac9038e1a063e6d7dbe995c0bb4a661e614583352e76f", size = 12018133, upload-time = "2026-03-30T08:48:12.927Z" }, + { url = "https://files.pythonhosted.org/packages/04/82/983aabaad82ba26113caceeb9091706a0696b25da004fe3defb5b346e15b/grpcio-1.80.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f14b618fc30de822681ee986cfdcc2d9327229dc4c98aed16896761cacd468b9", size = 6574748, upload-time = "2026-03-30T08:48:16.386Z" }, + { url = "https://files.pythonhosted.org/packages/07/d7/031666ef155aa0bf399ed7e19439656c38bbd143779ae0861b038ce82abd/grpcio-1.80.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4ed39fbdcf9b87370f6e8df4e39ca7b38b3e5e9d1b0013c7b6be9639d6578d14", size = 7277711, upload-time = "2026-03-30T08:48:19.627Z" }, + { url = "https://files.pythonhosted.org/packages/e8/43/f437a78f7f4f1d311804189e8f11fb311a01049b2e08557c1068d470cb2e/grpcio-1.80.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2dcc70e9f0ba987526e8e8603a610fb4f460e42899e74e7a518bf3c68fe1bf05", size = 6785372, upload-time = "2026-03-30T08:48:22.373Z" }, + { url = "https://files.pythonhosted.org/packages/93/3d/f6558e9c6296cb4227faa5c43c54a34c68d32654b829f53288313d16a86e/grpcio-1.80.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:448c884b668b868562b1bda833c5fce6272d26e1926ec46747cda05741d302c1", size = 7395268, upload-time = "2026-03-30T08:48:25.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/21/0fdd77e84720b08843c371a2efa6f2e19dbebf56adc72df73d891f5506f0/grpcio-1.80.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a1dc80fe55685b4a543555e6eef975303b36c8db1023b1599b094b92aa77965f", size = 8392000, upload-time = "2026-03-30T08:48:28.974Z" }, + { url = "https://files.pythonhosted.org/packages/f5/68/67f4947ed55d2e69f2cc199ab9fd85e0a0034d813bbeef84df6d2ba4d4b7/grpcio-1.80.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:31b9ac4ad1aa28ffee5503821fafd09e4da0a261ce1c1281c6c8da0423c83b6e", size = 7828477, upload-time = "2026-03-30T08:48:32.054Z" }, + { url = "https://files.pythonhosted.org/packages/44/b6/8d4096691b2e385e8271911a0de4f35f0a6c7d05aff7098e296c3de86939/grpcio-1.80.0-cp314-cp314-win32.whl", hash = "sha256:367ce30ba67d05e0592470428f0ec1c31714cab9ef19b8f2e37be1f4c7d32fae", size = 4218563, upload-time = "2026-03-30T08:48:34.538Z" }, + { url = "https://files.pythonhosted.org/packages/e5/8c/bbe6baf2557262834f2070cf668515fa308b2d38a4bbf771f8f7872a7036/grpcio-1.80.0-cp314-cp314-win_amd64.whl", hash = "sha256:3b01e1f5464c583d2f567b2e46ff0d516ef979978f72091fd81f5ab7fa6e2e7f", size = 5019457, upload-time = "2026-03-30T08:48:37.308Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -1523,6 +1593,218 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/2e/3f73e8ca53718952222cacd0cf7eecc9db439d020f0c1fe7ae717e4e199a/openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f", size = 1136409, upload-time = "2026-03-05T23:17:34.072Z" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.42.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/ca/25288069c399be6769159d9fb7b1190b603537d82aad2fa2746a0cc2c8c6/opentelemetry_api-1.42.0.tar.gz", hash = "sha256:ea84c893ad177791d138e0349d6ceebd8d3bf006440900400ce220008dafc372", 
size = 72300, upload-time = "2026-05-19T09:46:29.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/0b/be5daf659b82b525338fde371dfcfab09b606a19bb5620c37076964710ec/opentelemetry_api-1.42.0-py3-none-any.whl", hash = "sha256:558d88f88192a973579910ef6f2c13db47a268d5ec2e53e83e50e74a39a02922", size = 61310, upload-time = "2026-05-19T09:46:06.561Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.42.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/a9/1496f27ecdfc7d504eac80f5e16474ee9d47cd08cda1f2917b58cf1c299c/opentelemetry_exporter_otlp_proto_common-1.42.0.tar.gz", hash = "sha256:c7a1a61f3a4c4dfa83127353edb1c75b873289d9ee42379db46eb835963b72e3", size = 21430, upload-time = "2026-05-19T09:46:32.838Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/7b/1542eb6e3d941a7dd93648d485b7c8495bc2841a2bb7dd5f394f370cf607/opentelemetry_exporter_otlp_proto_common-1.42.0-py3-none-any.whl", hash = "sha256:92de67f096c9200770f16fbdb63b96fb6061d604b4bc266726d8355caeb864e8", size = 17328, upload-time = "2026-05-19T09:46:11.291Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.42.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/01/6a/63812e4f67d3658b21e94bc890b67296951f3aa8f6950fdf735f763500e5/opentelemetry_exporter_otlp_proto_grpc-1.42.0.tar.gz", hash = "sha256:75eac4e9d0bd69bea8199d75dfeb585cce05a9baa8215d1f7aad9e3583bf5ef9", size = 27136, upload-time = "2026-05-19T09:46:33.594Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4d/e9/308c4c03b536005a1443bee0d9f06de38aad8b94f59f58ac688ead7a8cf9/opentelemetry_exporter_otlp_proto_grpc-1.42.0-py3-none-any.whl", hash = "sha256:5d6d1691586f2e656fd14187f2f2f5fa06e94834e1acdce71edcbbe35730b31d", size = 19614, upload-time = "2026-05-19T09:46:12.331Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/2d/322d464f4105966fb8555f871a84f43e821ce9aaf64ecae9586e9691c6a2/opentelemetry_instrumentation-0.63b0.tar.gz", hash = "sha256:80a339ef030a8d0fd1962375a9801dd31954e5063d74c00bc3d4e6581f43bab1", size = 41083, upload-time = "2026-05-19T09:47:06.194Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/45/a38e74da3f1b5c82c97289da91d978caa04321877f0ab170fc620a0753f2/opentelemetry_instrumentation-0.63b0-py3-none-any.whl", hash = "sha256:984b18763b652a881ac5a596098d89923f74cf53a658c2dde660387e018147ca", size = 35574, upload-time = "2026-05-19T09:46:07.257Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-asgi" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asgiref" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "opentelemetry-util-http" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/ba/dd540189230d211898ccc4df899874bc0d84f5c54a1e07a13a2bde606a57/opentelemetry_instrumentation_asgi-0.63b0.tar.gz", hash = "sha256:e201eed7616f7da0840adf8ab8c5ea64db7ab19b920373b38983e2bac8d3645d", size = 26154, upload-time = "2026-05-19T09:47:10.023Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/ad/4f/caa793347febb9dae45f3d03d8bac04bf0752170a19c53016a0a91a214a0/opentelemetry_instrumentation_asgi-0.63b0-py3-none-any.whl", hash = "sha256:4e89555c110677226b9ca1734eda248360916bccf0ebadf8db8baf0015c9efca", size = 15907, upload-time = "2026-05-19T09:46:13.675Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-celery" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/67/a7/82e696152b65178d13f9ee2241cadb72b7b908603c692a8519f0c0295e35/opentelemetry_instrumentation_celery-0.63b0.tar.gz", hash = "sha256:c02371fe46073b57ecf1287d833bfe00c02f79ba600549752ae7bd4fbcd8f06a", size = 15520, upload-time = "2026-05-19T09:47:15.445Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/9a/03f9168c0a07a0441129a9a426405f6b0efc3804f4c0c1e200f0a3a7c568/opentelemetry_instrumentation_celery-0.63b0-py3-none-any.whl", hash = "sha256:732d3a0b883cb777d8e0213ebbfa49fe8a8ee987ea49a6d45ec1351cb09e8b93", size = 13170, upload-time = "2026-05-19T09:46:21.78Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-fastapi" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-instrumentation-asgi" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "opentelemetry-util-http" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/73/6e44cd21b17d4affd41a621804421d476940b1dab352254b1a9c08a08df6/opentelemetry_instrumentation_fastapi-0.63b0.tar.gz", hash = "sha256:5117df842d0ce47e1fb9eb3c2ad2a7594bd139b129de9f3fa1ce5b28e970c046", size = 25387, upload-time = "2026-05-19T09:47:20.726Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/78/2d/f869b45eddbb7332cce7a863a4d1e758d58a9c890db6dbf0fe6aedd3eda1/opentelemetry_instrumentation_fastapi-0.63b0-py3-none-any.whl", hash = "sha256:ed43d2358164df83d811a8d69a7578cad3ab66fde4db027296c1ee20f703e3f0", size = 12797, upload-time = "2026-05-19T09:46:28.885Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-httpx" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "opentelemetry-util-http" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d3/22/21c1d745b82eb28c41c4f0635be1d7b9d9d77bbe0b6c718d7e7d7fcc6f20/opentelemetry_instrumentation_httpx-0.63b0.tar.gz", hash = "sha256:aafb9e336be48b4c0c19ae1f003621e23d75b3560797d42baa656dcc3a555266", size = 23556, upload-time = "2026-05-19T09:47:22.997Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/f1/0c9ba71e48129390a9db60ec92ab0149cf97d1a983c11a77e1a04ec5dc7b/opentelemetry_instrumentation_httpx-0.63b0-py3-none-any.whl", hash = "sha256:e4359d317a3313fa8607b7ab4c47088a428856349363c754013fbd595f60fb23", size = 16338, upload-time = "2026-05-19T09:46:32.015Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-logging" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/11/a8/e5ae9bf71babc3589252d826ffd212c004582a42699ab24245ecf8004f4a/opentelemetry_instrumentation_logging-0.63b0.tar.gz", hash = "sha256:c4b875cdd712e01e2a0b904d9c9248f4f03a8f41a8acd64000984359841b98d8", size = 19824, upload-time = "2026-05-19T09:47:24.771Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d7/1c/f61d7aa67ecf4ecc04bba5a276f6dc67f0803f6d0a61eceb585f3bb2fcb9/opentelemetry_instrumentation_logging-0.63b0-py3-none-any.whl", hash = "sha256:8fe17ed310de42683dc585f1bf6af6ccaa3192c997c431c57177e15bee6885f5", size = 15992, upload-time = "2026-05-19T09:46:35.553Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-redis" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9c/bc/98f3355db9dd0f2885f168a2544739783349df7ed495cba2c06dddb3c183/opentelemetry_instrumentation_redis-0.63b0.tar.gz", hash = "sha256:a369c140eb7cdd8b59192255eb4e361755dc5353be5aa0ff25a2cbf964fb993c", size = 16713, upload-time = "2026-05-19T09:47:32.264Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/6a/b9955b1e659793e9e5e787e90d6b203b17fcf2b88811794fe1efa584ee94/opentelemetry_instrumentation_redis-0.63b0-py3-none-any.whl", hash = "sha256:61e1c18f1f87d2ebec1ed69dd187e233c4482ae528e02929150ef2699d15120a", size = 14538, upload-time = "2026-05-19T09:46:46.242Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-sqlalchemy" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/72/0def937531c0e7a423af06cbffaf235caea7af0275082c6bca13a25701ec/opentelemetry_instrumentation_sqlalchemy-0.63b0.tar.gz", hash = "sha256:b854ac9fd5707a8f79dc9b252cdec6873217e5a6e7e5fdb43dca6858a26342cb", size = 18007, upload-time = "2026-05-19T09:47:34.518Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/91/98/eb7430900f683fd6cec4745736bc69ca7260442b6b20ad05194abe97a187/opentelemetry_instrumentation_sqlalchemy-0.63b0-py3-none-any.whl", hash = "sha256:6a31bf004798f8eabb74f75e1d90cf081c7d470933867be6a5c8c985925ddb3e", size = 14410, upload-time = "2026-05-19T09:46:49.328Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.42.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/2c/7c56a19498b46da4c54dc4e765c95d17f8fec2ba86bec1817b41ae635360/opentelemetry_proto-1.42.0.tar.gz", hash = "sha256:5d56a9067b631ea931a135d7b86428ae99649f591d4db69b9fc8c8e0465fce65", size = 45841, upload-time = "2026-05-19T09:46:42.058Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/ad/ff5f619a04cddb4936ead0dd8f590c5b373c5b4b9f2eef555e9d3d951ccb/opentelemetry_proto-1.42.0-py3-none-any.whl", hash = "sha256:2c0716a37e5c12efef37cbd01906d649b7fb85c85ac687518d0bd28527c6498e", size = 71779, upload-time = "2026-05-19T09:46:24.536Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.42.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/c9/dabaaf1c754a57b82b5a36aeca3806d92c1877ccfb12a697b65f88bf027c/opentelemetry_sdk-1.42.0.tar.gz", hash = "sha256:2479e462cc69357825c2c847ce4a601bc1b17e1279aa7f80d3490f0ae614d0e5", size = 239072, upload-time = "2026-05-19T09:46:42.992Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7b/7d/16bf9a9d42ebbd1679e0cda018d57a0712f3b6f6f1e7ae5ef3c7ee5927c0/opentelemetry_sdk-1.42.0-py3-none-any.whl", hash = "sha256:ec4a4f69e15220b3d7bccd93217aac745682bb6435b9381f7bb44cb7e07b4f2b", size = 170879, upload-time = "2026-05-19T09:46:25.871Z" }, +] + +[[package]] +name = 
"opentelemetry-semantic-conventions" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/20/f8/be4625838aae098c2f9fbdc062a1b3128ebb9e799b891b654ee8cad94897/opentelemetry_semantic_conventions-0.63b0.tar.gz", hash = "sha256:cfea295264654fa324fcef24aa56fb1836fdc0da27db128645dc6aa76115cc6c", size = 148333, upload-time = "2026-05-19T09:46:44.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/6f/8d0ce225b8fdbb72c97cf4130107d861eafcb3d8e5c3f5891e8556177316/opentelemetry_semantic_conventions-0.63b0-py3-none-any.whl", hash = "sha256:1f3962732b04f43e4fef28173c9a3615b8847b4b2d6386fdc085361b29875ab9", size = 203712, upload-time = "2026-05-19T09:46:27.569Z" }, +] + +[[package]] +name = "opentelemetry-util-http" +version = "0.63b0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/cf/0b53c5fe1113fb01e23c6c88b66d8289f979e61cece444576b286a3415fd/opentelemetry_util_http-0.63b0.tar.gz", hash = "sha256:401ddd686cd943ef801b9384b0722b904250f6bf3906951ce4f27bb6b63b04a3", size = 11101, upload-time = "2026-05-19T09:47:42.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/8c/7fd6f06139cca88a6341bebf2b01f3e97bb8fd8d12e7d3ad3d2ad88b8c49/opentelemetry_util_http-0.63b0-py3-none-any.whl", hash = "sha256:80536361b6348e57503cdae8c1b1be79574d14c30e879367336c5a076fd4f673", size = 8209, upload-time = "2026-05-19T09:47:01.712Z" }, +] + [[package]] name = "orjson" version = "3.11.7" @@ -1693,17 +1975,17 @@ wheels = [ [[package]] name = "protobuf" -version = "7.34.0" +version = "6.33.6" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f2/00/04a2ab36b70a52d0356852979e08b44edde0435f2115dc66e25f2100f3ab/protobuf-7.34.0.tar.gz", hash = 
"sha256:3871a3df67c710aaf7bb8d214cc997342e63ceebd940c8c7fc65c9b3d697591a", size = 454726, upload-time = "2026-02-27T00:30:25.421Z" } +sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/13/c4/6322ab5c8f279c4c358bc14eb8aefc0550b97222a39f04eb3c1af7a830fa/protobuf-7.34.0-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:8e329966799f2c271d5e05e236459fe1cbfdb8755aaa3b0914fa60947ddea408", size = 429248, upload-time = "2026-02-27T00:30:14.924Z" }, - { url = "https://files.pythonhosted.org/packages/45/99/b029bbbc61e8937545da5b79aa405ab2d9cf307a728f8c9459ad60d7a481/protobuf-7.34.0-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:9d7a5005fb96f3c1e64f397f91500b0eb371b28da81296ae73a6b08a5b76cdd6", size = 325753, upload-time = "2026-02-27T00:30:17.247Z" }, - { url = "https://files.pythonhosted.org/packages/cc/79/09f02671eb75b251c5550a1c48e7b3d4b0623efd7c95a15a50f6f9fc1e2e/protobuf-7.34.0-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:4a72a8ec94e7a9f7ef7fe818ed26d073305f347f8b3b5ba31e22f81fd85fca02", size = 340200, upload-time = "2026-02-27T00:30:18.672Z" }, - { url = "https://files.pythonhosted.org/packages/b5/57/89727baef7578897af5ed166735ceb315819f1c184da8c3441271dbcfde7/protobuf-7.34.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:964cf977e07f479c0697964e83deda72bcbc75c3badab506fb061b352d991b01", size = 324268, upload-time = "2026-02-27T00:30:20.088Z" }, - { url = "https://files.pythonhosted.org/packages/1f/3e/38ff2ddee5cc946f575c9d8cc822e34bde205cf61acf8099ad88ef19d7d2/protobuf-7.34.0-cp310-abi3-win32.whl", hash = "sha256:f791ec509707a1d91bd02e07df157e75e4fb9fbdad12a81b7396201ec244e2e3", size = 426628, upload-time = "2026-02-27T00:30:21.555Z" }, - { url = 
"https://files.pythonhosted.org/packages/cb/71/7c32eaf34a61a1bae1b62a2ac4ffe09b8d1bb0cf93ad505f42040023db89/protobuf-7.34.0-cp310-abi3-win_amd64.whl", hash = "sha256:9f9079f1dde4e32342ecbd1c118d76367090d4aaa19da78230c38101c5b3dd40", size = 437901, upload-time = "2026-02-27T00:30:22.836Z" }, - { url = "https://files.pythonhosted.org/packages/a4/e7/14dc9366696dcb53a413449881743426ed289d687bcf3d5aee4726c32ebb/protobuf-7.34.0-py3-none-any.whl", hash = "sha256:e3b914dd77fa33fa06ab2baa97937746ab25695f389869afdf03e81f34e45dc7", size = 170716, upload-time = "2026-02-27T00:30:23.994Z" }, + { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" }, + { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" }, + { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" }, + { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" }, + { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = 
"sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" }, + { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" }, + { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" }, ] [[package]] @@ -2869,6 +3151,59 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, ] +[[package]] +name = "wrapt" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/7a/d936840735c828b38d26a854e85d5338894cda544cb7a85a9d5b8b9c4df7/wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b", size = 61259, upload-time = "2026-03-06T02:53:41.922Z" }, + { url = "https://files.pythonhosted.org/packages/5e/88/9a9b9a90ac8ca11c2fdb6a286cb3a1fc7dd774c00ed70929a6434f6bc634/wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e", size = 61851, upload-time = "2026-03-06T02:52:48.672Z" }, + { url = "https://files.pythonhosted.org/packages/03/a9/5b7d6a16fd6533fed2756900fc8fc923f678179aea62ada6d65c92718c00/wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb", size = 121446, upload-time = "2026-03-06T02:54:14.013Z" }, + { url = "https://files.pythonhosted.org/packages/45/bb/34c443690c847835cfe9f892be78c533d4f32366ad2888972c094a897e39/wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca", size = 123056, upload-time = "2026-03-06T02:54:10.829Z" }, + { url = "https://files.pythonhosted.org/packages/93/b9/ff205f391cb708f67f41ea148545f2b53ff543a7ac293b30d178af4d2271/wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267", size = 117359, upload-time = "2026-03-06T02:53:03.623Z" }, + { url = "https://files.pythonhosted.org/packages/1f/3d/1ea04d7747825119c3c9a5e0874a40b33594ada92e5649347c457d982805/wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f", size = 121479, upload-time = "2026-03-06T02:53:45.844Z" }, + { url = "https://files.pythonhosted.org/packages/78/cc/ee3a011920c7a023b25e8df26f306b2484a531ab84ca5c96260a73de76c0/wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8", size = 116271, upload-time = "2026-03-06T02:54:46.356Z" }, + { url = "https://files.pythonhosted.org/packages/98/fd/e5ff7ded41b76d802cf1191288473e850d24ba2e39a6ec540f21ae3b57cb/wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413", size = 120573, upload-time = "2026-03-06T02:52:50.163Z" }, + { url = "https://files.pythonhosted.org/packages/47/c5/242cae3b5b080cd09bacef0591691ba1879739050cc7c801ff35c8886b66/wrapt-2.1.2-cp313-cp313-win32.whl", hash = "sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6", size = 58205, upload-time = "2026-03-06T02:53:47.494Z" }, + { url = "https://files.pythonhosted.org/packages/12/69/c358c61e7a50f290958809b3c61ebe8b3838ea3e070d7aac9814f95a0528/wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1", size = 60452, upload-time = "2026-03-06T02:53:30.038Z" }, + { url = "https://files.pythonhosted.org/packages/8e/66/c8a6fcfe321295fd8c0ab1bd685b5a01462a9b3aa2f597254462fc2bc975/wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf", size = 58842, upload-time = "2026-03-06T02:52:52.114Z" }, + { url = "https://files.pythonhosted.org/packages/da/55/9c7052c349106e0b3f17ae8db4b23a691a963c334de7f9dbd60f8f74a831/wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b", size = 63075, upload-time = "2026-03-06T02:53:19.108Z" }, + { url = "https://files.pythonhosted.org/packages/09/a8/ce7b4006f7218248dd71b7b2b732d0710845a0e49213b18faef64811ffef/wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18", size = 63719, upload-time = "2026-03-06T02:54:33.452Z" }, + { url = "https://files.pythonhosted.org/packages/e4/e5/2ca472e80b9e2b7a17f106bb8f9df1db11e62101652ce210f66935c6af67/wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d", size = 152643, upload-time = "2026-03-06T02:52:42.721Z" }, + { url 
= "https://files.pythonhosted.org/packages/36/42/30f0f2cefca9d9cbf6835f544d825064570203c3e70aa873d8ae12e23791/wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015", size = 158805, upload-time = "2026-03-06T02:54:25.441Z" }, + { url = "https://files.pythonhosted.org/packages/bb/67/d08672f801f604889dcf58f1a0b424fe3808860ede9e03affc1876b295af/wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92", size = 145990, upload-time = "2026-03-06T02:53:57.456Z" }, + { url = "https://files.pythonhosted.org/packages/68/a7/fd371b02e73babec1de6ade596e8cd9691051058cfdadbfd62a5898f3295/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf", size = 155670, upload-time = "2026-03-06T02:54:55.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/2d/9fe0095dfdb621009f40117dcebf41d7396c2c22dca6eac779f4c007b86c/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67", size = 144357, upload-time = "2026-03-06T02:54:24.092Z" }, + { url = "https://files.pythonhosted.org/packages/0e/b6/ec7b4a254abbe4cde9fa15c5d2cca4518f6b07d0f1b77d4ee9655e30280e/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a", size = 150269, upload-time = "2026-03-06T02:53:31.268Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6b/2fabe8ebf148f4ee3c782aae86a795cc68ffe7d432ef550f234025ce0cfa/wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd", size = 59894, upload-time = "2026-03-06T02:54:15.391Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/fb/9ba66fc2dedc936de5f8073c0217b5d4484e966d87723415cc8262c5d9c2/wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f", size = 63197, upload-time = "2026-03-06T02:54:41.943Z" }, + { url = "https://files.pythonhosted.org/packages/c0/1c/012d7423c95d0e337117723eb8ecf73c622ce15a97847e84cf3f8f26cd7e/wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = "sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679", size = 60363, upload-time = "2026-03-06T02:54:48.093Z" }, + { url = "https://files.pythonhosted.org/packages/39/25/e7ea0b417db02bb796182a5316398a75792cd9a22528783d868755e1f669/wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9", size = 61418, upload-time = "2026-03-06T02:53:55.706Z" }, + { url = "https://files.pythonhosted.org/packages/ec/0f/fa539e2f6a770249907757eaeb9a5ff4deb41c026f8466c1c6d799088a9b/wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9", size = 61914, upload-time = "2026-03-06T02:52:53.37Z" }, + { url = "https://files.pythonhosted.org/packages/53/37/02af1867f5b1441aaeda9c82deed061b7cd1372572ddcd717f6df90b5e93/wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e", size = 120417, upload-time = "2026-03-06T02:54:30.74Z" }, + { url = "https://files.pythonhosted.org/packages/c3/b7/0138a6238c8ba7476c77cf786a807f871672b37f37a422970342308276e7/wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c", size = 122797, upload-time = "2026-03-06T02:54:51.539Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/ad/819ae558036d6a15b7ed290d5b14e209ca795dd4da9c58e50c067d5927b0/wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a", size = 117350, upload-time = "2026-03-06T02:54:37.651Z" }, + { url = "https://files.pythonhosted.org/packages/8b/2d/afc18dc57a4600a6e594f77a9ae09db54f55ba455440a54886694a84c71b/wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90", size = 121223, upload-time = "2026-03-06T02:54:35.221Z" }, + { url = "https://files.pythonhosted.org/packages/b9/5b/5ec189b22205697bc56eb3b62aed87a1e0423e9c8285d0781c7a83170d15/wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586", size = 116287, upload-time = "2026-03-06T02:54:19.654Z" }, + { url = "https://files.pythonhosted.org/packages/f7/2d/f84939a7c9b5e6cdd8a8d0f6a26cabf36a0f7e468b967720e8b0cd2bdf69/wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19", size = 119593, upload-time = "2026-03-06T02:54:16.697Z" }, + { url = "https://files.pythonhosted.org/packages/0b/fe/ccd22a1263159c4ac811ab9374c061bcb4a702773f6e06e38de5f81a1bdc/wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508", size = 58631, upload-time = "2026-03-06T02:53:06.498Z" }, + { url = "https://files.pythonhosted.org/packages/65/0a/6bd83be7bff2e7efaac7b4ac9748da9d75a34634bbbbc8ad077d527146df/wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04", size = 60875, upload-time = "2026-03-06T02:53:50.252Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/c0/0b3056397fe02ff80e5a5d72d627c11eb885d1ca78e71b1a5c1e8c7d45de/wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575", size = 59164, upload-time = "2026-03-06T02:53:59.128Z" }, + { url = "https://files.pythonhosted.org/packages/71/ed/5d89c798741993b2371396eb9d4634f009ff1ad8a6c78d366fe2883ea7a6/wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb", size = 63163, upload-time = "2026-03-06T02:52:54.873Z" }, + { url = "https://files.pythonhosted.org/packages/c6/8c/05d277d182bf36b0a13d6bd393ed1dec3468a25b59d01fba2dd70fe4d6ae/wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22", size = 63723, upload-time = "2026-03-06T02:52:56.374Z" }, + { url = "https://files.pythonhosted.org/packages/f4/27/6c51ec1eff4413c57e72d6106bb8dec6f0c7cdba6503d78f0fa98767bcc9/wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596", size = 152652, upload-time = "2026-03-06T02:53:23.79Z" }, + { url = "https://files.pythonhosted.org/packages/db/4c/d7dd662d6963fc7335bfe29d512b02b71cdfa23eeca7ab3ac74a67505deb/wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044", size = 158807, upload-time = "2026-03-06T02:53:35.742Z" }, + { url = "https://files.pythonhosted.org/packages/b4/4d/1e5eea1a78d539d346765727422976676615814029522c76b87a95f6bcdd/wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b", size = 146061, upload-time = "2026-03-06T02:52:57.574Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/bc/62cabea7695cd12a288023251eeefdcb8465056ddaab6227cb78a2de005b/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf", size = 155667, upload-time = "2026-03-06T02:53:39.422Z" }, + { url = "https://files.pythonhosted.org/packages/e9/99/6f2888cd68588f24df3a76572c69c2de28287acb9e1972bf0c83ce97dbc1/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2", size = 144392, upload-time = "2026-03-06T02:54:22.41Z" }, + { url = "https://files.pythonhosted.org/packages/40/51/1dfc783a6c57971614c48e361a82ca3b6da9055879952587bc99fe1a7171/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3", size = 150296, upload-time = "2026-03-06T02:54:07.848Z" }, + { url = "https://files.pythonhosted.org/packages/6c/38/cbb8b933a0201076c1f64fc42883b0023002bdc14a4964219154e6ff3350/wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7", size = 60539, upload-time = "2026-03-06T02:54:00.594Z" }, + { url = "https://files.pythonhosted.org/packages/82/dd/e5176e4b241c9f528402cebb238a36785a628179d7d8b71091154b3e4c9e/wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5", size = 63969, upload-time = "2026-03-06T02:54:39Z" }, + { url = "https://files.pythonhosted.org/packages/5c/99/79f17046cf67e4a95b9987ea129632ba8bcec0bc81f3fb3d19bdb0bd60cd/wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00", size = 60554, upload-time = "2026-03-06T02:53:14.132Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = 
"sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" }, +] + [[package]] name = "xmltodict" version = "1.0.4" diff --git a/app-expo/.env.example b/app-expo/.env.example index f525d74..f952c6c 100644 --- a/app-expo/.env.example +++ b/app-expo/.env.example @@ -20,8 +20,12 @@ # EXPO_PUBLIC_API_URL=http://127.0.0.1:8000 # EXPO_PUBLIC_WS_URL=ws://127.0.0.1:8000 -# --- staging --- +# --- staging(必填,无默认值;示例见 env/staging)--- # APP_VARIANT=staging # EXPO_PUBLIC_APP_VARIANT=staging -EXPO_PUBLIC_API_URL=https://your-api.example.com -EXPO_PUBLIC_WS_URL=wss://your-api.example.com +# EXPO_PUBLIC_API_URL=http://your-staging-host:8000 +# EXPO_PUBLIC_WS_URL=ws://your-staging-host:8000 + +# --- production --- +# EXPO_PUBLIC_API_URL=https://your-api.example.com +# EXPO_PUBLIC_WS_URL=wss://your-api.example.com diff --git a/app-expo/app.config.ts b/app-expo/app.config.ts index 3931de8..602cfe1 100644 --- a/app-expo/app.config.ts +++ b/app-expo/app.config.ts @@ -28,7 +28,16 @@ const LOCALES: Record = { const SUPPORTED_LOCALES = ['zh', 'en'] as const; const PRIMARY_LOCALE = process.env.EXPO_PUBLIC_PRIMARY_LOCALE ?? 'zh'; -const API_BASE_URL = process.env.EXPO_PUBLIC_API_URL ?? ''; +const API_BASE_URL = process.env.EXPO_PUBLIC_API_URL?.trim() ?? ''; +const WS_BASE_URL = process.env.EXPO_PUBLIC_WS_URL?.trim() ?? ''; + +if (!API_BASE_URL || !WS_BASE_URL) { + throw new Error( + '[app.config] Missing EXPO_PUBLIC_API_URL or EXPO_PUBLIC_WS_URL. 
' + + 'Run `npm run use-env -- ` in app-expo before prebuild or Metro.', + ); +} + const ALLOW_INSECURE_HTTP = API_BASE_URL.startsWith('http://'); const APP_VARIANT = @@ -176,7 +185,14 @@ export default ({ config }: ConfigContext): ExpoConfig => { './plugins/withAndroidCleartextTraffic', { enabled: ALLOW_INSECURE_HTTP }, ], - ['./plugins/withIosInsecureHttp', { enabled: ALLOW_INSECURE_HTTP }], + [ + './plugins/withIosInsecureHttp', + { + enabled: ALLOW_INSECURE_HTTP, + apiUrl: API_BASE_URL, + wsUrl: WS_BASE_URL, + }, + ], 'expo-router', [ 'expo-splash-screen', diff --git a/app-expo/jest.config.js b/app-expo/jest.config.js index 1082f91..7db9baa 100644 --- a/app-expo/jest.config.js +++ b/app-expo/jest.config.js @@ -1,5 +1,6 @@ module.exports = { preset: 'jest-expo', + setupFiles: ['/tests/jest.setup.ts'], clearMocks: true, moduleNameMapper: { '^@/(.*)$': '/src/$1', diff --git a/app-expo/plugins/withIosInsecureHttp.js b/app-expo/plugins/withIosInsecureHttp.js index 9e9810c..1203de5 100644 --- a/app-expo/plugins/withIosInsecureHttp.js +++ b/app-expo/plugins/withIosInsecureHttp.js @@ -1,43 +1,81 @@ // @ts-check /** - * Allow HTTP / WS to staging API host via App Transport Security exception. + * Allow HTTP / WS to staging API hosts via App Transport Security. * * Enabled when EXPO_PUBLIC_API_URL uses http:// (same rule as Android cleartext). - * Host is parsed from the URL so IP:port staging endpoints work without hard-coding. + * Collects hosts from both API and WS URLs (IP:port staging often differs only by scheme). */ const { withInfoPlist } = require('@expo/config-plugins'); /** + * @param {string | undefined} raw * @returns {string | null} */ -function getHttpExceptionHost() { - const raw = process.env.EXPO_PUBLIC_API_URL ?? 
''; - if (!raw.startsWith('http://')) { +function insecureHttpHostFromUrl(raw) { + if (!raw || !raw.startsWith('http://')) { return null; } try { - return new URL(raw).hostname; + return new URL(raw).hostname || null; } catch { return null; } } +/** + * @param {string | undefined} raw + * @returns {string | null} + */ +function insecureWsHostFromUrl(raw) { + if (!raw || !raw.startsWith('ws://')) { + return null; + } + try { + return new URL(raw).hostname || null; + } catch { + return null; + } +} + +/** + * @param {string | undefined} apiUrl + * @param {string | undefined} wsUrl + * @returns {string[]} + */ +function collectInsecureHosts(apiUrl, wsUrl) { + const hosts = new Set( + [insecureHttpHostFromUrl(apiUrl), insecureWsHostFromUrl(wsUrl)].filter( + (h) => typeof h === 'string' && h.length > 0, + ), + ); + return [...hosts]; +} + +/** + * @param {string} host + */ +function isIpv4Literal(host) { + return /^\d{1,3}(\.\d{1,3}){3}$/u.test(host); +} + /** * @param {import('expo/config').ExpoConfig} config - * @param {{ enabled?: boolean }} props + * @param {{ enabled?: boolean; apiUrl?: string; wsUrl?: string }} props */ function withIosInsecureHttp(config, props = {}) { const enabled = props.enabled ?? false; + const apiUrl = props.apiUrl ?? process.env.EXPO_PUBLIC_API_URL ?? ''; + const wsUrl = props.wsUrl ?? process.env.EXPO_PUBLIC_WS_URL ?? ''; return withInfoPlist(config, (mod) => { if (!enabled) { return mod; } - const host = getHttpExceptionHost(); - if (!host) { + const hosts = collectInsecureHosts(apiUrl, wsUrl); + if (hosts.length === 0) { console.warn( - '[withIosInsecureHttp] enabled but EXPO_PUBLIC_API_URL has no http host; skipping ATS exception.', + '[withIosInsecureHttp] enabled but no http/ws hosts found in apiUrl/wsUrl; skipping ATS exception.', ); return mod; } @@ -45,17 +83,32 @@ function withIosInsecureHttp(config, props = {}) { const existing = mod.modResults.NSAppTransportSecurity ?? {}; const existingDomains = existing.NSExceptionDomains ?? 
{}; + /** @type {Record} */ + const exceptionDomains = { ...existingDomains }; + + for (const host of hosts) { + exceptionDomains[host] = { + NSExceptionAllowsInsecureHTTPLoads: true, + // IP literals have no subdomains; false avoids odd ATS behavior on some iOS versions. + NSIncludesSubdomains: !isIpv4Literal(host), + NSExceptionRequiresForwardSecrecy: false, + }; + } + mod.modResults.NSAppTransportSecurity = { ...existing, - NSExceptionDomains: { - ...existingDomains, - [host]: { - NSExceptionAllowsInsecureHTTPLoads: true, - NSIncludesSubdomains: true, - }, - }, + /** + * Staging often uses bare IP:port HTTP. Domain exceptions alone can fail on + * newer iOS builds; allow cleartext while this plugin is enabled (http:// API only). + */ + NSAllowsArbitraryLoads: true, + NSExceptionDomains: exceptionDomains, }; + console.log( + `[withIosInsecureHttp] ATS cleartext enabled for host(s): ${hosts.join(', ')}`, + ); + return mod; }); } diff --git a/app-expo/src/app/(tabs)/memoir.tsx b/app-expo/src/app/(tabs)/memoir.tsx index 1b6045e..266ce29 100644 --- a/app-expo/src/app/(tabs)/memoir.tsx +++ b/app-expo/src/app/(tabs)/memoir.tsx @@ -1,5 +1,5 @@ import { Image } from 'expo-image'; -import { router } from 'expo-router'; +import { router, useFocusEffect } from 'expo-router'; import React, { useCallback, useEffect, @@ -17,22 +17,26 @@ import { } from 'react-native'; import { SafeAreaView } from 'react-native-safe-area-context'; import { useTranslation } from 'react-i18next'; -import { FileText } from 'lucide-react-native'; +import { FileText, MessageCirclePlus } from 'lucide-react-native'; import { Icon } from '@/components/ui/icon'; import { Skeleton } from '@/components/ui/skeleton'; import { Text } from '@/components/ui/text'; import { ScreenGutter } from '@/constants/layout'; +import { ApiError, NetworkError } from '@/core/api/types'; +import { config, shouldShowAboutBackendUrl } from '@/core/config'; import { useTypography } from '@/core/typography-context'; import { 
buildFrameworkChapterPlaceholders, mergeFrameworkChaptersWithFetched, } from '@/features/memoir/framework-chapter-keys'; import { + hasAnyMemoirDraftingActivity, memoirDraftCharsRemaining, memoirDraftHasStarted, resolvedChapterCategory, } from '@/features/memoir/draft-progress'; +import { useSession } from '@/features/auth/hooks'; import { useChapters, useCheckCoverGeneration, @@ -286,13 +290,41 @@ function ChapterCard({ return null; } -function MemoirLoadError({ onRetry }: { onRetry: () => void }) { +function formatChapterLoadErrorHint(error: unknown): string | null { + if (!shouldShowAboutBackendUrl()) return null; + if (error instanceof NetworkError) { + return `${error.message}\n${config.apiBaseUrl}`; + } + if (error instanceof ApiError) { + return `HTTP ${error.status}: ${error.message}`; + } + if (error instanceof Error) return error.message; + return null; +} + +function MemoirLoadError({ + error, + onRetry, +}: { + error: unknown; + onRetry: () => void; +}) { const { t } = useTranslation('memoir'); + const hint = formatChapterLoadErrorHint(error); return ( {t('loadErrorMessage')} + {hint ? 
( + + {hint} + + ) : null} void }) { ); } +function MemoirEmptyState({ onStartChat }: { onStartChat: () => void }) { + const { t } = useTranslation('memoir'); + return ( + + + + + {t('emptyTitle')} + + + {t('emptySubtitle')} + + + + ); +} + export default function MemoirScreen() { const { t } = useTranslation('memoir'); - const { viewModels: chapters, isLoading, isError, refetch } = useChapters(); - const { data: memoirState, refetch: refetchMemoirState } = useMemoirState(); + const { isAuthenticated } = useSession(); + const { + viewModels: chapters, + isLoading, + hasCompletedChapters, + isEmptyList, + showLoadError, + error: chaptersError, + refetch, + } = useChapters({ enabled: isAuthenticated }); + const { + data: memoirState, + isLoading: isMemoirStateLoading, + refetch: refetchMemoirState, + } = useMemoirState({ enabled: isAuthenticated }); const checkCover = useCheckCoverGeneration(); const [refreshing, setRefreshing] = useState(false); const didRunInitialCoverCheckRef = useRef(false); @@ -327,6 +396,29 @@ export default function MemoirScreen() { [frameworkPlaceholders, chapters], ); + const hasDraftingActivity = useMemo(() => { + if (hasCompletedChapters) return true; + if (chapters.some((ch) => !ch.isEmpty || ch.wordCount > 0)) return true; + return hasAnyMemoirDraftingActivity(memoirState?.slots); + }, [chapters, hasCompletedChapters, memoirState?.slots]); + + const isBootstrapping = + isLoading || (isEmptyList && isMemoirStateLoading); + + const isEmptyMemoir = + !isBootstrapping && + !showLoadError && + isEmptyList && + !hasDraftingActivity; + + useFocusEffect( + useCallback(() => { + if (!isAuthenticated) return; + void refetch(); + void refetchMemoirState(); + }, [isAuthenticated, refetch, refetchMemoirState]), + ); + useEffect(() => { if (didRunInitialCoverCheckRef.current) return; didRunInitialCoverCheckRef.current = true; @@ -336,7 +428,7 @@ export default function MemoirScreen() { const handleRefresh = useCallback(async () => { 
setRefreshing(true); try { - await checkCover.mutateAsync(undefined); + await checkCover.mutateAsync(undefined).catch(() => undefined); await Promise.all([refetch(), refetchMemoirState()]); } finally { setRefreshing(false); @@ -347,6 +439,10 @@ export default function MemoirScreen() { router.push(`/(main)/chapter/${chapterId}`); }, []); + const handleStartChat = useCallback(() => { + router.push('/(tabs)'); + }, []); + return ( @@ -361,19 +457,27 @@ export default function MemoirScreen() { paddingTop: 24, paddingBottom: 96, gap: 24, - ...(!isLoading && isError + ...(!isBootstrapping && (showLoadError || isEmptyMemoir) ? { flexGrow: 1, justifyContent: 'center' } : {}), }} > - {isLoading ? ( + {isBootstrapping ? ( <> - ) : isError ? ( - void refetch()} /> + ) : showLoadError ? ( + { + void refetch(); + void refetchMemoirState(); + }} + /> + ) : isEmptyMemoir ? ( + ) : ( displayChapters.map((item) => { const variant = getChapterVariant(item); diff --git a/app-expo/src/core/config.ts b/app-expo/src/core/config.ts index f2349e6..7fc0f50 100644 --- a/app-expo/src/core/config.ts +++ b/app-expo/src/core/config.ts @@ -4,6 +4,57 @@ function trimTrailingSlashes(value: string): string { export type AppVariant = 'development' | 'staging' | 'production'; +const MISSING_ENV_HINT = + 'Run `npm run use-env -- ` in app-expo, ' + + 'then restart Metro or re-run `expo prebuild` before building.'; + +/** + * EXPO_PUBLIC_* must be set at bundle time (Metro / EAS / Xcode Archive). + * Refuses silent fallbacks to a hard-coded LAN IP. + */ +export function requirePublicEnv(name: string): string { + const value = process.env[name]?.trim(); + if (!value) { + throw new Error(`[config] Missing ${name}. 
${MISSING_ENV_HINT}`); + } + return value; +} + +function parseBackendUrl(raw: string, envName: string): URL { + let parsed: URL; + try { + parsed = new URL(raw); + } catch { + throw new Error(`[config] Invalid ${envName}: ${raw}`); + } + if (!parsed.protocol || parsed.protocol === ':') { + throw new Error(`[config] ${envName} must include a scheme (http/https or ws/wss): ${raw}`); + } + return parsed; +} + +function resolveApiBaseUrl(): string { + const raw = requirePublicEnv('EXPO_PUBLIC_API_URL'); + const parsed = parseBackendUrl(raw, 'EXPO_PUBLIC_API_URL'); + if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') { + throw new Error( + `[config] EXPO_PUBLIC_API_URL must use http:// or https:// (got ${parsed.protocol})`, + ); + } + return trimTrailingSlashes(raw); +} + +function resolveWsBaseUrl(): string { + const raw = requirePublicEnv('EXPO_PUBLIC_WS_URL'); + const parsed = parseBackendUrl(raw, 'EXPO_PUBLIC_WS_URL'); + if (parsed.protocol !== 'ws:' && parsed.protocol !== 'wss:') { + throw new Error( + `[config] EXPO_PUBLIC_WS_URL must use ws:// or wss:// (got ${parsed.protocol})`, + ); + } + return trimTrailingSlashes(raw); +} + function resolveAppVariant(): AppVariant { const raw = process.env.EXPO_PUBLIC_APP_VARIANT; if (raw === 'development' || raw === 'staging' || raw === 'production') { @@ -33,12 +84,8 @@ export function shouldShowAboutBackendUrl(variant: AppVariant = appVariant): boo export const appVariant = resolveAppVariant(); export const config = { - apiBaseUrl: trimTrailingSlashes( - process.env.EXPO_PUBLIC_API_URL ?? 'http://192.168.10.151:8000', - ), - wsBaseUrl: trimTrailingSlashes( - process.env.EXPO_PUBLIC_WS_URL ?? 
'ws://192.168.10.151:8000', - ), + apiBaseUrl: resolveApiBaseUrl(), + wsBaseUrl: resolveWsBaseUrl(), isDebugMode: __DEV__, appVariant, showAboutBackendUrl: shouldShowAboutBackendUrl(), diff --git a/app-expo/src/features/auth/hooks.ts b/app-expo/src/features/auth/hooks.ts index b6cfefc..7c06a4e 100644 --- a/app-expo/src/features/auth/hooks.ts +++ b/app-expo/src/features/auth/hooks.ts @@ -6,6 +6,8 @@ import { tokenManager } from '@/core/auth/token-manager'; import { clearLocalSessionAndReplayEntry } from '@/features/auth/clear-local-session-and-replay-entry'; import { getDeviceLanguage } from '@/i18n'; +import { memoirKeys } from '@/features/memoir/query-keys'; + import { authApi } from './api'; import { authKeys } from './auth-query-keys'; import type { @@ -126,7 +128,10 @@ function usePostAuthSetup() { async (tokens: TokenResponse) => { await tokenManager.setTokens(tokens.access_token, tokens.refresh_token); queryClient.setQueryData(authKeys.tokenCheck, true); - await queryClient.invalidateQueries({ queryKey: authKeys.session }); + await Promise.all([ + queryClient.invalidateQueries({ queryKey: authKeys.session }), + queryClient.invalidateQueries({ queryKey: memoirKeys.all }), + ]); }, [queryClient], ); diff --git a/app-expo/src/features/memoir/api.ts b/app-expo/src/features/memoir/api.ts index 82266e9..d054e07 100644 --- a/app-expo/src/features/memoir/api.ts +++ b/app-expo/src/features/memoir/api.ts @@ -1,5 +1,9 @@ import { api } from '@/core/api/client'; +import { + isChapterListNotFoundError, + normalizeChapterList, +} from './chapter-list-response'; import type { Book, Chapter, @@ -32,10 +36,18 @@ export const memoirApi = { return api.post('/api/books/export-pdf', { body }); }, - fetchChapters(isNew?: boolean) { - return api.get('/api/chapters', { - params: isNew !== undefined ? { is_new: isNew } : undefined, - }); + async fetchChapters(isNew?: boolean): Promise { + try { + const data = await api.get('/api/chapters', { + params: isNew !== undefined ? 
{ is_new: isNew } : undefined, + }); + return normalizeChapterList(data); + } catch (error) { + if (isChapterListNotFoundError(error)) { + return []; + } + throw error; + } }, fetchChapterDetail(chapterId: string) { diff --git a/app-expo/src/features/memoir/chapter-list-response.ts b/app-expo/src/features/memoir/chapter-list-response.ts new file mode 100644 index 0000000..980d760 --- /dev/null +++ b/app-expo/src/features/memoir/chapter-list-response.ts @@ -0,0 +1,47 @@ +import { ApiError, AuthError } from '@/core/api/types'; + +import type { Chapter } from './types'; + +/** Normalize GET /api/chapters payload; reject non-arrays without surfacing as query errors. */ +export function normalizeChapterList(data: unknown): Chapter[] { + if (data == null) return []; + if (Array.isArray(data)) return data as Chapter[]; + return []; +} + +export function isChapterListNotFoundError(error: unknown): boolean { + return error instanceof ApiError && error.status === 404; +} + +/** 未登录/无权限:不应展示「加载章节失败」(会话层会处理或展示框架位)。 */ +export function isChapterListAuthError(error: unknown): boolean { + if (error instanceof AuthError) return true; + return ( + error instanceof ApiError && + (error.status === 401 || error.status === 403) + ); +} + +/** + * True when GET /api/chapters succeeded but there are no list items (incl. filtered + * non-displayable chapters). Distinct from transport/auth failures. + */ +export function isChapterListEmptySuccess( + isSuccess: boolean, + chapters: Chapter[], +): boolean { + return isSuccess && chapters.length === 0; +} + +/** Only show "Could not load chapters" for real failures, not empty memoir or auth redirect. 
*/ +export function shouldShowChapterListLoadError( + error: unknown, + isSuccess: boolean, + chapterCount: number, +): boolean { + if (isSuccess && chapterCount === 0) return false; + if (error == null) return false; + if (isChapterListNotFoundError(error)) return false; + if (isChapterListAuthError(error)) return false; + return true; +} diff --git a/app-expo/src/features/memoir/draft-progress.ts b/app-expo/src/features/memoir/draft-progress.ts index 34d04ff..d74fe83 100644 --- a/app-expo/src/features/memoir/draft-progress.ts +++ b/app-expo/src/features/memoir/draft-progress.ts @@ -37,6 +37,16 @@ export function interviewStageHasSnippetMaterial( ); } +/** 访谈槽位是否已有任意口述片段(尚无成稿章节时仍视为「进行中」)。 */ +export function hasAnyMemoirDraftingActivity( + slots: MemoirState['slots'] | undefined, +): boolean { + if (!slots) return false; + return Object.keys(slots).some((stage) => + interviewStageHasSnippetMaterial(slots, stage), + ); +} + export function memoirDraftHasStarted( slots: MemoirState['slots'] | undefined, chapterCategory: string, diff --git a/app-expo/src/features/memoir/hooks.ts b/app-expo/src/features/memoir/hooks.ts index 1972816..c55667e 100644 --- a/app-expo/src/features/memoir/hooks.ts +++ b/app-expo/src/features/memoir/hooks.ts @@ -1,6 +1,9 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { AuthError } from '@/core/api/types'; + import { memoirApi } from './api'; +import { shouldShowChapterListLoadError } from './chapter-list-response'; import { toChapterViewModels } from './mappers'; import { memoirKeys } from './query-keys'; import type { ExportPdfRequest, UpdateBookRequest } from './types'; @@ -38,15 +41,42 @@ export function useUpdateBookTitle() { // ─── Chapters ─── -export function useChapters() { +export function hasCompletedMemoirChapter( + chapters: { isEmpty: boolean }[], +): boolean { + return chapters.some((ch) => !ch.isEmpty); +} + +export function useChapters(options?: { enabled?: boolean }) { + const 
enabled = options?.enabled ?? true; const query = useQuery({ queryKey: memoirKeys.chapters(), queryFn: () => memoirApi.fetchChapters(), + enabled, + retry: (failureCount, error) => { + if (error instanceof AuthError) return false; + return failureCount < 1; + }, }); + const viewModels = query.data ? toChapterViewModels(query.data) : []; + const hasCompletedChapters = hasCompletedMemoirChapter(viewModels); + const isEmptyList = + query.isSuccess && viewModels.length === 0 && !hasCompletedChapters; + const showLoadError = + !query.isLoading && + shouldShowChapterListLoadError( + query.error, + query.isSuccess, + viewModels.length, + ); + return { ...query, - viewModels: query.data ? toChapterViewModels(query.data) : [], + viewModels, + hasCompletedChapters, + isEmptyList, + showLoadError, }; } @@ -84,10 +114,12 @@ export function useCheckCoverGeneration() { // ─── Memoir state ─── -export function useMemoirState() { +export function useMemoirState(options?: { enabled?: boolean }) { + const enabled = options?.enabled ?? 
true; return useQuery({ queryKey: memoirKeys.state(), queryFn: () => memoirApi.fetchMemoirState(), + enabled, }); } diff --git a/app-expo/tests/core/config.test.ts b/app-expo/tests/core/config.test.ts index 6cffd7e..0e74766 100644 --- a/app-expo/tests/core/config.test.ts +++ b/app-expo/tests/core/config.test.ts @@ -1,10 +1,37 @@ import { appVariant, config, + requirePublicEnv, shouldShowAboutBackendUrl, type AppVariant, } from '@/core/config'; +describe('requirePublicEnv', () => { + it('throws when variable is missing or blank', () => { + const key = 'EXPO_PUBLIC_API_URL'; + const previous = process.env[key]; + try { + delete process.env[key]; + expect(() => requirePublicEnv(key)).toThrow(/Missing EXPO_PUBLIC_API_URL/); + process.env[key] = ' '; + expect(() => requirePublicEnv(key)).toThrow(/Missing EXPO_PUBLIC_API_URL/); + } finally { + if (previous === undefined) { + process.env[key] = 'http://127.0.0.1:8000'; + } else { + process.env[key] = previous; + } + } + }); +}); + +describe('config backend URLs', () => { + it('loads API and WS from EXPO_PUBLIC_* (jest.setup defaults)', () => { + expect(config.apiBaseUrl).toBe('http://127.0.0.1:8000'); + expect(config.wsBaseUrl).toBe('ws://127.0.0.1:8000'); + }); +}); + describe('shouldShowAboutBackendUrl', () => { it('shows backend URL for development and staging', () => { expect(shouldShowAboutBackendUrl('development')).toBe(true); diff --git a/app-expo/tests/features/memoir/chapter-list-response.test.ts b/app-expo/tests/features/memoir/chapter-list-response.test.ts new file mode 100644 index 0000000..b5136da --- /dev/null +++ b/app-expo/tests/features/memoir/chapter-list-response.test.ts @@ -0,0 +1,71 @@ +import { ApiError, AuthError, NetworkError } from '@/core/api/types'; +import { + isChapterListAuthError, + isChapterListEmptySuccess, + isChapterListNotFoundError, + normalizeChapterList, + shouldShowChapterListLoadError, +} from '@/features/memoir/chapter-list-response'; +import type { Chapter } from 
'@/features/memoir/types'; + +describe('normalizeChapterList', () => { + it('returns empty array for nullish or non-array payloads', () => { + expect(normalizeChapterList(null)).toEqual([]); + expect(normalizeChapterList(undefined)).toEqual([]); + expect(normalizeChapterList({ items: [] })).toEqual([]); + }); + + it('passes through chapter arrays', () => { + const chapters = [{ id: 'ch-1' }] as Chapter[]; + expect(normalizeChapterList(chapters)).toBe(chapters); + }); +}); + +describe('isChapterListNotFoundError', () => { + it('detects ApiError 404', () => { + expect(isChapterListNotFoundError(new ApiError('missing', 404))).toBe(true); + expect(isChapterListNotFoundError(new ApiError('bad', 500))).toBe(false); + expect(isChapterListNotFoundError(new Error('other'))).toBe(false); + }); +}); + +describe('isChapterListEmptySuccess', () => { + it('is true only for successful empty arrays', () => { + expect(isChapterListEmptySuccess(true, [])).toBe(true); + expect(isChapterListEmptySuccess(true, [{ id: 'x' } as never])).toBe( + false, + ); + expect(isChapterListEmptySuccess(false, [])).toBe(false); + }); +}); + +describe('isChapterListAuthError', () => { + it('treats AuthError and 401/403 ApiError as auth errors', () => { + expect(isChapterListAuthError(new AuthError())).toBe(true); + expect(isChapterListAuthError(new ApiError('unauthorized', 401))).toBe(true); + expect(isChapterListAuthError(new ApiError('forbidden', 403))).toBe(true); + expect(isChapterListAuthError(new ApiError('server', 500))).toBe(false); + }); +}); + +describe('shouldShowChapterListLoadError', () => { + it('hides load error for empty success, 404, and auth failures', () => { + expect(shouldShowChapterListLoadError(null, true, 0)).toBe(false); + expect(shouldShowChapterListLoadError(new ApiError('nope', 404), false, 0)).toBe( + false, + ); + expect(shouldShowChapterListLoadError(new AuthError(), false, 0)).toBe(false); + expect( + shouldShowChapterListLoadError(new ApiError('unauthorized', 401), 
false, 0), + ).toBe(false); + }); + + it('shows load error for network and server failures', () => { + expect( + shouldShowChapterListLoadError(new NetworkError('offline'), false, 0), + ).toBe(true); + expect( + shouldShowChapterListLoadError(new ApiError('boom', 500), false, 0), + ).toBe(true); + }); +}); diff --git a/app-expo/tests/features/memoir/draft-progress.test.ts b/app-expo/tests/features/memoir/draft-progress.test.ts index 7916f79..cabc32e 100644 --- a/app-expo/tests/features/memoir/draft-progress.test.ts +++ b/app-expo/tests/features/memoir/draft-progress.test.ts @@ -1,5 +1,6 @@ import { chapterCategoryToInterviewStage, + hasAnyMemoirDraftingActivity, memoirDraftCharsRemaining, memoirDraftHasStarted, MIN_CHAPTER_DISPLAY_CHARS, @@ -23,6 +24,14 @@ describe('draft-progress', () => { ).toBe('career_early'); }); + test('hasAnyMemoirDraftingActivity when any stage has snippet', () => { + const slots = { + childhood: { q1: { snippet: '小时候…', status: 'filled' } }, + }; + expect(hasAnyMemoirDraftingActivity(slots)).toBe(true); + expect(hasAnyMemoirDraftingActivity({})).toBe(false); + }); + test('memoirDraftHasStarted when interview slots have snippet', () => { const slots = { childhood: { place: { snippet: '老家在小城', segment_ids: [] } }, diff --git a/app-expo/tests/jest.setup.ts b/app-expo/tests/jest.setup.ts new file mode 100644 index 0000000..9179b0c --- /dev/null +++ b/app-expo/tests/jest.setup.ts @@ -0,0 +1,9 @@ +/** + * Jest loads config at import time; EXPO_PUBLIC_* must be set before any @/core/config import. + */ +process.env.EXPO_PUBLIC_API_URL = + process.env.EXPO_PUBLIC_API_URL ?? 'http://127.0.0.1:8000'; +process.env.EXPO_PUBLIC_WS_URL = + process.env.EXPO_PUBLIC_WS_URL ?? 'ws://127.0.0.1:8000'; +process.env.EXPO_PUBLIC_APP_VARIANT = + process.env.EXPO_PUBLIC_APP_VARIANT ?? 'development';