From f09ae248f9b909aea9db8c81be8e24eb7b9c0069 Mon Sep 17 00:00:00 2001
From: Sully <101929462+Sullivansome@users.noreply.github.com>
Date: Wed, 20 May 2026 15:14:13 +0800
Subject: [PATCH] feat: OpenTelemetry LGTM observability, dev tooling, and
 memoir UX fixes (#31) (#32)

* add staging iOS app build script

* feat(api): add OpenTelemetry LGTM stack for local observability

Wire OTel traces, metrics, and logs through a collector to Tempo,
Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay,
Grafana provisioning, env templates, and development.sh auto-start.


* feat: expand observability, harden dev tooling, and fix expo staging UX

Add business and LLM Prometheus metrics with Grafana dashboards, alerting,
and a metrics verification script. Wire telemetry through adapters and core
LLM paths, and document the local LGTM workflow.

Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome,
and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge
internal-eval into the main dev script with improved compose handling.

Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs,
show memoir empty state instead of load errors when no chapters exist, and
add jest env setup plus chapter list response normalization.


* chore: enable Grafana Assistant Cursor plugin


* fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp

Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019.


---------

Co-authored-by: Kevin <kevin@brighteng.org>
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .cursor/settings.json                         |   3 +
 api/.env.example                              |  33 +-
 api/.env.production                           |  12 +
 api/.env.staging                              |  12 +
 api/README.md                                 |  10 +
 api/app/adapters/asr/tencent_asr.py           |   5 +
 api/app/adapters/asr/whisper_local.py         |   5 +
 api/app/adapters/embedding/zhipu.py           |  25 +-
 api/app/adapters/llm/deepseek.py              |  21 +-
 api/app/adapters/sms/tencent.py               |   5 +
 api/app/adapters/tts/openai_tts.py            |   5 +
 api/app/adapters/tts/tencent_tts.py           |  11 +
 api/app/agents/chat/interview_agent.py        |  41 +-
 api/app/core/agent_logging.py                 |  38 +-
 api/app/core/alembic_revision_repair.py       |   2 +
 api/app/core/business_telemetry.py            |  81 ++++
 api/app/core/config.py                        |  30 ++
 api/app/core/langchain_llm.py                 | 101 +++--
 api/app/core/llm_call.py                      | 194 ++++++++-
 api/app/core/llm_gateway.py                   |  23 +-
 api/app/core/llm_telemetry.py                 | 384 ++++++++++++++++++
 api/app/core/logging.py                       |  30 +-
 api/app/core/middleware.py                    |   4 +-
 api/app/core/telemetry.py                     | 146 +++++++
 api/app/features/conversation/ws/pipeline.py  |  33 ++
 api/app/features/evaluation/judge_service.py  |  11 +-
 .../features/memoir/story_pipeline_sync.py    | 252 +++++++-----
 api/app/features/payment/alipay_client.py     |  14 +
 api/app/features/payment/wechat_client.py     |  14 +
 api/app/internal_main.py                      |  12 +-
 api/app/main.py                               |  10 +-
 api/app/tasks/celery_app.py                   |  11 +-
 api/app/tasks/memoir_tasks.py                 | 179 ++++----
 api/app/tasks/memory_compaction_tasks.py      |   7 +-
 api/app/tasks/memory_enrichment_tasks.py      |   7 +-
 .../dashboards/life-echo-business.json        |  75 ++++
 .../grafana/dashboards/life-echo-llm.json     |  79 ++++
 .../grafana/dashboards/life-echo-logs.json    |  69 ++++
 .../dashboards/life-echo-overview.json        | 154 +++++++
 .../provisioning/alerting/contact_points.yml  |   4 +
 .../grafana/provisioning/alerting/rules.yml   | 147 +++++++
 .../provisioning/dashboards/dashboards.yml    |  11 +
 .../provisioning/datasources/datasources.yml  |  43 ++
 api/deploy/observability/loki-config.yaml     |  32 ++
 .../observability/otel-collector-config.yaml  |  53 +++
 api/deploy/observability/prometheus.yml       |  12 +
 api/deploy/observability/promtail-config.yaml |  41 ++
 api/deploy/observability/tempo.yaml           |  29 ++
 api/development.sh                            | 223 +++++++++-
 api/docker-compose.observability.yml          | 122 ++++++
 api/docs/internal-eval.md                     |  19 +-
 api/docs/observability.md                     | 139 +++++++
 api/docs/部署指南.md                          |   4 +-
 api/internal-eval.sh                          |  18 +-
 api/pyproject.toml                            |   9 +
 api/scripts/verify_observability_metrics.sh   |  40 ++
 api/tests/core/test_business_telemetry.py     |  64 +++
 api/tests/core/test_llm_telemetry.py          | 118 ++++++
 api/uv.lock                                   | 353 +++++++++++++++-
 app-expo/.env.example                         |  10 +-
 app-expo/app.config.ts                        |  20 +-
 app-expo/jest.config.js                       |   1 +
 app-expo/plugins/withIosInsecureHttp.js       |  87 +++-
 app-expo/src/app/(tabs)/memoir.tsx            | 124 +++++-
 app-expo/src/core/config.ts                   |  59 ++-
 app-expo/src/features/auth/hooks.ts           |   7 +-
 app-expo/src/features/memoir/api.ts           |  20 +-
 .../features/memoir/chapter-list-response.ts  |  47 +++
 .../src/features/memoir/draft-progress.ts     |  10 +
 app-expo/src/features/memoir/hooks.ts         |  38 +-
 app-expo/tests/core/config.test.ts            |  27 ++
 .../memoir/chapter-list-response.test.ts      |  71 ++++
 .../features/memoir/draft-progress.test.ts    |   9 +
 app-expo/tests/jest.setup.ts                  |   9 +
 74 files changed, 3793 insertions(+), 375 deletions(-)
 create mode 100644 api/app/core/business_telemetry.py
 create mode 100644 api/app/core/llm_telemetry.py
 create mode 100644 api/app/core/telemetry.py
 create mode 100644 api/deploy/observability/grafana/dashboards/life-echo-business.json
 create mode 100644 api/deploy/observability/grafana/dashboards/life-echo-llm.json
 create mode 100644 api/deploy/observability/grafana/dashboards/life-echo-logs.json
 create mode 100644 api/deploy/observability/grafana/dashboards/life-echo-overview.json
 create mode 100644 api/deploy/observability/grafana/provisioning/alerting/contact_points.yml
 create mode 100644 api/deploy/observability/grafana/provisioning/alerting/rules.yml
 create mode 100644 api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml
 create mode 100644 api/deploy/observability/grafana/provisioning/datasources/datasources.yml
 create mode 100644 api/deploy/observability/loki-config.yaml
 create mode 100644 api/deploy/observability/otel-collector-config.yaml
 create mode 100644 api/deploy/observability/prometheus.yml
 create mode 100644 api/deploy/observability/promtail-config.yaml
 create mode 100644 api/deploy/observability/tempo.yaml
 create mode 100644 api/docker-compose.observability.yml
 create mode 100644 api/docs/observability.md
 create mode 100755 api/scripts/verify_observability_metrics.sh
 create mode 100644 api/tests/core/test_business_telemetry.py
 create mode 100644 api/tests/core/test_llm_telemetry.py
 create mode 100644 app-expo/src/features/memoir/chapter-list-response.ts
 create mode 100644 app-expo/tests/features/memoir/chapter-list-response.test.ts
 create mode 100644 app-expo/tests/jest.setup.ts

diff --git a/.cursor/settings.json b/.cursor/settings.json
index 9c2ee48..331de8f 100644
--- a/.cursor/settings.json
+++ b/.cursor/settings.json
@@ -5,6 +5,9 @@
     },
     "postman": {
       "enabled": true
+    },
+    "grafana-assistant": {
+      "enabled": true
     }
   }
 }
diff --git a/api/.env.example b/api/.env.example
index 35cd342..ed543ba 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -16,6 +16,37 @@
 # LIFE_ECHO_API_HOST_PORT=8000
 # 若 Caddy 跑在独立容器且非 host 网络，不要用 127.0.0.1，应把 Caddy 加入与本 compose 相同的 Docker 网络，并对 http://life-echo-api-prod:8000 做 reverse_proxy。
 
+# =============================================================================
+# OpenTelemetry（见 docs/observability.md；Settings 只读 .env，勿 shell export）
+# =============================================================================
+# docker-compose.observability.yml 宿主机端口（高位端口，避免 3000/9090/4317 冲突）
+# GRAFANA_HOST_PORT=48300
+# PROMETHEUS_HOST_PORT=49090
+# OTEL_GRPC_HOST_PORT=48317
+# OTEL_HTTP_HOST_PORT=48318
+# OTEL_COLLECTOR_HEALTH_HOST_PORT=48333
+# TEMPO_HTTP_HOST_PORT=43200
+# LOKI_HTTP_HOST_PORT=43100
+#
+# --- development（.env.development）：本机 uvicorn/celery ---
+# OTEL_ENABLED=true
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317
+# OTEL_TRACES_SAMPLER=always_on
+#
+# --- staging / production（.env.staging / .env.production）：容器内 compose ---
+# OTEL_ENABLED=false
+# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+# OTEL_TRACES_SAMPLER=parentbased_traceidratio
+# OTEL_TRACES_SAMPLER_ARG=0.1
+#
+OTEL_ENABLED=true
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317
+OTEL_EXPORTER_OTLP_INSECURE=true
+OTEL_SERVICE_NAME=life-echo-api
+OTEL_TRACES_SAMPLER=always_on
+# OTEL_TRACES_SAMPLER_ARG=0.1
+# OTEL_METRIC_EXPORT_INTERVAL_MS=10000
+
 # =============================================================================
 # Logging（loguru sink 最低级别：TRACE / DEBUG / INFO / WARNING / ERROR / CRITICAL）
 # =============================================================================
@@ -140,7 +171,7 @@ REDIS_SESSION_TTL=86400
 # CELERY_MEMORY_ENRICHMENT_QUEUE=memory_idle
 
 # =============================================================================
-# Internal evaluation API（internal_main / internal-eval.sh；与主 API 进程隔离）
+# Internal evaluation API（internal_main；development.sh 默认一并启动；与主 API 进程隔离）
 # =============================================================================
 # 本地：`openssl rand -hex 32`；不用 internal eval 时可留空
 INTERNAL_EVAL_API_KEY=
diff --git a/api/.env.production b/api/.env.production
index c059d7b..6650d28 100644
--- a/api/.env.production
+++ b/api/.env.production
@@ -33,6 +33,18 @@ LOG_LEVEL=INFO
 # CELERY_LOG_LEVEL=
 # HTTPX_LOG_LEVEL=
 
+# =============================================================================
+# OpenTelemetry（生产；第二阶段 compose profile 接入后设 OTEL_ENABLED=true，见 docs/observability.md）
+# 容器内 API/Celery → http://otel-collector:4317；勿用 localhost
+# =============================================================================
+OTEL_ENABLED=false
+OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+OTEL_EXPORTER_OTLP_INSECURE=true
+OTEL_SERVICE_NAME=life-echo-api
+OTEL_TRACES_SAMPLER=parentbased_traceidratio
+OTEL_TRACES_SAMPLER_ARG=0.1
+# OTEL_METRIC_EXPORT_INTERVAL_MS=10000
+
 # =============================================================================
 # LLM / DeepSeek
 # =============================================================================
diff --git a/api/.env.staging b/api/.env.staging
index 7a6dcc6..fa1ed8b 100644
--- a/api/.env.staging
+++ b/api/.env.staging
@@ -32,6 +32,18 @@ LOG_LEVEL=INFO
 # CELERY_LOG_LEVEL=
 # HTTPX_LOG_LEVEL=
 
+# =============================================================================
+# OpenTelemetry（预发；compose 接入 LGTM 后设 OTEL_ENABLED=true，见 docs/observability.md）
+# API/Celery 容器内 endpoint 用服务名；Grafana 宿主机端口见 observability compose（默认 48300 等）
+# =============================================================================
+OTEL_ENABLED=false
+OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
+OTEL_EXPORTER_OTLP_INSECURE=true
+OTEL_SERVICE_NAME=life-echo-api
+OTEL_TRACES_SAMPLER=parentbased_traceidratio
+OTEL_TRACES_SAMPLER_ARG=0.1
+# OTEL_METRIC_EXPORT_INTERVAL_MS=10000
+
 # =============================================================================
 # LLM / DeepSeek
 # =============================================================================
diff --git a/api/README.md b/api/README.md
index fa26cd1..715351f 100644
--- a/api/README.md
+++ b/api/README.md
@@ -32,6 +32,16 @@ Life Echo API 是一个智能对话系统，通过 WebSocket 实时连接，使
 - **ASR/TTS**: OpenAI Whisper API
 - **认证**: JWT (python-jose) + bcrypt
 - **其他**: Pydantic, python-dotenv
+- **可观测性**: OpenTelemetry → Grafana LGTM（Tempo / Prometheus / Loki），见 [`docs/observability.md`](docs/observability.md)
+
+## 可观测性（本地）
+
+```bash
+docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d
+# Grafana: http://127.0.0.1:48300
+```
+
+在 `.env` 中配置 `OTEL_*`（见 [`.env.example`](.env.example)），与 Postgres/Redis 一样由 Settings 加载，无需 shell export。详见 [`docs/observability.md`](docs/observability.md)。
 
 ## 项目结构
 
diff --git a/api/app/adapters/asr/tencent_asr.py b/api/app/adapters/asr/tencent_asr.py
index 0620910..d80fccf 100644
--- a/api/app/adapters/asr/tencent_asr.py
+++ b/api/app/adapters/asr/tencent_asr.py
@@ -3,6 +3,7 @@
 import asyncio
 import base64
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 from app.ports.asr import ASRTranscriptionError
 
@@ -39,6 +40,10 @@ class TencentASRProvider:
         return bool(self._secret_id and self._secret_key and self._get_client())
 
     async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
+        with business_span("asr.transcribe", provider="tencent"):
+            return await self._transcribe_inner(audio, format)
+
+    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
         client = self._get_client()
         if not client:
             raise ASRTranscriptionError(
diff --git a/api/app/adapters/asr/whisper_local.py b/api/app/adapters/asr/whisper_local.py
index a64a0a5..dfe6bd9 100644
--- a/api/app/adapters/asr/whisper_local.py
+++ b/api/app/adapters/asr/whisper_local.py
@@ -8,6 +8,7 @@ import re
 import tempfile
 from typing import Any, Iterable
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 from app.ports.asr import ASRTranscriptionError
 
@@ -102,6 +103,10 @@ class WhisperASRProvider:
         return self._load_model()
 
     async def transcribe(self, audio: bytes, format: str = "m4a") -> str:
+        with business_span("asr.transcribe", provider="whisper"):
+            return await self._transcribe_inner(audio, format)
+
+    async def _transcribe_inner(self, audio: bytes, format: str) -> str:
         # 与 v1.1.0 相同的单次 transcribe；推理放线程池，避免阻塞 asyncio（tag 上为同步调用）。
         self._load_model()
         if not self._model:
diff --git a/api/app/adapters/embedding/zhipu.py b/api/app/adapters/embedding/zhipu.py
index cb018ad..249d415 100644
--- a/api/app/adapters/embedding/zhipu.py
+++ b/api/app/adapters/embedding/zhipu.py
@@ -6,6 +6,7 @@ import asyncio
 
 from zai import ZhipuAiClient
 
+from app.core.business_telemetry import business_span
 from app.core.embedding import MEMORY_EMBEDDING_DIMENSION
 from app.core.logging import get_logger
 
@@ -57,12 +58,13 @@ class ZhipuEmbeddingProvider:
     async def embed_texts(self, texts: list[str]) -> list[list[float]]:
         if not self._client or not texts:
             return []
-        out: list[list[float]] = []
-        for i in range(0, len(texts), _EMBED_BATCH_SIZE):
-            batch = texts[i : i + _EMBED_BATCH_SIZE]
-            part = await asyncio.to_thread(self._create_vectors_sync, batch)
-            out.extend(part)
-        return out
+        with business_span("embedding.zhipu.embed", batch_size=len(texts)):
+            out: list[list[float]] = []
+            for i in range(0, len(texts), _EMBED_BATCH_SIZE):
+                batch = texts[i : i + _EMBED_BATCH_SIZE]
+                part = await asyncio.to_thread(self._create_vectors_sync, batch)
+                out.extend(part)
+            return out
 
     def embed_text_sync(self, text: str) -> list[float]:
         vecs = self.embed_texts_sync([text])
@@ -71,8 +73,9 @@ class ZhipuEmbeddingProvider:
     def embed_texts_sync(self, texts: list[str]) -> list[list[float]]:
         if not self._client or not texts:
             return []
-        out: list[list[float]] = []
-        for i in range(0, len(texts), _EMBED_BATCH_SIZE):
-            batch = texts[i : i + _EMBED_BATCH_SIZE]
-            out.extend(self._create_vectors_sync(batch))
-        return out
+        with business_span("embedding.zhipu.embed", batch_size=len(texts)):
+            out: list[list[float]] = []
+            for i in range(0, len(texts), _EMBED_BATCH_SIZE):
+                batch = texts[i : i + _EMBED_BATCH_SIZE]
+                out.extend(self._create_vectors_sync(batch))
+            return out
diff --git a/api/app/adapters/llm/deepseek.py b/api/app/adapters/llm/deepseek.py
index 244574c..0d6805d 100644
--- a/api/app/adapters/llm/deepseek.py
+++ b/api/app/adapters/llm/deepseek.py
@@ -4,6 +4,8 @@ from collections.abc import AsyncIterator
 
 from langchain_openai import ChatOpenAI
 
+from app.core.llm_telemetry import langchain_invoke_span, observe_astream
+
 
 class DeepSeekLLMProvider:
     """LangChain-based LLM adapter for DeepSeek and OpenAI-compatible APIs.
@@ -56,7 +58,15 @@ class DeepSeekLLMProvider:
     ) -> str:
         llm = self._get_llm(temperature, model, max_tokens)
         lc_messages = _to_langchain_messages(messages)
-        result = await llm.ainvoke(lc_messages)
+        resolved_model = model or self._default_model
+        with langchain_invoke_span(
+            agent="deepseek.complete",
+            provider="deepseek",
+            model=resolved_model,
+            call_type="chat",
+        ) as tel:
+            result = await llm.ainvoke(lc_messages)
+            tel["response"] = result
         return str(result.content)
 
     async def stream(
@@ -69,7 +79,14 @@ class DeepSeekLLMProvider:
     ) -> AsyncIterator[str]:
         llm = self._get_llm(temperature, model, max_tokens)
         lc_messages = _to_langchain_messages(messages)
-        async for chunk in llm.astream(lc_messages):
+        resolved_model = model or self._default_model
+        async for chunk in observe_astream(
+            llm,
+            lc_messages,
+            agent="deepseek.stream",
+            provider="deepseek",
+            model=resolved_model,
+        ):
             if chunk.content:
                 yield str(chunk.content)
 
diff --git a/api/app/adapters/sms/tencent.py b/api/app/adapters/sms/tencent.py
index 357988b..e3f736d 100644
--- a/api/app/adapters/sms/tencent.py
+++ b/api/app/adapters/sms/tencent.py
@@ -7,6 +7,7 @@ from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
 from tencentcloud.sms.v20210111 import models as sms_models
 from tencentcloud.sms.v20210111 import sms_client
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 
 logger = get_logger(__name__)
@@ -32,6 +33,10 @@ class TencentSmsSender:
         self._template_param_count = template_param_count
 
     def send_verification_code(self, phone: str, code: str) -> bool:
+        with business_span("sms.tencent.send"):
+            return self._send_verification_code_inner(phone, code)
+
+    def _send_verification_code_inner(self, phone: str, code: str) -> bool:
         if not self._secret_id or not self._secret_key:
             logger.error("Tencent SMS credentials not configured")
             return False
diff --git a/api/app/adapters/tts/openai_tts.py b/api/app/adapters/tts/openai_tts.py
index 6c2553a..55eaf64 100644
--- a/api/app/adapters/tts/openai_tts.py
+++ b/api/app/adapters/tts/openai_tts.py
@@ -5,6 +5,7 @@ from io import BytesIO
 
 from openai import OpenAI
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 
 logger = get_logger(__name__)
@@ -35,6 +36,10 @@ class OpenAITTSProvider:
         *,
         language: str = "zh",  # noqa: ARG002 — OpenAI TTS auto-detects language
     ) -> bytes:
+        with business_span("tts.synthesize", provider="openai"):
+            return await self._synthesize_api(text, voice)
+
+    async def _synthesize_api(self, text: str, voice: str) -> bytes:
         if not self._client:
             return b""
         try:
diff --git a/api/app/adapters/tts/tencent_tts.py b/api/app/adapters/tts/tencent_tts.py
index 2377fa3..c00fa15 100644
--- a/api/app/adapters/tts/tencent_tts.py
+++ b/api/app/adapters/tts/tencent_tts.py
@@ -5,6 +5,7 @@ import base64
 import re
 import uuid
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 
 logger = get_logger(__name__)
@@ -180,6 +181,16 @@ class TencentTTSProvider:
         voice: str = "alloy",
         *,
         language: str = "zh",
+    ) -> bytes:
+        with business_span("tts.synthesize", provider="tencent"):
+            return await self._synthesize_inner(text, voice, language=language)
+
+    async def _synthesize_inner(
+        self,
+        text: str,
+        voice: str = "alloy",
+        *,
+        language: str = "zh",
     ) -> bytes:
         if not self._secret_id or not self._secret_key:
             logger.error(
diff --git a/api/app/agents/chat/interview_agent.py b/api/app/agents/chat/interview_agent.py
index 105598a..15293c8 100644
--- a/api/app/agents/chat/interview_agent.py
+++ b/api/app/agents/chat/interview_agent.py
@@ -38,6 +38,7 @@ from app.agents.state_schema import (
     interview_control_state,
     narrative_coverage_state,
 )
+from app.core.llm_telemetry import infer_provider_model, observe_ainvoke
 from app.core.agent_logging import (
     agent_span,
     log_agent_payload,
@@ -331,7 +332,15 @@ class InterviewAgent:
                     conversation_turn_total,
                     history_pairs_windowed,
                 )
-                response = await chat_llm.ainvoke(messages)
+                provider, model = infer_provider_model(chat_llm)
+                response = await observe_ainvoke(
+                    chat_llm,
+                    messages,
+                    agent="InterviewAgent.generate_response",
+                    provider=provider,
+                    model=model,
+                    call_type="chat",
+                )
             response_ms = (time.perf_counter() - llm_t0) * 1000
             logger.info(
                 "event=chat_llm_done agent=InterviewAgent.generate_response_with_state "
@@ -384,7 +393,15 @@ class InterviewAgent:
                         _message_contents_char_count(retry_messages),
                         conversation_id,
                     )
-                    response_retry = await chat_llm.ainvoke(retry_messages)
+                    provider, model = infer_provider_model(chat_llm)
+                    response_retry = await observe_ainvoke(
+                        chat_llm,
+                        retry_messages,
+                        agent="InterviewAgent.duplicate_guard_retry",
+                        provider=provider,
+                        model=model,
+                        call_type="chat",
+                    )
                 logger.info(
                     "event=chat_llm_done agent=InterviewAgent.duplicate_guard_retry "
                     "response_latency_ms={:.2f}",
@@ -524,7 +541,15 @@ class InterviewAgent:
                     hw.turn_total,
                     len(hw.window) // 2,
                 )
-                response = await opening_llm.ainvoke(messages)
+                provider, model = infer_provider_model(opening_llm)
+                response = await observe_ainvoke(
+                    opening_llm,
+                    messages,
+                    agent="InterviewAgent.opening",
+                    provider=provider,
+                    model=model,
+                    call_type="chat",
+                )
             logger.info(
                 "event=chat_llm_done agent=InterviewAgent.generate_opening_message "
                 "response_latency_ms={:.2f}",
@@ -643,7 +668,15 @@ class InterviewAgent:
                     len(hw.window) // 2,
                     idle_hours,
                 )
-                response = await re_greet_llm.ainvoke(messages)
+                provider, model = infer_provider_model(re_greet_llm)
+                response = await observe_ainvoke(
+                    re_greet_llm,
+                    messages,
+                    agent="InterviewAgent.re_greeting",
+                    provider=provider,
+                    model=model,
+                    call_type="chat",
+                )
             logger.info(
                 "event=chat_llm_done agent=InterviewAgent.generate_re_greeting_message "
                 "response_latency_ms={:.2f}",
diff --git a/api/app/core/agent_logging.py b/api/app/core/agent_logging.py
index e2021e0..b468a99 100644
--- a/api/app/core/agent_logging.py
+++ b/api/app/core/agent_logging.py
@@ -24,7 +24,11 @@ import time
 from contextlib import contextmanager
 from typing import Any, Iterator
 
+from opentelemetry import trace
+from opentelemetry.trace import Status, StatusCode
+
 from app.core.config import settings
+from app.core.telemetry import get_tracer
 
 _dedup_lock = threading.Lock()
 _last_prompt_sha256_by_label: dict[str, str] = {}
@@ -97,15 +101,41 @@ def agent_span(
     ctx = " ".join(f"{k}={v!r}" for k, v in context.items())
     if agent_verbose_enabled():
         logger.debug("agent_span_start {} {}", operation, ctx)
-    try:
-        yield
-    finally:
-        ms = (time.perf_counter() - t0) * 1000
+
+    def _log_end(ms: float) -> None:
         if agent_verbose_enabled():
             logger.debug("agent_span_end {} duration_ms={:.2f} {}", operation, ms, ctx)
         elif settings.log_agent_verbose:
             logger.info("agent_span {} duration_ms={:.2f} {}", operation, ms, ctx)
 
+    if settings.otel_enabled:
+        tracer = get_tracer("app.agent")
+        with tracer.start_as_current_span(
+            "agent.operation",
+            attributes={"agent.operation": operation},
+        ) as span:
+            failed = False
+            try:
+                yield
+            except Exception:
+                failed = True
+                if span.is_recording():
+                    span.set_status(Status(StatusCode.ERROR))
+                raise
+            finally:
+                ms = (time.perf_counter() - t0) * 1000
+                if span.is_recording():
+                    span.set_attribute("agent.duration_ms", round(ms, 2))
+                    if not failed:
+                        span.set_status(Status(StatusCode.OK))
+                _log_end(ms)
+        return
+
+    try:
+        yield
+    finally:
+        _log_end((time.perf_counter() - t0) * 1000)
+
 
 def log_agent_payload(
     logger: Any,
diff --git a/api/app/core/alembic_revision_repair.py b/api/app/core/alembic_revision_repair.py
index 69a3e87..2426348 100644
--- a/api/app/core/alembic_revision_repair.py
+++ b/api/app/core/alembic_revision_repair.py
@@ -9,6 +9,8 @@ _WITHDRAWN_0020_REVISIONS = frozenset(
         "0020_add_tts_audio_urls_column",
         "0020_backfill_missing_schema",
         "0020_backfill_all_missing_columns",
+        # 曾本地试运行后从仓库撤回，仅 dev 库可能残留 stamp
+        "0020_chapters_book_id",
     }
 )
 _REPAIR_TARGET_REVISION = "0018_users_language_preference"
diff --git a/api/app/core/business_telemetry.py b/api/app/core/business_telemetry.py
new file mode 100644
index 0000000..0a0488d
--- /dev/null
+++ b/api/app/core/business_telemetry.py
@@ -0,0 +1,81 @@
+"""
+业务链路 OpenTelemetry span（回忆录阶段、WS、外部依赖等）。
+"""
+
+from __future__ import annotations
+
+import time
+from contextlib import contextmanager
+from typing import Any, Iterator
+
+from opentelemetry import trace
+from opentelemetry.trace import Status, StatusCode
+
+from app.core.config import settings
+from app.core.telemetry import get_meter, get_tracer
+
+_meter = None
+_duration_hist = None
+
+# 仅低基数字段进入 span attribute（禁止 user_id / conversation_id 等）
+_ALLOWED_SPAN_ATTRS = frozenset(
+    {"provider", "chapter_category", "segment_count", "batch_size", "hours"}
+)
+
+
+def _ensure_instruments() -> None:  # Lazily create the module-level meter/histogram.
+    global _meter, _duration_hist
+    if _meter is not None or not settings.otel_enabled:  # already built, or OTel off
+        return
+    _meter = get_meter("app.business")
+    _duration_hist = _meter.create_histogram(  # durations are recorded in ms
+        "business.operation.duration",
+        unit="ms",
+        description="Business operation wall time",
+    )
+
+
+def _normalize_attr_value(value: Any) -> str | int | float | bool:
+    if isinstance(value, (str, int, float, bool)):  # primitives pass through as-is
+        return value
+    return str(value)  # anything else is stringified
+
+
+@contextmanager
+def business_span(  # Wrap a business operation in an OTel span + duration metric.
+    name: str,
+    /,
+    **attributes: Any,  # only keys in _ALLOWED_SPAN_ATTRS become span attributes
+) -> Iterator[trace.Span]:
+    if not settings.otel_enabled:
+        yield trace.INVALID_SPAN  # telemetry off: run the body with no span or metric
+        return
+
+    tracer = get_tracer("app.business")
+    otel_attrs = {
+        f"business.{k}": _normalize_attr_value(v)
+        for k, v in attributes.items()
+        if k in _ALLOWED_SPAN_ATTRS and v is not None and v != ""  # drop empty values
+    }
+    t0 = time.perf_counter()
+    outcome = "ok"
+    with tracer.start_as_current_span(name, attributes=otel_attrs) as span:
+        try:
+            yield span
+        except Exception:
+            outcome = "error"
+            if span.is_recording():
+                span.set_status(Status(StatusCode.ERROR))
+            raise  # re-raised unchanged to the caller
+        finally:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            if span.is_recording():
+                span.set_attribute("business.duration_ms", round(duration_ms, 2))
+                if outcome == "ok":
+                    span.set_status(Status(StatusCode.OK))
+            _ensure_instruments()
+            if _duration_hist is not None:
+                _duration_hist.record(  # recorded for both outcomes, tagged by operation
+                    duration_ms,
+                    {"operation": name, "outcome": outcome},
+                )
diff --git a/api/app/core/config.py b/api/app/core/config.py
index 6341f26..03a172a 100644
--- a/api/app/core/config.py
+++ b/api/app/core/config.py
@@ -223,6 +223,36 @@ class Settings(BaseSettings):
     # 非空时额外写入 JSONL（serialize=True），便于 Loki/ELK；与 stderr 彩色控制台并存
     log_json_file: str = ""
 
+    # ── OpenTelemetry ─────────────────────────────────────────
+    otel_enabled: bool = False
+    otel_exporter_otlp_endpoint: str = "http://localhost:48317"
+    otel_exporter_otlp_insecure: bool = True
+    otel_service_name: str = ""
+    otel_traces_sampler: str = Field(
+        default="always_on",
+        description="always_on | parentbased_traceidratio | always_off",
+    )
+    otel_traces_sampler_arg: float | None = Field(default=None, ge=0.0, le=1.0)
+    otel_metric_export_interval_ms: int = Field(default=10_000, ge=1000, le=300_000)
+
+    @field_validator("otel_enabled", mode="before")
+    @classmethod
+    def _coerce_otel_enabled(cls, v: object) -> bool:  # lenient parsing of raw input
+        if isinstance(v, bool):
+            return v
+        if v is None:
+            return False  # missing value means telemetry stays disabled
+        return str(v).strip().lower() in ("1", "true", "yes", "on")  # other text: False
+
+    @field_validator("otel_exporter_otlp_insecure", mode="before")
+    @classmethod
+    def _coerce_otel_exporter_otlp_insecure(cls, v: object) -> bool:
+        if isinstance(v, bool):
+            return v
+        if v is None:
+            return True  # missing value keeps the insecure default
+        return str(v).strip().lower() in ("1", "true", "yes", "on")  # other text: False
+
     @field_validator("celery_purge_broker_on_startup", mode="before")
     @classmethod
     def _coerce_celery_purge_broker_on_startup(cls, v: object) -> bool:
diff --git a/api/app/core/langchain_llm.py b/api/app/core/langchain_llm.py
index 6c09ba7..b8fd33a 100644
--- a/api/app/core/langchain_llm.py
+++ b/api/app/core/langchain_llm.py
@@ -16,6 +16,7 @@ from app.core.agent_logging import (
     agent_verbose_enabled,
     log_agent_payload,
 )
+from app.core.llm_telemetry import infer_provider_model, langchain_invoke_span
 from app.core.logging import get_logger
 
 logger = get_logger(__name__)
@@ -68,29 +69,41 @@ def invoke_json_object(
     sha = _prompt_sha12(prompt_for_api)
     attempts = 2 if retry_empty else 1
     t0 = time.perf_counter()
+    provider, model = infer_provider_model(llm)
     last_content = ""
-    for attempt in range(attempts):
-        response = bound.invoke(prompt_for_api)
-        content = (getattr(response, "content", None) or "").strip()
-        last_content = content
-        if content:
-            if attempt > 0:
-                logger.info(
-                    "json_object 空内容重试成功 agent={} prompt_sha12={}",
+    with langchain_invoke_span(
+        agent=tag,
+        provider=provider,
+        model=model,
+        call_type="json",
+        prompt_sha12=sha,
+        max_tokens=max_tokens,
+    ) as tel:
+        for attempt in range(attempts):
+            response = bound.invoke(prompt_for_api)
+            tel["response"] = response
+            content = (getattr(response, "content", None) or "").strip()
+            last_content = content
+            if content:
+                if attempt > 0:
+                    logger.info(
+                        "json_object 空内容重试成功 agent={} prompt_sha12={}",
+                        tag,
+                        sha,
+                    )
+                tel["outcome"] = "ok"
+                _log_json_object_done(
+                    tag, sha, prompt_for_api, content, attempt + 1, t0, success=True
+                )
+                return content
+            if attempt == 0 and retry_empty:
+                logger.warning(
+                    "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
                     tag,
+                    attempt,
                     sha,
                 )
-            _log_json_object_done(
-                tag, sha, prompt_for_api, content, attempt + 1, t0, success=True
-            )
-            return content
-        if attempt == 0 and retry_empty:
-            logger.warning(
-                "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
-                tag,
-                attempt,
-                sha,
-            )
+        tel["outcome"] = "error"
     logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha)
     _log_json_object_done(
         tag, sha, prompt_for_api, last_content, attempts, t0, success=False
@@ -113,29 +126,41 @@ async def ainvoke_json_object(
     sha = _prompt_sha12(prompt_for_api)
     attempts = 2 if retry_empty else 1
     t0 = time.perf_counter()
+    provider, model = infer_provider_model(llm)
     last_content = ""
-    for attempt in range(attempts):
-        response = await bound.ainvoke(prompt_for_api)
-        content = (getattr(response, "content", None) or "").strip()
-        last_content = content
-        if content:
-            if attempt > 0:
-                logger.info(
-                    "json_object 空内容重试成功 agent={} prompt_sha12={}",
+    with langchain_invoke_span(
+        agent=tag,
+        provider=provider,
+        model=model,
+        call_type="json",
+        prompt_sha12=sha,
+        max_tokens=max_tokens,
+    ) as tel:
+        for attempt in range(attempts):
+            response = await bound.ainvoke(prompt_for_api)
+            tel["response"] = response
+            content = (getattr(response, "content", None) or "").strip()
+            last_content = content
+            if content:
+                if attempt > 0:
+                    logger.info(
+                        "json_object 空内容重试成功 agent={} prompt_sha12={}",
+                        tag,
+                        sha,
+                    )
+                tel["outcome"] = "ok"
+                _log_json_object_done(
+                    tag, sha, prompt_for_api, content, attempt + 1, t0, success=True
+                )
+                return content
+            if attempt == 0 and retry_empty:
+                logger.warning(
+                    "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
                     tag,
+                    attempt,
                     sha,
                 )
-            _log_json_object_done(
-                tag, sha, prompt_for_api, content, attempt + 1, t0, success=True
-            )
-            return content
-        if attempt == 0 and retry_empty:
-            logger.warning(
-                "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
-                tag,
-                attempt,
-                sha,
-            )
+        tel["outcome"] = "error"
     logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha)
     _log_json_object_done(
         tag, sha, prompt_for_api, last_content, attempts, t0, success=False
diff --git a/api/app/core/llm_call.py b/api/app/core/llm_call.py
index 4b4ae7b..52cf767 100644
--- a/api/app/core/llm_call.py
+++ b/api/app/core/llm_call.py
@@ -30,6 +30,12 @@ from app.core.langchain_llm import (
 )
 from app.core.llm_errors import LlmHttpErrorVendor, format_llm_http_error_message
 from app.core.llm_http_openai_chat_errors import should_log_openai_error_as_warning
+from app.core.llm_telemetry import (
+    extract_token_usage,
+    infer_provider_model,
+    llm_call_span,
+    record_llm_call,
+)
 from app.core.logging import get_logger
 
 logger = get_logger(__name__)
@@ -138,14 +144,16 @@ def _invoke_raw_sync(
     max_tokens: int,
     agent: str,
     retry_empty: bool,
-) -> tuple[str, int]:
+) -> tuple[str, int, int, int]:
     prompt_for_api = ensure_json_object_prompt_has_json_keyword(prompt)
     bound = bind_json_object_mode(llm, max_tokens=max_tokens)
     tag = agent or "json_object"
     sha = _prompt_sha12(prompt_for_api)
     attempts = 2 if retry_empty else 1
+    last_in, last_out = 0, 0
     for attempt in range(attempts):
         response = bound.invoke(prompt_for_api)
+        last_in, last_out = extract_token_usage(response)
         content = (getattr(response, "content", None) or "").strip()
         if content:
             if attempt > 0:
@@ -154,7 +162,7 @@ def _invoke_raw_sync(
                     tag,
                     sha,
                 )
-            return content, attempt + 1
+            return content, attempt + 1, last_in, last_out
         if attempt == 0 and retry_empty:
             logger.warning(
                 "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
@@ -163,7 +171,7 @@ def _invoke_raw_sync(
                 sha,
             )
     logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha)
-    return "", attempts
+    return "", attempts, last_in, last_out
 
 
 async def _invoke_raw_async(
@@ -173,14 +181,16 @@ async def _invoke_raw_async(
     max_tokens: int,
     agent: str,
     retry_empty: bool,
-) -> tuple[str, int]:
+) -> tuple[str, int, int, int]:
     prompt_for_api = ensure_json_object_prompt_has_json_keyword(prompt)
     bound = bind_json_object_mode(llm, max_tokens=max_tokens)
     tag = agent or "json_object"
     sha = _prompt_sha12(prompt_for_api)
     attempts = 2 if retry_empty else 1
+    last_in, last_out = 0, 0
     for attempt in range(attempts):
         response = await bound.ainvoke(prompt_for_api)
+        last_in, last_out = extract_token_usage(response)
         content = (getattr(response, "content", None) or "").strip()
         if content:
             if attempt > 0:
@@ -189,7 +199,7 @@ async def _invoke_raw_async(
                     tag,
                     sha,
                 )
-            return content, attempt + 1
+            return content, attempt + 1, last_in, last_out
         if attempt == 0 and retry_empty:
             logger.warning(
                 "json_object 返回空 content，将重试 agent={} attempt={} prompt_sha12={}",
@@ -198,7 +208,7 @@ async def _invoke_raw_async(
                 sha,
             )
     logger.warning("json_object 仍为空 agent={} prompt_sha12={}", tag, sha)
-    return "", attempts
+    return "", attempts, last_in, last_out
 
 
 def _parse_and_validate(
@@ -252,6 +262,12 @@ def _emit_meta(
     parse_ok: bool,
     used_fallback: bool,
     error_kind: str | None,
+    provider: str,
+    model: str,
+    prompt_sha12: str,
+    input_tokens: int = 0,
+    output_tokens: int = 0,
+    span: Any | None = None,
 ) -> None:
     meta = LLMCallMeta(
         agent=agent,
@@ -263,17 +279,35 @@ def _emit_meta(
         used_fallback=used_fallback,
         error_kind=error_kind,
     )
-    logger.bind(
-        event="llm_json_call",
+    bind = {
+        "event": "llm_json_call",
+        "agent": meta.agent,
+        "schema": meta.schema_name,
+        "max_tokens": meta.max_tokens,
+        "duration_ms": round(meta.duration_ms, 2),
+        "attempts": meta.attempts,
+        "parse_ok": meta.parse_ok,
+        "used_fallback": meta.used_fallback,
+        "error_kind": meta.error_kind,
+        "provider": provider,
+        "prompt_sha12": prompt_sha12,
+    }
+    logger.bind(**bind).info("llm_json_call_done")
+    record_llm_call(
         agent=meta.agent,
-        schema=meta.schema_name,
-        max_tokens=meta.max_tokens,
-        duration_ms=round(meta.duration_ms, 2),
+        schema_name=meta.schema_name,
+        provider=provider,
+        model=model,
+        duration_ms=meta.duration_ms,
         attempts=meta.attempts,
         parse_ok=meta.parse_ok,
         used_fallback=meta.used_fallback,
         error_kind=meta.error_kind,
-    ).info("llm_json_call_done")
+        prompt_sha12=prompt_sha12,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        span=span,
+    )
 
 
 def llm_json_call(
@@ -288,13 +322,59 @@ def llm_json_call(
     http_error_vendor: LlmHttpErrorVendor = "deepseek",
 ) -> T:
     """同步：invoke → 解析 JSON → `schema.model_validate`；失败时 `fallback_factory` 或 `LLMCallError`。"""
-    t0 = time.perf_counter()
     schema_name = getattr(schema, "__name__", str(schema))
+    provider, model = infer_provider_model(llm, http_error_vendor=http_error_vendor)
+    prompt_sha12 = _prompt_sha12(prompt)
+
+    with llm_call_span(
+        agent=agent,
+        schema_name=schema_name,
+        provider=provider,
+        model=model,
+        prompt_sha12=prompt_sha12,
+        max_tokens=max_tokens,
+    ) as span:
+        return _llm_json_call_sync_body(
+            llm,
+            prompt,
+            schema,
+            max_tokens=max_tokens,
+            agent=agent,
+            fallback_factory=fallback_factory,
+            retry_empty=retry_empty,
+            http_error_vendor=http_error_vendor,
+            schema_name=schema_name,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            span=span,
+        )
+
+
+def _llm_json_call_sync_body(
+    llm: Any,
+    prompt: str,
+    schema: type[T],
+    *,
+    max_tokens: int,
+    agent: str,
+    fallback_factory: Callable[[], T] | None,
+    retry_empty: bool,
+    http_error_vendor: LlmHttpErrorVendor,
+    schema_name: str,
+    provider: str,
+    model: str,
+    prompt_sha12: str,
+    span: Any,
+) -> T:
+    t0 = time.perf_counter()
     attempts_used = 0
+    input_tokens = 0
+    output_tokens = 0
     raw = ""
 
     try:
-        raw, attempts_used = _invoke_raw_sync(
+        raw, attempts_used, input_tokens, output_tokens = _invoke_raw_sync(
             llm,
             prompt,
             max_tokens=max_tokens,
@@ -311,6 +391,12 @@ def llm_json_call(
             parse_ok=True,
             used_fallback=False,
             error_kind=None,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
@@ -331,6 +417,12 @@ def llm_json_call(
             parse_ok=False,
             used_fallback=used_fb,
             error_kind=e.kind,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
@@ -354,6 +446,12 @@ def llm_json_call(
             parse_ok=False,
             used_fallback=used_fb,
             error_kind="invoke",
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
@@ -383,13 +481,59 @@ async def allm_json_call(
     http_error_vendor: LlmHttpErrorVendor = "deepseek",
 ) -> T:
     """异步版，语义与 `llm_json_call` 一致。"""
-    t0 = time.perf_counter()
     schema_name = getattr(schema, "__name__", str(schema))
+    provider, model = infer_provider_model(llm, http_error_vendor=http_error_vendor)
+    prompt_sha12 = _prompt_sha12(prompt)
+
+    with llm_call_span(
+        agent=agent,
+        schema_name=schema_name,
+        provider=provider,
+        model=model,
+        prompt_sha12=prompt_sha12,
+        max_tokens=max_tokens,
+    ) as span:
+        return await _allm_json_call_async_body(
+            llm,
+            prompt,
+            schema,
+            max_tokens=max_tokens,
+            agent=agent,
+            fallback_factory=fallback_factory,
+            retry_empty=retry_empty,
+            http_error_vendor=http_error_vendor,
+            schema_name=schema_name,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            span=span,
+        )
+
+
+async def _allm_json_call_async_body(
+    llm: Any,
+    prompt: str,
+    schema: type[T],
+    *,
+    max_tokens: int,
+    agent: str,
+    fallback_factory: Callable[[], T] | None,
+    retry_empty: bool,
+    http_error_vendor: LlmHttpErrorVendor,
+    schema_name: str,
+    provider: str,
+    model: str,
+    prompt_sha12: str,
+    span: Any,
+) -> T:
+    t0 = time.perf_counter()
     attempts_used = 0
+    input_tokens = 0
+    output_tokens = 0
     raw = ""
 
     try:
-        raw, attempts_used = await _invoke_raw_async(
+        raw, attempts_used, input_tokens, output_tokens = await _invoke_raw_async(
             llm,
             prompt,
             max_tokens=max_tokens,
@@ -406,6 +550,12 @@ async def allm_json_call(
             parse_ok=True,
             used_fallback=False,
             error_kind=None,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
@@ -426,6 +576,12 @@ async def allm_json_call(
             parse_ok=False,
             used_fallback=used_fb,
             error_kind=e.kind,
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
@@ -449,6 +605,12 @@ async def allm_json_call(
             parse_ok=False,
             used_fallback=used_fb,
             error_kind="invoke",
+            provider=provider,
+            model=model,
+            prompt_sha12=prompt_sha12,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            span=span,
         )
         if agent_verbose_enabled():
             log_agent_payload(
diff --git a/api/app/core/llm_gateway.py b/api/app/core/llm_gateway.py
index 3e7c16d..29eadf1 100644
--- a/api/app/core/llm_gateway.py
+++ b/api/app/core/llm_gateway.py
@@ -14,6 +14,7 @@ from pydantic import BaseModel
 
 from app.core.dependencies import get_llm_provider, get_llm_provider_fast
 from app.core.llm_call import allm_json_call, llm_json_call
+from app.core.llm_telemetry import langchain_invoke_span
 
 T = TypeVar("T", bound=BaseModel)
 
@@ -58,16 +59,32 @@ class LlmGateway:
                 else 0.7
             )
         )
-        return await provider.complete(
-            messages,
+        resolved_model = model if model is not None else (use_case.model if use_case else None)
+        agent_name = use_case.name if use_case else "llm_gateway.chat"
+        kwargs = dict(
+            messages=messages,
             temperature=resolved_temperature,
-            model=model if model is not None else (use_case.model if use_case else None),
+            model=resolved_model,
             max_tokens=(
                 max_tokens
                 if max_tokens is not None
                 else (use_case.max_tokens if use_case else None)
             ),
         )
+        # DeepSeekLLMProvider.complete already wraps langchain_invoke_span; avoid a nested span here
+        from app.adapters.llm.deepseek import DeepSeekLLMProvider
+
+        if isinstance(provider, DeepSeekLLMProvider):
+            return await provider.complete(**kwargs)
+
+        provider_label = type(provider).__name__.replace("Provider", "").lower() or "unknown"
+        with langchain_invoke_span(
+            agent=agent_name,
+            provider=provider_label,
+            model=resolved_model or "unknown",
+            call_type="chat",
+        ):
+            return await provider.complete(**kwargs)
 
     async def json_object(
         self,
diff --git a/api/app/core/llm_telemetry.py b/api/app/core/llm_telemetry.py
new file mode 100644
index 0000000..6d52d55
--- /dev/null
+++ b/api/app/core/llm_telemetry.py
@@ -0,0 +1,384 @@
+"""
+LLM 调用 OpenTelemetry span 与 metrics（低基数 attributes，不含 prompt/response 正文）。
+"""
+
+from __future__ import annotations
+
+import time
+from contextlib import contextmanager
+from typing import Any, Iterator, Literal
+
+from opentelemetry import trace
+from opentelemetry.trace import Status, StatusCode
+
+from app.core.config import settings
+from app.core.telemetry import get_meter, get_tracer
+
+CallType = Literal["json", "chat", "stream"]
+
+_meter = None
+_duration_hist = None
+_call_counter = None
+_tokens_in_counter = None
+_tokens_out_counter = None
+
+
+def _ensure_instruments() -> None:
+    """Create the module-level LLM meter and instruments once; no-op when OTel is disabled."""
+    global _meter, _duration_hist, _call_counter, _tokens_in_counter, _tokens_out_counter
+    if not settings.otel_enabled or _meter is not None:
+        return
+    # Instruments are cached in module globals so repeated calls reuse them.
+    meter = get_meter("app.llm")
+    _meter = meter
+    _duration_hist = meter.create_histogram(
+        "llm.call.duration", unit="ms", description="LLM call wall time"
+    )
+    _call_counter = meter.create_counter(
+        "llm.call.total", description="LLM call count by outcome"
+    )
+    _tokens_in_counter = meter.create_counter(
+        "llm.tokens.input",
+        description="LLM input tokens when reported by provider",
+    )
+    _tokens_out_counter = meter.create_counter(
+        "llm.tokens.output",
+        description="LLM output tokens when reported by provider",
+    )
+
+
+def infer_provider_model(
+    llm: Any,
+    *,
+    http_error_vendor: str = "deepseek",
+) -> tuple[str, str]:
+    """Return ``(provider, model)`` labels for LLM telemetry.
+
+    ``provider`` is ``http_error_vendor`` stripped and lowercased ("unknown" when falsy);
+    ``model`` is the first truthy ``model_name`` / ``model`` attribute of ``llm``, else ""."""
+    candidates = (getattr(llm, name, None) for name in ("model_name", "model"))
+    # Lazy lookup: ``model`` is only read when ``model_name`` is missing or falsy.
+    model = next((str(value) for value in candidates if value), "")
+    return (http_error_vendor or "unknown").strip().lower(), model
+
+
+def _outcome_label(*, parse_ok: bool, used_fallback: bool, error_kind: str | None) -> str:
+    """Map call flags to an outcome label: "fallback", "ok", else ``error_kind`` or "error"."""
+    if used_fallback:
+        return "fallback"
+    failure_label = error_kind or "error"
+    return "ok" if parse_ok else failure_label
+
+
+def extract_token_usage(response: Any) -> tuple[int, int]:
+    """Return ``(input_tokens, output_tokens)`` read from a response object; ``(0, 0)`` if absent."""
+    usage = getattr(response, "usage_metadata", None)
+    if usage is None and hasattr(response, "response_metadata"):
+        # Fall back to the metadata mapping carried on the response.
+        meta = getattr(response, "response_metadata", None) or {}
+        if isinstance(meta, dict):
+            usage = meta.get("token_usage") or meta.get("usage")
+    if usage is None:
+        return 0, 0
+
+    def _pick(*keys: str) -> int:
+        # First truthy value among ``keys`` (dict item or attribute), else 0.
+        for key in keys:
+            found = usage.get(key) if isinstance(usage, dict) else getattr(usage, key, None)
+            if found:
+                return int(found)
+        return 0
+
+    return _pick("input_tokens", "prompt_tokens"), _pick("output_tokens", "completion_tokens")
+
+
+def record_llm_completion(
+    *,
+    agent: str,
+    provider: str,
+    model: str,
+    duration_ms: float,
+    call_type: CallType = "chat",
+    outcome: str = "ok",
+    input_tokens: int = 0,
+    output_tokens: int = 0,
+    span: trace.Span | None = None,
+    extra_span_attributes: dict[str, Any] | None = None,
+) -> None:
+    """Emit metrics and span attributes/status for one finished LLM call (no-op if OTel is off)."""
+    if not settings.otel_enabled:
+        return
+    _ensure_instruments()
+    attrs = {
+        "agent": agent,
+        "provider": provider,
+        "call_type": call_type,
+        "outcome": outcome,
+    }
+    if _duration_hist is not None:
+        _duration_hist.record(duration_ms, attrs)
+    if _call_counter is not None:
+        _call_counter.add(1, attrs)
+    if input_tokens > 0 and _tokens_in_counter is not None:
+        _tokens_in_counter.add(input_tokens, {"provider": provider, "agent": agent})
+    if output_tokens > 0 and _tokens_out_counter is not None:
+        _tokens_out_counter.add(output_tokens, {"provider": provider, "agent": agent})
+
+    if span is not None and span.is_recording():
+        span.set_attribute("llm.duration_ms", round(duration_ms, 2))
+        span.set_attribute("llm.call_type", call_type)
+        span.set_attribute("llm.outcome", outcome)
+        if input_tokens:
+            span.set_attribute("llm.tokens.input", input_tokens)
+        if output_tokens:
+            span.set_attribute("llm.tokens.output", output_tokens)
+        if extra_span_attributes:
+            for k, v in extra_span_attributes.items():
+                span.set_attribute(k, v)
+        # A Status description is only honored with ERROR (the OTel API drops it and logs a
+        # warning otherwise), so "fallback" is plain OK; ``llm.outcome`` keeps the distinction.
+        if outcome in ("ok", "fallback"):
+            span.set_status(Status(StatusCode.OK))
+        else:
+            span.set_status(Status(StatusCode.ERROR, outcome))
+
+
+@contextmanager
+def langchain_invoke_span(
+    *,
+    agent: str,
+    provider: str,
+    model: str,
+    call_type: CallType,
+    prompt_sha12: str = "",
+    max_tokens: int | None = None,
+) -> Iterator[dict[str, Any]]:
+    """
+    Wrap a LangChain invoke/ainvoke call in a span; yields a mutable dict the caller fills in.
+    Keys read on exit: response, outcome, input_tokens, output_tokens.
+    """
+    ctx: dict[str, Any] = {
+        "response": None,
+        "outcome": "ok",
+        "input_tokens": 0,
+        "output_tokens": 0,
+    }
+    if not settings.otel_enabled:
+        yield ctx
+        return
+
+    tracer = get_tracer("app.llm")
+    span_name = {
+        "json": "llm.json_invoke",
+        "chat": "llm.chat_invoke",
+        "stream": "llm.stream_invoke",
+    }.get(call_type, "llm.invoke")
+    attrs: dict[str, Any] = {
+        "llm.agent": agent,
+        "llm.provider": provider,
+        "llm.model": model or "unknown",
+        "llm.call_type": call_type,
+    }
+    if prompt_sha12:
+        attrs["llm.prompt_sha12"] = prompt_sha12
+    if max_tokens is not None:
+        attrs["llm.max_tokens"] = max_tokens
+
+    t0 = time.perf_counter()
+    with tracer.start_as_current_span(span_name, attributes=attrs) as span:
+        try:
+            yield ctx
+        except BaseException:  # also asyncio.CancelledError, which must not be recorded as "ok"
+            ctx["outcome"] = "error"
+            raise
+        finally:
+            duration_ms = (time.perf_counter() - t0) * 1000
+            resp = ctx.get("response")
+            if resp is not None and not ctx.get("input_tokens") and not ctx.get("output_tokens"):
+                inp, out = extract_token_usage(resp)
+                ctx["input_tokens"] = inp
+                ctx["output_tokens"] = out
+            record_llm_completion(
+                agent=agent,
+                provider=provider,
+                model=model,
+                duration_ms=duration_ms,
+                call_type=call_type,
+                outcome=str(ctx.get("outcome") or "ok"),
+                input_tokens=int(ctx.get("input_tokens") or 0),
+                output_tokens=int(ctx.get("output_tokens") or 0),
+                span=span,
+            )
+
+
+@contextmanager
+def llm_call_span(
+    *,
+    agent: str,
+    schema_name: str,
+    provider: str,
+    model: str,
+    prompt_sha12: str,
+    max_tokens: int,
+) -> Iterator[trace.Span]:
+    """Open the ``llm.json_call`` span; yields ``trace.INVALID_SPAN`` when OTel is disabled."""
+    if not settings.otel_enabled:
+        yield trace.INVALID_SPAN
+        return
+    # Identifiers and limits only: no prompt or response text is attached to the span.
+    span_attrs: dict[str, Any] = {
+        "llm.agent": agent,
+        "llm.schema_name": schema_name,
+        "llm.provider": provider,
+        "llm.model": model or "unknown",
+        "llm.prompt_sha12": prompt_sha12,
+        "llm.max_tokens": max_tokens,
+        "llm.call_type": "json",
+    }
+    json_tracer = get_tracer("app.llm")
+    with json_tracer.start_as_current_span("llm.json_call", attributes=span_attrs) as span:
+        yield span
+
+
+async def observe_ainvoke(
+    llm: Any,
+    messages: Any,
+    *,
+    agent: str,
+    provider: str = "deepseek",
+    model: str = "",
+    call_type: CallType = "chat",
+    extra_span_attributes: dict[str, Any] | None = None,
+    record_response_latency_ms: bool = True,
+) -> Any:
+    """Await ``llm.ainvoke(messages)`` inside an LLM invoke span and return its result."""
+    started = time.perf_counter()
+    with langchain_invoke_span(
+        agent=agent,
+        provider=provider,
+        model=model,
+        call_type=call_type,
+    ) as tel:
+        result = await llm.ainvoke(messages)
+        # Hand the response to the span helper via its shared dict.
+        tel["response"] = result
+        active_span = trace.get_current_span()
+        if not active_span.is_recording():
+            return result
+        if record_response_latency_ms:
+            elapsed_ms = (time.perf_counter() - started) * 1000
+            active_span.set_attribute("llm.response_latency_ms", round(elapsed_ms, 2))
+        # ``None`` values are skipped rather than set on the span.
+        for key, value in (extra_span_attributes or {}).items():
+            if value is not None:
+                active_span.set_attribute(key, value)
+        return result
+
+
+async def observe_astream(
+    llm: Any,
+    prompt: Any,
+    *,
+    agent: str,
+    provider: str = "deepseek",
+    model: str = "",
+):
+    """Wrap ``astream``: re-yield chunks while recording wall time and, when observed, TTFT."""
+    if not settings.otel_enabled:
+        async for chunk in llm.astream(prompt):
+            yield chunk
+        return
+
+    tracer = get_tracer("app.llm")
+    t0 = time.perf_counter()
+    ttft_ms: float | None = None
+    last_chunk: Any = None
+    with tracer.start_as_current_span(
+        "llm.stream_invoke",
+        attributes={
+            "llm.agent": agent,
+            "llm.provider": provider,
+            "llm.model": model or "unknown",
+            "llm.call_type": "stream",
+        },
+    ) as span:
+        try:
+            async for chunk in llm.astream(prompt):
+                if ttft_ms is None and getattr(chunk, "content", None):  # first chunk with content
+                    ttft_ms = (time.perf_counter() - t0) * 1000
+                last_chunk = chunk
+                yield chunk
+        except Exception:  # NOTE(review): close/cancel bypasses this; nothing is recorded then
+            duration_ms = (time.perf_counter() - t0) * 1000
+            record_llm_completion(
+                agent=agent,
+                provider=provider,
+                model=model,
+                duration_ms=duration_ms,
+                call_type="stream",
+                outcome="error",
+                span=span,
+                extra_span_attributes=(
+                    {"llm.ttft_ms": round(ttft_ms, 2)} if ttft_ms is not None else None
+                ),
+            )
+            raise
+        duration_ms = (time.perf_counter() - t0) * 1000
+        inp, out = extract_token_usage(last_chunk) if last_chunk else (0, 0)  # final chunk only
+        extra: dict[str, Any] = {}
+        if ttft_ms is not None:
+            extra["llm.ttft_ms"] = round(ttft_ms, 2)
+        record_llm_completion(
+            agent=agent,
+            provider=provider,
+            model=model,
+            duration_ms=duration_ms,
+            call_type="stream",
+            outcome="ok",
+            input_tokens=inp,
+            output_tokens=out,
+            span=span,
+            extra_span_attributes=extra or None,
+        )
+
+
+def record_llm_call(
+    *,
+    agent: str,
+    schema_name: str,
+    provider: str,
+    model: str,
+    duration_ms: float,
+    attempts: int,
+    parse_ok: bool,
+    used_fallback: bool,
+    error_kind: str | None,
+    prompt_sha12: str,
+    input_tokens: int = 0,
+    output_tokens: int = 0,
+    span: trace.Span | None = None,
+) -> None:
+    """Record a JSON LLM call; the outcome label is derived from the parse/fallback flags."""
+    label = _outcome_label(parse_ok=parse_ok, used_fallback=used_fallback, error_kind=error_kind)
+    span_extras: dict[str, Any] = {
+        "llm.schema_name": schema_name,
+        "llm.attempts": attempts,
+        "llm.parse_ok": parse_ok,
+        "llm.used_fallback": used_fallback,
+    }
+    if error_kind:
+        span_extras["llm.error_kind"] = error_kind
+    if prompt_sha12:
+        span_extras["llm.prompt_sha12"] = prompt_sha12
+    record_llm_completion(
+        agent=agent,
+        provider=provider,
+        model=model,
+        duration_ms=duration_ms,
+        call_type="json",
+        outcome=label,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        span=span,
+        extra_span_attributes=span_extras,
+    )
diff --git a/api/app/core/logging.py b/api/app/core/logging.py
index 18175bd..b065ce9 100644
--- a/api/app/core/logging.py
+++ b/api/app/core/logging.py
@@ -108,10 +108,32 @@ def _stdlib_emit_display(log_record: logging.LogRecord) -> tuple[str, int]:
     return fn, ln
 
 
+def _merge_trace_context(record: Any) -> None:
+    """Copy current OTel trace-context fields into unset ``extra`` slots of a log record."""
+    try:
+        from app.core.telemetry import current_trace_context
+
+        trace_fields = current_trace_context()
+    except Exception:
+        # Any failure while resolving the trace context leaves the record untouched.
+        return
+    if not trace_fields:
+        return
+    extra = record["extra"]
+    for name, value in trace_fields.items():
+        existing = extra.get(name)
+        # Values already bound are kept; only missing, empty or "-" slots are filled.
+        if value and (existing is None or str(existing).strip() in ("", "-")):
+            extra[name] = value
+
+
 def _stderr_format(record: Any) -> str:
-    """控制台 sink：request_id / correlation_id / user_id 有值时才显示对应列。"""
+    """控制台 sink：request_id / correlation_id / user_id / trace_id 有值时才显示对应列。"""
     rid = str(record["extra"].get("request_id") or "").strip()
     rid_part = "<dim>rid={extra[request_id]}</dim> | " if rid and rid != "-" else ""
+    tid = str(record["extra"].get("trace_id") or "").strip()
+    tid_short = tid[:12] if len(tid) > 12 else tid
+    tid_part = f"<dim>tid={tid_short}</dim> | " if tid else ""
     cid = str(record["extra"].get("correlation_id") or "").strip()
     cid_part = "<dim>corr={extra[correlation_id]}</dim> | " if cid else ""
     uid = str(record["extra"].get("user_id") or "").strip()
@@ -120,7 +142,7 @@ def _stderr_format(record: Any) -> str:
         "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
         "<level>{level.name: <8}</level> | "
         "<cyan>{extra[module]}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
-        f"{rid_part}{cid_part}{uid_part}"
+        f"{rid_part}{tid_part}{cid_part}{uid_part}"
         "<level>{message}</level>\n{exception}"
     )
 
@@ -242,8 +264,8 @@ def setup_logging() -> None:
             enqueue=True,
         )
 
-    logger.configure(extra={"request_id": "-", "module": "-"})
-    logger = logger.patch(_merge_celery_worker_extra)
+    logger.configure(extra={"request_id": "-", "module": "-", "trace_id": "", "span_id": ""})
+    logger = logger.patch(_merge_celery_worker_extra).patch(_merge_trace_context)
 
     # 仅 root 挂 InterceptHandler，避免子 logger 与 root 各处理一次导致重复行
     root = logging.getLogger()
diff --git a/api/app/core/middleware.py b/api/app/core/middleware.py
index 9bdcdc0..c7a708d 100644
--- a/api/app/core/middleware.py
+++ b/api/app/core/middleware.py
@@ -8,6 +8,7 @@ from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.requests import Request
 
 from app.core.logging import logger
+from app.core.telemetry import current_trace_context
 
 
 class RequestIdMiddleware(BaseHTTPMiddleware):
@@ -17,7 +18,8 @@ class RequestIdMiddleware(BaseHTTPMiddleware):
         request_id = request.headers.get("X-Request-ID") or str(uuid.uuid4())
         request.state.request_id = request_id
 
-        with logger.contextualize(request_id=request_id):
+        bind = {"request_id": request_id, **current_trace_context()}
+        with logger.contextualize(**bind):
             response = await call_next(request)
 
         response.headers["X-Request-ID"] = request_id
diff --git a/api/app/core/telemetry.py b/api/app/core/telemetry.py
new file mode 100644
index 0000000..21b45a4
--- /dev/null
+++ b/api/app/core/telemetry.py
@@ -0,0 +1,146 @@
+"""
+OpenTelemetry 初始化：traces / metrics / logs 导出至 OTLP Collector。
+
+在 ``setup_logging()`` 之后、FastAPI / Celery 应用创建前调用 ``setup_telemetry(service_name=...)``。
+``OTEL_ENABLED=false`` 时无操作，便于测试与无 Collector 环境。
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from opentelemetry import metrics, trace
+from opentelemetry._logs import set_logger_provider
+from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
+from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.celery import CeleryInstrumentor
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+from opentelemetry.instrumentation.logging import LoggingInstrumentor
+from opentelemetry.instrumentation.redis import RedisInstrumentor
+from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
+from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
+
+from app.core.config import settings
+
+if TYPE_CHECKING:
+    from fastapi import FastAPI
+
+_initialized = False
+_otel_logging_handler: LoggingHandler | None = None
+
+
+def _build_resource(service_name: str) -> Resource:
+    """Build the Resource (service name, deployment environment, version) for ``service_name``."""
+    attributes = {
+        "service.name": service_name,
+        "deployment.environment": settings.app_environment,
+        "service.version": "0.2.0",
+    }
+    return Resource.create(attributes)
+
+
+def _build_sampler():
+    """Map ``settings.otel_traces_sampler`` to an SDK sampler (unknown names -> parent-based ratio)."""
+    from opentelemetry.sdk.trace import sampling
+    name = (settings.otel_traces_sampler or "always_on").strip().lower()
+    if name in ("always_on", "alwayson"):
+        return sampling.ALWAYS_ON
+    if name in ("always_off", "alwaysoff"):
+        return sampling.ALWAYS_OFF
+    if name == "parentbased_always_on":  # standard OTel name; used to fall through to ratio sampling
+        return sampling.DEFAULT_ON
+    if name == "parentbased_always_off":  # likewise: honour the parent, never sample new roots
+        return sampling.DEFAULT_OFF
+    arg = settings.otel_traces_sampler_arg
+    ratio = 0.1 if arg is None else arg  # default ratio when no sampler argument is configured
+    if name == "traceidratio":
+        return sampling.TraceIdRatioBased(ratio)
+    return ParentBasedTraceIdRatio(ratio)  # "parentbased_traceidratio" and any unrecognised name
+
+
+def setup_telemetry(*, service_name: str) -> None:
+    """Configure the OTLP exporters and auto-instrumentation (idempotent)."""
+    global _initialized, _otel_logging_handler
+    if _initialized or not settings.otel_enabled:  # already configured, or OTel disabled
+        return
+
+    endpoint = settings.otel_exporter_otlp_endpoint.rstrip("/")
+    insecure = settings.otel_exporter_otlp_insecure
+
+    resource = _build_resource(service_name)
+
+    span_exporter = OTLPSpanExporter(endpoint=endpoint, insecure=insecure)
+    tracer_provider = TracerProvider(resource=resource, sampler=_build_sampler())
+    tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
+    trace.set_tracer_provider(tracer_provider)  # traces: register the global provider
+
+    metric_exporter = OTLPMetricExporter(endpoint=endpoint, insecure=insecure)
+    metric_reader = PeriodicExportingMetricReader(
+        metric_exporter,
+        export_interval_millis=settings.otel_metric_export_interval_ms,
+    )
+    meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
+    metrics.set_meter_provider(meter_provider)  # metrics: register the global provider
+
+    log_exporter = OTLPLogExporter(endpoint=endpoint, insecure=insecure)
+    log_provider = LoggerProvider(resource=resource)
+    log_provider.add_log_record_processor(BatchLogRecordProcessor(log_exporter))
+    set_logger_provider(log_provider)  # logs: register the global provider
+
+    LoggingInstrumentor().instrument(set_logging_format=True)
+    _otel_logging_handler = LoggingHandler(
+        level=logging.NOTSET,
+        logger_provider=log_provider,
+    )
+    logging.getLogger().addHandler(_otel_logging_handler)  # root stdlib logger -> OTLP log provider
+
+    HTTPXClientInstrumentor().instrument()
+    RedisInstrumentor().instrument()
+    SQLAlchemyInstrumentor().instrument()
+
+    _initialized = True  # set only after every step above has run
+
+
+def instrument_fastapi_app(app: FastAPI) -> None:
+    """Attach OpenTelemetry request instrumentation to ``app``.
+
+    ``/health`` is passed as the excluded URL; nothing happens when OTel is disabled.
+    """
+    if settings.otel_enabled:
+        FastAPIInstrumentor.instrument_app(app, excluded_urls="/health")
+
+
+def instrument_celery() -> None:
+    """Enable the Celery instrumentor; does nothing when OTel is disabled."""
+    if settings.otel_enabled:
+        CeleryInstrumentor().instrument()
+
+
+def get_tracer(name: str):  # convenience wrapper so callers need not import opentelemetry.trace
+    return trace.get_tracer(name)
+
+
+def get_meter(name: str):  # convenience wrapper so callers need not import opentelemetry.metrics
+    return metrics.get_meter(name)
+
+
+def current_trace_context() -> dict[str, str]:
+    """Return the current span's trace_id / span_id as hex strings; ``{}`` if its context is invalid."""
+    active_span = trace.get_current_span()
+    span_context = active_span.get_span_context()
+    if span_context.is_valid:
+        return {
+            "trace_id": format(span_context.trace_id, "032x"),
+            "span_id": format(span_context.span_id, "016x"),
+        }
+    return {}
diff --git a/api/app/features/conversation/ws/pipeline.py b/api/app/features/conversation/ws/pipeline.py
index bfeeca6..0745491 100644
--- a/api/app/features/conversation/ws/pipeline.py
+++ b/api/app/features/conversation/ws/pipeline.py
@@ -20,6 +20,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.agents.chat import ChatOrchestrator
 from app.agents.chat.reply_limits import segments_from_llm_response
 from app.core.agent_logging import agent_summary_enabled
+from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.cos_url_keys import (
     TTS_PRESIGNED_EXPIRES_SEC,
@@ -634,6 +635,12 @@ def _split_audio_bytes(audio_bytes: bytes, fmt: str) -> list[bytes]:
 async def _transcribe_long_audio(audio_bytes: bytes, fmt: str = "m4a") -> str:
     """超过 55 s 的音频自动切片后并行 ASR；短音频直接转写。"""
     asr = get_asr_provider()
+    return await _transcribe_long_audio_inner(audio_bytes, fmt, asr)  # provider resolved here, passed in
+
+
+async def _transcribe_long_audio_inner(
+    audio_bytes: bytes, fmt: str, asr: Any
+) -> str:
     try:
         chunks = await asyncio.to_thread(_split_audio_bytes, audio_bytes, fmt)
     except Exception as exc:
@@ -938,6 +945,32 @@ async def process_user_message(
     tts_this_turn: Optional[bool] = None,
 ) -> None:
     """处理用户消息，生成 Agent 回应。由 ChatOrchestrator 路由到 ProfileAgent 或 InterviewAgent。"""
+    with business_span("conversation.ws.process_turn"):
+        await _process_user_message_inner(
+            conversation_id,
+            user_message,
+            conversation,
+            segment,
+            db,
+            user,
+            user_message_timestamp,
+            force_skip_tts=force_skip_tts,
+            tts_this_turn=tts_this_turn,
+        )
+
+
+async def _process_user_message_inner(
+    conversation_id: str,
+    user_message: str,
+    conversation: Conversation,
+    segment: Segment,
+    db: AsyncSession,
+    user: User = None,
+    user_message_timestamp: Optional[datetime] = None,
+    *,
+    force_skip_tts: bool = False,
+    tts_this_turn: Optional[bool] = None,
+) -> None:
     store = ConversationHistoryStore(db)
     tts_urls: list[str] = []
     user_language = _resolve_user_language(user)
diff --git a/api/app/features/evaluation/judge_service.py b/api/app/features/evaluation/judge_service.py
index e4b89fe..d9f9929 100644
--- a/api/app/features/evaluation/judge_service.py
+++ b/api/app/features/evaluation/judge_service.py
@@ -445,7 +445,16 @@ class EvalJudgeService:
         if hasattr(llm, "bind"):
             llm = llm.bind(max_tokens=_COMPARE_STREAM_MAX)
         try:
-            async for chunk in llm.astream(prompt):
+            from app.core.llm_telemetry import infer_provider_model, observe_astream
+
+            provider, model = infer_provider_model(llm, http_error_vendor="zhipu")
+            async for chunk in observe_astream(
+                llm,
+                prompt,
+                agent="EvalJudge.stream_conversation_compare",
+                provider=provider,
+                model=model,
+            ):
                 piece = getattr(chunk, "content", None)
                 if piece:
                     yield piece
diff --git a/api/app/features/memoir/story_pipeline_sync.py b/api/app/features/memoir/story_pipeline_sync.py
index f4cc3f6..dc127d6 100644
--- a/api/app/features/memoir/story_pipeline_sync.py
+++ b/api/app/features/memoir/story_pipeline_sync.py
@@ -27,6 +27,7 @@ from app.agents.memoir.story_route_agent import (
     StoryRouteAgent,
     default_append_target_story_id,
 )
+from app.core.business_telemetry import business_span
 from app.agents.stage_constants import (
     CATEGORY_TO_CHAT_STAGE,
     CHAPTER_CATEGORIES,
@@ -996,6 +997,46 @@ def run_story_pipeline_for_category_batch(
 
     返回 :class:`StoryPipelineResult`。低置信路由会被延迟而不创建 Story/Chapter。
     """
+    with business_span(
+        "memoir.story_pipeline.batch",
+        chapter_category=chapter_category,
+        segment_count=len(category_segments),
+    ):
+        return _run_story_pipeline_batch_inner(
+            session,
+            user_id=user_id,
+            chapter_category=chapter_category,
+            category_segments=category_segments,
+            state=state,
+            user_profile=user_profile,
+            user_birth_year=user_birth_year,
+            llm=llm,
+            background_voice=background_voice,
+            occupation=occupation,
+            memoir_correlation_id=memoir_correlation_id,
+            llm_fast=llm_fast,
+            memory_evidence=memory_evidence,
+            language=language,
+        )
+
+
+def _run_story_pipeline_batch_inner(
+    session: Session,
+    *,
+    user_id: str,
+    chapter_category: str,
+    category_segments: list,
+    state: MemoirStateSchema,
+    user_profile: str,
+    user_birth_year: int | None,
+    llm: Any,
+    background_voice: str = "default",
+    occupation: str = "",
+    memoir_correlation_id: str | None = None,
+    llm_fast: Any | None = None,
+    memory_evidence: dict | None = None,
+    language: str = "zh",
+) -> StoryPipelineResult:
     pipeline_phase_timings: dict[str, float] = {}
     narrative_agent = NarrativeAgent()
     route_agent = StoryRouteAgent()
@@ -1013,9 +1054,10 @@ def run_story_pipeline_for_category_batch(
         top_k = int(settings.evidence_top_k_large_batch)
 
     def _oral_job() -> tuple[str, float]:
-        t_oral = time.perf_counter()
-        out = normalize_oral_for_memoir(combined_text, llm=llm)
-        return out, time.perf_counter() - t_oral
+        # NOTE(review): presumably run on the ThreadPoolExecutor below; parent span context may not propagate — confirm
+        with business_span("memoir.story_pipeline.oral_normalize"):
+            t_oral = time.perf_counter()
+            out = normalize_oral_for_memoir(combined_text, llm=llm)
+            return out, time.perf_counter() - t_oral
 
     _t_parallel = time.perf_counter()
     with ThreadPoolExecutor(max_workers=1) as pool:
@@ -1045,7 +1087,8 @@ def run_story_pipeline_for_category_batch(
         top_k,
     )
 
-    evidence_text = format_evidence_chunks_for_prompt(evidence)
+    with business_span("memoir.story_pipeline.evidence_prep", chapter_category=chapter_category):
+        evidence_text = format_evidence_chunks_for_prompt(evidence)
     ct_raw = (combined_text or "").strip()
     om_norm = (oral_for_memoir or "").strip()
     if ct_raw != om_norm:
@@ -1099,35 +1142,36 @@ def run_story_pipeline_for_category_batch(
     calculated_order_index = STAGE_TO_ORDER.get(chapter_category, 999)
 
     _t0 = time.perf_counter()
-    use_batch_plan = (
-        llm_route
-        and len(category_segments) >= 2
-        and len(category_segments) <= PLAN_BATCH_MAX_SEGMENTS
-    )
-    plan: StoryBatchPlan | None = None
-    if use_batch_plan:
-        segs = _route_segment_texts(category_segments)
-        plan = route_agent.plan_batch(
-            chapter_category=chapter_category,
-            chapter_title=title,
-            segments=segs,
-            candidate_stories=candidates,
-            llm=llm_route,
-            valid_story_ids=valid_ids,
-            story_meta=story_meta,
+    with business_span("memoir.story_pipeline.route", chapter_category=chapter_category):
+        use_batch_plan = (
+            llm_route
+            and len(category_segments) >= 2
+            and len(category_segments) <= PLAN_BATCH_MAX_SEGMENTS
         )
+        plan: StoryBatchPlan | None = None
+        if use_batch_plan:
+            segs = _route_segment_texts(category_segments)
+            plan = route_agent.plan_batch(
+                chapter_category=chapter_category,
+                chapter_title=title,
+                segments=segs,
+                candidate_stories=candidates,
+                llm=llm_route,
+                valid_story_ids=valid_ids,
+                story_meta=story_meta,
+            )
 
-    single_route: Any = None
-    if plan is None:
-        single_route = route_agent.decide(
-            chapter_category=chapter_category,
-            chapter_title=title,
-            batch_transcript=route_transcript,
-            candidate_stories=candidates,
-            llm=llm_route,
-            valid_story_ids=valid_ids,
-            story_meta=story_meta,
-        )
+        single_route: Any = None
+        if plan is None:
+            single_route = route_agent.decide(
+                chapter_category=chapter_category,
+                chapter_title=title,
+                batch_transcript=route_transcript,
+                candidate_stories=candidates,
+                llm=llm_route,
+                valid_story_ids=valid_ids,
+                story_meta=story_meta,
+            )
     pipeline_phase_timings["route"] = time.perf_counter() - _t0
 
     if (
@@ -1166,89 +1210,91 @@ def run_story_pipeline_for_category_batch(
     )
 
     _t0 = time.perf_counter()
-    if plan is not None:
-        dispatch_ids = _run_batch_plan_writes(
-            session,
-            plan=plan,
-            category_segments=category_segments,
-            chapter=chapter,
-            chapter_category=chapter_category,
-            evidence_text=evidence_text,
-            evidence=evidence,
-            evidence_top_k=top_k,
-            slot_snippets=slot_snippets,
-            user_id=user_id,
-            user_profile=user_profile,
-            user_birth_year=user_birth_year,
-            llm=llm,
-            narrative_agent=narrative_agent,
-            candidate_stories=candidates,
-            story_meta=story_meta,
-            background_voice=background_voice,
-            occupation=occupation,
-            memoir_correlation_id=memoir_correlation_id,
-            fidelity_llm=llm_fidelity,
-            language=language,
-        )
-    else:
-        route = single_route
-        decision_source = (
-            route.reason
-            if route.reason in FALLBACK_NEW_STORY_REASONS
-            else ("fallback_no_llm" if not llm_route else "single_decide")
-        )
-        target_story_id, existing_for_narrative, decision_source = (
-            _resolve_append_target(
+    with business_span("memoir.story_pipeline.narrative_writes", chapter_category=chapter_category):
+        if plan is not None:
+            dispatch_ids = _run_batch_plan_writes(
                 session,
-                route_decision=route.decision,
-                route_target_story_id=route.target_story_id,
-                user_id=user_id,
+                plan=plan,
+                category_segments=category_segments,
+                chapter=chapter,
                 chapter_category=chapter_category,
-                oral_norm=om_norm,
+                evidence_text=evidence_text,
+                evidence=evidence,
+                evidence_top_k=top_k,
+                slot_snippets=slot_snippets,
+                user_id=user_id,
+                user_profile=user_profile,
+                user_birth_year=user_birth_year,
+                llm=llm,
+                narrative_agent=narrative_agent,
                 candidate_stories=candidates,
                 story_meta=story_meta,
-                decision_source=decision_source,
+                background_voice=background_voice,
+                occupation=occupation,
                 memoir_correlation_id=memoir_correlation_id,
+                fidelity_llm=llm_fidelity,
+                language=language,
+            )
+        else:
+            route = single_route
+            decision_source = (
+                route.reason
+                if route.reason in FALLBACK_NEW_STORY_REASONS
+                else ("fallback_no_llm" if not llm_route else "single_decide")
+            )
+            target_story_id, existing_for_narrative, decision_source = (
+                _resolve_append_target(
+                    session,
+                    route_decision=route.decision,
+                    route_target_story_id=route.target_story_id,
+                    user_id=user_id,
+                    chapter_category=chapter_category,
+                    oral_norm=om_norm,
+                    candidate_stories=candidates,
+                    story_meta=story_meta,
+                    decision_source=decision_source,
+                    memoir_correlation_id=memoir_correlation_id,
+                )
             )
-        )
 
-        sid, _ = _execute_narrative_unit(
-            session,
-            oral_text=oral_for_memoir,
-            evidence_text=evidence_text,
-            evidence=evidence,
-            evidence_top_k=top_k,
-            chapter=chapter,
-            chapter_category=chapter_category,
-            slot_snippets=slot_snippets,
-            user_id=user_id,
-            user_profile=user_profile,
-            user_birth_year=user_birth_year,
-            llm=llm,
-            narrative_agent=narrative_agent,
-            target_story_id=target_story_id,
-            existing_for_narrative=existing_for_narrative,
-            decision_source=decision_source,
-            route_decision=route.decision,
-            route_type="single",
-            segment_ids=[str(s.id) for s in category_segments],
-            category_segments=category_segments,
-            background_voice=background_voice,
-            occupation=occupation,
-            memoir_correlation_id=memoir_correlation_id,
-            fidelity_llm=llm_fidelity,
-            language=language,
-        )
-        if sid:
-            dispatch_ids.add(sid)
+            sid, _ = _execute_narrative_unit(
+                session,
+                oral_text=oral_for_memoir,
+                evidence_text=evidence_text,
+                evidence=evidence,
+                evidence_top_k=top_k,
+                chapter=chapter,
+                chapter_category=chapter_category,
+                slot_snippets=slot_snippets,
+                user_id=user_id,
+                user_profile=user_profile,
+                user_birth_year=user_birth_year,
+                llm=llm,
+                narrative_agent=narrative_agent,
+                target_story_id=target_story_id,
+                existing_for_narrative=existing_for_narrative,
+                decision_source=decision_source,
+                route_decision=route.decision,
+                route_type="single",
+                segment_ids=[str(s.id) for s in category_segments],
+                category_segments=category_segments,
+                background_voice=background_voice,
+                occupation=occupation,
+                memoir_correlation_id=memoir_correlation_id,
+                fidelity_llm=llm_fidelity,
+                language=language,
+            )
+            if sid:
+                dispatch_ids.add(sid)
 
     pipeline_phase_timings["narrative_writes"] = time.perf_counter() - _t0
 
     _t0 = time.perf_counter()
-    reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id))
-    mark_chapter_dirty_sync(session, str(chapter.id))
-    session.flush()
-    refresh_chapter_evidence_snapshot_with_retry_sync(session, str(chapter.id))
+    with business_span("memoir.story_pipeline.finalize", chapter_category=chapter_category):
+        reorder_chapter_story_links_by_life_order_sync(session, str(chapter.id))
+        mark_chapter_dirty_sync(session, str(chapter.id))
+        session.flush()
+        refresh_chapter_evidence_snapshot_with_retry_sync(session, str(chapter.id))
     pipeline_phase_timings["finalize"] = time.perf_counter() - _t0
 
     image_settings = MemoirImageSettings.from_env()
diff --git a/api/app/features/payment/alipay_client.py b/api/app/features/payment/alipay_client.py
index 8f4b0f7..dad2b0a 100644
--- a/api/app/features/payment/alipay_client.py
+++ b/api/app/features/payment/alipay_client.py
@@ -4,6 +4,7 @@
 
 from typing import Dict
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 from app.features.payment.payment_config import AlipayConfig
 from app.features.payment.payment_exceptions import (
@@ -46,6 +47,15 @@ class AlipayClient:
         out_trade_no: str,
         total_amount: int,
         subject: str,
+    ) -> PaymentResult:
+        with business_span("payment.alipay.create_app_order"):
+            return self._create_app_order_inner(out_trade_no, total_amount, subject)
+
+    def _create_app_order_inner(
+        self,
+        out_trade_no: str,
+        total_amount: int,
+        subject: str,
     ) -> PaymentResult:
         self._ensure_client()
         try:
@@ -100,6 +110,10 @@ class AlipayClient:
             raise PaymentNotifyError(f"支付宝回调处理失败: {e}")
 
     def query_order(self, out_trade_no: str) -> PaymentStatus:
+        with business_span("payment.alipay.query_order"):  # telemetry wrapper; logic is in _query_order_inner
+            return self._query_order_inner(out_trade_no)
+
+    def _query_order_inner(self, out_trade_no: str) -> PaymentStatus:
         self._ensure_client()
         try:
             result = self._client.api_alipay_trade_query(out_trade_no=out_trade_no)
diff --git a/api/app/features/payment/wechat_client.py b/api/app/features/payment/wechat_client.py
index 3268628..89b9ec6 100644
--- a/api/app/features/payment/wechat_client.py
+++ b/api/app/features/payment/wechat_client.py
@@ -7,6 +7,7 @@ import os
 import time
 from typing import Dict
 
+from app.core.business_telemetry import business_span
 from app.core.logging import get_logger
 from app.features.payment.payment_config import WeChatPayConfig
 from app.features.payment.payment_exceptions import (
@@ -149,6 +150,15 @@ class WeChatPayClient:
         out_trade_no: str,
         total_amount: int,
         description: str,
+    ) -> PaymentResult:
+        with business_span("payment.wechat.create_app_order"):
+            return self._create_app_order_inner(out_trade_no, total_amount, description)
+
+    def _create_app_order_inner(
+        self,
+        out_trade_no: str,
+        total_amount: int,
+        description: str,
     ) -> PaymentResult:
         self._ensure_client()
         try:
@@ -217,6 +227,10 @@ class WeChatPayClient:
             raise PaymentNotifyError(f"微信支付回调处理失败: {e}")
 
     def query_order(self, out_trade_no: str) -> PaymentStatus:
+        with business_span("payment.wechat.query_order"):  # telemetry wrapper; logic is in _query_order_inner
+            return self._query_order_inner(out_trade_no)
+
+    def _query_order_inner(self, out_trade_no: str) -> PaymentStatus:
         self._ensure_client()
         try:
             code, message = self._client.query(out_trade_no=out_trade_no)
diff --git a/api/app/internal_main.py b/api/app/internal_main.py
index 179b4fb..55e8354 100644
--- a/api/app/internal_main.py
+++ b/api/app/internal_main.py
@@ -14,12 +14,18 @@ from app.core.logging import get_logger, setup_logging
 
 setup_logging()
 
+from app.core.config import settings
+from app.core.telemetry import instrument_fastapi_app, setup_telemetry
+
+setup_telemetry(
+    service_name=settings.otel_service_name or "life-echo-internal-api",
+)
+
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 
-from app.core.config import settings
 from app.core.errors import register_exception_handlers
 from app.core.middleware import RequestIdMiddleware
 from app.features.evaluation import models as _eval_models  # noqa: F401
@@ -35,6 +41,8 @@ internal_app = FastAPI(
     openapi_url="/openapi.json" if settings.internal_eval_enable_docs else None,
 )
 
+instrument_fastapi_app(internal_app)
+
 internal_app.add_middleware(RequestIdMiddleware)
 _origins = [
     o.strip()
@@ -66,7 +74,7 @@ async def internal_eval_landing():
 <body style="font-family:system-ui,sans-serif;max-width:44rem;margin:2rem auto;line-height:1.5">
 <h1>Life Echo · 内部回归评测 API</h1>
 <p>这里是 <strong>HTTP API</strong>（端口由启动命令决定），<strong>没有内置网页</strong>。
-浏览「回归评测台」请在仓库执行 <code>./internal-eval.sh</code> 或 <code>cd app-eval-web && npm run dev</code>，
+浏览「回归评测台」请在仓库执行 <code>./development.sh</code> 或 <code>cd app-eval-web && npm run dev</code>，
 在终端里打开 Vite 给出的地址（一般为 <strong>http://127.0.0.1:5174/</strong>）。</p>
 <p>健康检查：<a href="/health">/health</a></p>
 {docs_hint}
diff --git a/api/app/main.py b/api/app/main.py
index a9ae9dd..18a7b65 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -8,11 +8,17 @@ from app.core.logging import get_logger, setup_logging
 
 setup_logging()
 
+from app.core.config import settings
+from app.core.telemetry import instrument_fastapi_app, setup_telemetry
+
+setup_telemetry(
+    service_name=settings.otel_service_name or "life-echo-api",
+)
+
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 
-from app.core.config import settings
 from app.core.errors import register_exception_handlers
 from app.core.middleware import RequestIdMiddleware
 from app.core.openapi import custom_openapi
@@ -46,6 +52,8 @@ app = FastAPI(
     openapi_url="/openapi.json" if settings.enable_docs else None,
 )
 
+instrument_fastapi_app(app)
+
 # OpenAPI 全局增强
 app.openapi = lambda: custom_openapi(app)  # type: ignore[assignment]
 
diff --git a/api/app/tasks/celery_app.py b/api/app/tasks/celery_app.py
index a8a8ad3..62a439c 100644
--- a/api/app/tasks/celery_app.py
+++ b/api/app/tasks/celery_app.py
@@ -14,11 +14,17 @@ from app.core.logging import get_logger, setup_logging
 # 与 app.main 一致：先配置 loguru + InterceptHandler，再加载会打日志的依赖
 setup_logging()
 
+from app.core.config import settings
+from app.core.telemetry import instrument_celery, setup_telemetry
+
+# The worker shares .env with the API, so service.name is pinned here; do not read OTEL_SERVICE_NAME (reserved for the main / internal API).
+setup_telemetry(service_name="life-echo-celery-worker")  # NOTE(review): import-time init; for prefork pools consider worker_process_init — confirm
+instrument_celery()
+
 from celery import Celery
 from celery.signals import task_failure, task_postrun, task_prerun, task_success
 
 from app.core.celery_log_context import clear_celery_log_extras, set_celery_log_extras
-from app.core.config import settings
 from app.core.log_events import celery_prerun_extras
 from app.features.asset import models as _asset_models  # noqa: F401 - register Asset
 from app.features.auth import models as _auth_models  # noqa: F401
@@ -123,9 +129,12 @@ def _log_task_prerun(
     **_: object,
 ) -> None:
     name = getattr(task, "name", None) or "?"
+    from app.core.telemetry import current_trace_context
+
     extras = celery_prerun_extras(name, tuple(args or ()), dict(kwargs or {}))
     if task_id:
         extras["task_id"] = str(task_id).strip()
+    extras.update(current_trace_context())
     set_celery_log_extras(extras if extras else None)
     _celery_lifecycle_log.info(
         "event=celery_task_start task={} task_id={} msg=Celery 任务已开始",
diff --git a/api/app/tasks/memoir_tasks.py b/api/app/tasks/memoir_tasks.py
index 90f9963..0d6b6da 100644
--- a/api/app/tasks/memoir_tasks.py
+++ b/api/app/tasks/memoir_tasks.py
@@ -26,6 +26,7 @@ from app.core.chapter_pipeline_lock import (
 from app.core.chapter_pipeline_lock import (
     release_chapter_pipeline_lock as _release_chapter_lock,
 )
+from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal, get_sync_db
 from app.core.dependencies import get_embedding_provider
@@ -614,7 +615,10 @@ def process_memoir_phase2(
         },
     )
     try:
-        with get_sync_db() as db:
+        with business_span(
+            "memoir.phase2",
+            chapter_category=chapter_category,
+        ), get_sync_db() as db:
             user_convs = select(Conversation.id).where(
                 Conversation.user_id == user_id,
                 Conversation.deleted_at.is_(None),
@@ -691,9 +695,13 @@ def process_memoir_phase2(
             affected_chapter_ids: Set[str] = set()
 
             lock_t0 = time.perf_counter()
-            lock_handle = _acquire_chapter_lock(
-                user_id, chapter_category, ttl_seconds=_chapter_lock_ttl()
-            )
+            with business_span(
+                "memoir.phase2.lock",
+                chapter_category=chapter_category,
+            ):
+                lock_handle = _acquire_chapter_lock(
+                    user_id, chapter_category, ttl_seconds=_chapter_lock_ttl()
+                )
             lock_elapsed = time.perf_counter() - lock_t0
             if lock_handle is None:
                 logger.warning(
@@ -746,22 +754,26 @@ def process_memoir_phase2(
                         "relevant_stories": [],
                     }
                 pipeline_t0 = time.perf_counter()
-                pipeline_result = run_story_pipeline_for_category_batch(
-                    db,
-                    user_id=user_id,
+                with business_span(
+                    "memoir.phase2.story_pipeline",
                     chapter_category=chapter_category,
-                    category_segments=category_segments,
-                    state=state,
-                    user_profile=user_profile,
-                    user_birth_year=user_birth_year,
-                    llm=llm,
-                    background_voice=background_voice,
-                    occupation=user_occupation,
-                    memoir_correlation_id=cid,
-                    llm_fast=llm_fast,
-                    memory_evidence=memory_evidence,
-                    language=user_language,
-                )
+                ):
+                    pipeline_result = run_story_pipeline_for_category_batch(
+                        db,
+                        user_id=user_id,
+                        chapter_category=chapter_category,
+                        category_segments=category_segments,
+                        state=state,
+                        user_profile=user_profile,
+                        user_birth_year=user_birth_year,
+                        llm=llm,
+                        background_voice=background_voice,
+                        occupation=user_occupation,
+                        memoir_correlation_id=cid,
+                        llm_fast=llm_fast,
+                        memory_evidence=memory_evidence,
+                        language=user_language,
+                    )
                 pipeline_elapsed = time.perf_counter() - pipeline_t0
 
                 if pipeline_result.deferred:
@@ -939,7 +951,10 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
     phase1_t0 = time.perf_counter()
 
     try:
-        with get_sync_db() as db:
+        with business_span(
+            "memoir.phase1",
+            segment_count=len(segment_ids),
+        ), get_sync_db() as db:
             user_obj_for_lang = db.get(User, user_id)
             user_language = (
                 "en"
@@ -986,47 +1001,48 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
                 },
             )
             ingest_t0 = time.perf_counter()
-            ingest_items: list[tuple[str, str, dict | None]] = []
-            non_empty_segments: list = []
-            for seg in segments:
-                text = (seg.user_input_text or "").strip()
-                if not text:
-                    continue
-                conv_id = getattr(seg, "conversation_id", None) or ""
-                ln = getattr(seg, "lineage_json", None)
-                lineage_payload = ln if isinstance(ln, dict) else None
-                ingest_items.append((conv_id, text, lineage_payload))
-                non_empty_segments.append(seg)
+            with business_span("memoir.phase1.ingest"):
+                ingest_items: list[tuple[str, str, dict | None]] = []
+                non_empty_segments: list = []
+                for seg in segments:
+                    text = (seg.user_input_text or "").strip()
+                    if not text:
+                        continue
+                    conv_id = getattr(seg, "conversation_id", None) or ""
+                    ln = getattr(seg, "lineage_json", None)
+                    lineage_payload = ln if isinstance(ln, dict) else None
+                    ingest_items.append((conv_id, text, lineage_payload))
+                    non_empty_segments.append(seg)
 
-            ingested_source_ids: list[str] = []
-            if ingest_items:
-                try:
-                    ingested_source_ids = asyncio.run(
-                        _memory_ingest_transcripts_batch(
-                            user_id,
-                            ingest_items,
-                            memoir_correlation_id=memoir_correlation_id,
+                ingested_source_ids: list[str] = []
+                if ingest_items:
+                    try:
+                        ingested_source_ids = asyncio.run(
+                            _memory_ingest_transcripts_batch(
+                                user_id,
+                                ingest_items,
+                                memoir_correlation_id=memoir_correlation_id,
+                            )
                         )
-                    )
-                    for seg, sid in zip(
-                        non_empty_segments, ingested_source_ids, strict=True
-                    ):
-                        logger.info(
-                            "event=memory_transcript_ingested user_id={} task_id={} "
-                            "source_id={} conversation_id={} segment_id={} transcript_chars={}",
-                            user_id,
-                            task_id,
-                            sid,
-                            getattr(seg, "conversation_id", None) or "",
-                            seg.id,
-                            len((seg.user_input_text or "").strip()),
+                        for seg, sid in zip(
+                            non_empty_segments, ingested_source_ids, strict=True
+                        ):
+                            logger.info(
+                                "event=memory_transcript_ingested user_id={} task_id={} "
+                                "source_id={} conversation_id={} segment_id={} transcript_chars={}",
+                                user_id,
+                                task_id,
+                                sid,
+                                getattr(seg, "conversation_id", None) or "",
+                                seg.id,
+                                len((seg.user_input_text or "").strip()),
+                            )
+                    except Exception as e:
+                        logger.warning(
+                            "Memory batch ingest 失败: {} exc_type={}",
+                            e,
+                            type(e).__name__,
                         )
-                except Exception as e:
-                    logger.warning(
-                        "Memory batch ingest 失败: {} exc_type={}",
-                        e,
-                        type(e).__name__,
-                    )
             ingest_elapsed = time.perf_counter() - ingest_t0
             merge_pipeline_run(
                 memoir_correlation_id,
@@ -1050,31 +1066,32 @@ def process_memoir_phase1(self, user_id: str, segment_ids: List[str]):
                 )
 
             prep_t0 = time.perf_counter()
-            memoir_orchestrator = MemoirOrchestrator()
+            with business_span("memoir.phase1.prepare_batches"):
+                memoir_orchestrator = MemoirOrchestrator()
 
-            def _phase1_chunk_cb(idx: int, total: int) -> None:
-                merge_pipeline_run(
-                    memoir_correlation_id,
-                    {"phase1": {"detail": {"prepare_batches_chunk": [idx, total]}}},
+                def _phase1_chunk_cb(idx: int, total: int) -> None:
+                    merge_pipeline_run(
+                        memoir_correlation_id,
+                        {"phase1": {"detail": {"prepare_batches_chunk": [idx, total]}}},
+                    )
+
+                prepared = memoir_orchestrator.prepare_batches(
+                    segments=list(segments),
+                    llm=llm,
+                    llm_fast=llm_fast,
+                    get_or_create_state=lambda: get_or_create_state_sync(user_id, db),
+                    update_slot=lambda stage, slot_name, snippet, seg_ids: update_slot_sync(
+                        user_id,
+                        stage,
+                        slot_name,
+                        snippet,
+                        seg_ids,
+                        db,
+                        memoir_batch=True,
+                    ),
+                    on_phase1_chunk=_phase1_chunk_cb,
+                    language=user_language,
                 )
-
-            prepared = memoir_orchestrator.prepare_batches(
-                segments=list(segments),
-                llm=llm,
-                llm_fast=llm_fast,
-                get_or_create_state=lambda: get_or_create_state_sync(user_id, db),
-                update_slot=lambda stage, slot_name, snippet, seg_ids: update_slot_sync(
-                    user_id,
-                    stage,
-                    slot_name,
-                    snippet,
-                    seg_ids,
-                    db,
-                    memoir_batch=True,
-                ),
-                on_phase1_chunk=_phase1_chunk_cb,
-                language=user_language,
-            )
             prep_elapsed = time.perf_counter() - prep_t0
             merge_pipeline_run(
                 memoir_correlation_id,
diff --git a/api/app/tasks/memory_compaction_tasks.py b/api/app/tasks/memory_compaction_tasks.py
index d906f49..4002cf7 100644
--- a/api/app/tasks/memory_compaction_tasks.py
+++ b/api/app/tasks/memory_compaction_tasks.py
@@ -9,6 +9,7 @@ from typing import Any
 
 from celery import shared_task
 
+from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal
 from app.core.logging import get_logger
@@ -49,7 +50,8 @@ def memory_compaction_sweep() -> dict[str, Any]:
     if not settings.memory_compaction_enabled:
         return {"skipped": True, "reason": "disabled"}
     hours = int(settings.memory_compaction_sweep_recent_hours)
-    user_ids = asyncio.run(_list_users_with_recent_chunks_async(hours))
+    with business_span("memory.compaction.sweep", hours=hours):
+        user_ids = asyncio.run(_list_users_with_recent_chunks_async(hours))
     ctx_base: dict[str, Any] = {"trigger_source": "beat", "sweep_hours": hours}
     for uid in user_ids:
         schedule_memory_compaction_run(uid, dict(ctx_base))
@@ -100,7 +102,8 @@ def memory_compaction_run(
         return out
 
     try:
-        out = asyncio.run(_run_memory_compaction_async(user_id, ctx))
+        with business_span("memory.compaction.run"):
+            out = asyncio.run(_run_memory_compaction_async(user_id, ctx))
 
         if out.get("new_cursor_ts") and out.get("new_cursor_id") is not None:
             set_incremental_cursor_pair(
diff --git a/api/app/tasks/memory_enrichment_tasks.py b/api/app/tasks/memory_enrichment_tasks.py
index 54434ef..2a70136 100644
--- a/api/app/tasks/memory_enrichment_tasks.py
+++ b/api/app/tasks/memory_enrichment_tasks.py
@@ -11,6 +11,7 @@ from typing import Any, cast
 
 from celery import shared_task
 
+from app.core.business_telemetry import business_span
 from app.core.config import settings
 from app.core.db import AsyncSessionLocal
 from app.core.dependencies import get_embedding_provider
@@ -166,7 +167,8 @@ def embed_memory_source(
         status="running",
     )
     try:
-        result = asyncio.run(_embed_memory_source_async(user_id, source_id))
+        with business_span("memory.embed_source"):
+            result = asyncio.run(_embed_memory_source_async(user_id, source_id))
         ms = (time.perf_counter() - t0) * 1000
         logger.info(
             "event=memory_embedding_done user_id={} source_id={} duration_ms={:.1f} status={} vectors_written={} msg=记忆向量化完成",
@@ -241,7 +243,8 @@ def enrich_memory_source(
         status="running",
     )
     try:
-        asyncio.run(_enrich_memory_source_async(user_id, source_id))
+        with business_span("memory.enrich_source"):
+            asyncio.run(_enrich_memory_source_async(user_id, source_id))
         ms = (time.perf_counter() - t0) * 1000
         logger.info(
             "event=memory_enrichment_done user_id={} source_id={} duration_ms={:.1f} "
diff --git a/api/deploy/observability/grafana/dashboards/life-echo-business.json b/api/deploy/observability/grafana/dashboards/life-echo-business.json
new file mode 100644
index 0000000..e14f545
--- /dev/null
+++ b/api/deploy/observability/grafana/dashboards/life-echo-business.json
@@ -0,0 +1,75 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation))",
+          "legendFormat": "{{operation}} p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Business operation duration p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "sum(rate(business_operation_duration_milliseconds_count[5m])) by (operation, outcome)",
+          "legendFormat": "{{operation}} / {{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Business operations rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"conversation\\\\.ws\\\\..*|asr\\\\.transcribe|tts\\\\.synthesize\"}[5m])) by (le, operation))",
+          "legendFormat": "{{operation}}",
+          "refId": "A"
+        }
+      ],
+      "title": "WS / ASR / TTS p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"memoir\\\\..*\"}[5m])) by (le, operation))",
+          "legendFormat": "{{operation}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Memoir pipeline phases p95",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "business"],
+  "templating": { "list": [] },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "Life Echo Business",
+  "uid": "life-echo-business",
+  "version": 1
+}
diff --git a/api/deploy/observability/grafana/dashboards/life-echo-llm.json b/api/deploy/observability/grafana/dashboards/life-echo-llm.json
new file mode 100644
index 0000000..3505ab2
--- /dev/null
+++ b/api/deploy/observability/grafana/dashboards/life-echo-llm.json
@@ -0,0 +1,79 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, call_type))",
+          "legendFormat": "{{agent}} / {{call_type}} p95",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM duration p95 by agent / call_type",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, call_type))",
+          "legendFormat": "{{call_type}} p50",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM duration p50 by call_type (json vs chat vs stream)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_call_total[5m])) by (outcome, call_type)",
+          "legendFormat": "{{outcome}} / {{call_type}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM calls by outcome",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_tokens_input_total[5m])) by (agent)",
+          "legendFormat": "in {{agent}}",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(llm_tokens_output_total[5m])) by (agent)",
+          "legendFormat": "out {{agent}}",
+          "refId": "B"
+        }
+      ],
+      "title": "LLM tokens/s",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "llm"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "Life Echo LLM",
+  "uid": "life-echo-llm",
+  "version": 1
+}
diff --git a/api/deploy/observability/grafana/dashboards/life-echo-logs.json b/api/deploy/observability/grafana/dashboards/life-echo-logs.json
new file mode 100644
index 0000000..3cd9ddc
--- /dev/null
+++ b/api/deploy/observability/grafana/dashboards/life-echo-logs.json
@@ -0,0 +1,69 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 },
+      "id": 1,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"event=llm_json_call\"",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM JSON calls (event=llm_json_call)",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 },
+      "id": 2,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"event=celery_task_failed\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Celery task failures",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
+      "id": 3,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} | trace_id=\"$trace_id\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Logs by trace_id",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "logs"],
+  "templating": {
+    "list": [
+      {
+        "current": { "text": "", "value": "" },
+        "label": "trace_id",
+        "name": "trace_id",
+        "options": [],
+        "query": "",
+        "type": "textbox"
+      }
+    ]
+  },
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "Life Echo Logs",
+  "uid": "life-echo-logs",
+  "version": 1
+}
diff --git a/api/deploy/observability/grafana/dashboards/life-echo-overview.json b/api/deploy/observability/grafana/dashboards/life-echo-overview.json
new file mode 100644
index 0000000..f43e4ba
--- /dev/null
+++ b/api/deploy/observability/grafana/dashboards/life-echo-overview.json
@@ -0,0 +1,154 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 },
+      "id": 1,
+      "options": { "legend": { "displayMode": "list", "placement": "bottom" } },
+      "targets": [
+        {
+          "expr": "sum(rate(http_server_request_duration_seconds_count[5m]))",
+          "legendFormat": "HTTP requests/s",
+          "refId": "A"
+        }
+      ],
+      "title": "API request rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "API latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, provider))",
+          "legendFormat": "{{agent}} / {{provider}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM call duration p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_call_total[5m])) by (outcome)",
+          "legendFormat": "{{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM calls by outcome",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 5,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"llm_json_call\"",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM JSON call logs",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "DB p95",
+          "refId": "A"
+        }
+      ],
+      "title": "DB client latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_client_request_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "HTTP client p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Outbound HTTP latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count[5m])), 1e-9)",
+          "legendFormat": "5xx rate",
+          "refId": "A"
+        }
+      ],
+      "title": "HTTP 5xx error rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(redis_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "Redis p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Redis client latency p95",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "tags": ["life-echo"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Life Echo Overview",
+  "uid": "life-echo-overview",
+  "version": 1
+}
diff --git a/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml b/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml
new file mode 100644
index 0000000..96dcc4f
--- /dev/null
+++ b/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml
@@ -0,0 +1,4 @@
+apiVersion: 1
+
+# Local dev placeholder: no real notification channel is configured. Slack/Webhook can be bound in the Grafana UI.
+contactPoints: []
diff --git a/api/deploy/observability/grafana/provisioning/alerting/rules.yml b/api/deploy/observability/grafana/provisioning/alerting/rules.yml
new file mode 100644
index 0000000..5228f33
--- /dev/null
+++ b/api/deploy/observability/grafana/provisioning/alerting/rules.yml
@@ -0,0 +1,147 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: life-echo-alerts
+    folder: Life Echo
+    interval: 1m
+    rules:
+      - uid: life_echo_api_p95_high
+        title: API latency p95 > 2s
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000
+              refId: A
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [2000] }
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          summary: API p95 latency above 2s for 5 minutes
+        labels:
+          severity: warning
+
+      - uid: life_echo_llm_error_rate
+        title: LLM error rate > 5%
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: sum(rate(llm_call_total{outcome="error"}[5m])) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)
+              refId: A
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [0.05] }
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          summary: LLM call error rate above 5%
+        labels:
+          severity: warning
+
+      - uid: life_echo_otel_collector_down
+        title: OTel Collector scrape down
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange: { from: 120, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: up{job="otel-collector"}
+              refId: A
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: lt, params: [1] }
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: Alerting
+        execErrState: Error
+        for: 2m
+        annotations:
+          summary: Prometheus cannot scrape otel-collector
+        labels:
+          severity: critical
+
+      - uid: life_echo_celery_task_failed
+        title: Celery task failures detected
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: loki
+            model:
+              expr: sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m]))
+              refId: A
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [0] }
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m
+        annotations:
+          summary: Celery task failure logs in last 5 minutes
+        labels:
+          severity: warning
diff --git a/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml b/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..c2f6cff
--- /dev/null
+++ b/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+providers:
+  - name: Life Echo
+    orgId: 1
+    folder: Life Echo
+    type: file
+    disableDeletion: false
+    editable: true
+    options:
+      path: /etc/grafana/dashboards
diff --git a/api/deploy/observability/grafana/provisioning/datasources/datasources.yml b/api/deploy/observability/grafana/provisioning/datasources/datasources.yml
new file mode 100644
index 0000000..89fd49e
--- /dev/null
+++ b/api/deploy/observability/grafana/provisioning/datasources/datasources.yml
@@ -0,0 +1,47 @@
+apiVersion: 1
+
+# Every data source pins an explicit uid: dashboards, alert rules, and the
+# trace/log cross-links in this file all reference data sources by uid.
+datasources:
+  - name: Prometheus
+    type: prometheus
+    uid: Prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+
+  - name: Tempo
+    type: tempo
+    uid: tempo
+    access: proxy
+    url: http://tempo:3200
+    editable: false
+    jsonData:
+      httpMethod: GET
+      tracesToLogsV2:
+        datasourceUid: loki
+        spanStartTimeShift: -1m
+        spanEndTimeShift: 1m
+        filterByTraceID: true
+        filterBySpanID: false
+        customQuery: true
+        query: '{container=~".+"} | json | trace_id="$${__trace.traceId}"'
+      serviceMap:
+        datasourceUid: Prometheus
+      nodeGraph:
+        enabled: true
+
+  - name: Loki
+    type: loki
+    uid: loki
+    access: proxy
+    url: http://loki:3100
+    editable: false
+    jsonData:
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: '"trace_id":"([a-f0-9]+)"'
+          name: TraceID
+          url: "$${__value.raw}"
+          urlDisplayLabel: View Trace
diff --git a/api/deploy/observability/loki-config.yaml b/api/deploy/observability/loki-config.yaml
new file mode 100644
index 0000000..4a09ace
--- /dev/null
+++ b/api/deploy/observability/loki-config.yaml
@@ -0,0 +1,32 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  retention_period: 168h  # NOTE(review): Loki enforces retention via the compactor (retention_enabled: true), which this file does not configure — confirm old chunks are actually deleted
+
+ruler:
+  alertmanager_url: http://localhost:9093
diff --git a/api/deploy/observability/otel-collector-config.yaml b/api/deploy/observability/otel-collector-config.yaml
new file mode 100644
index 0000000..d8fcef7
--- /dev/null
+++ b/api/deploy/observability/otel-collector-config.yaml
@@ -0,0 +1,53 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 5s
+    send_batch_size: 1024
+  memory_limiter:
+    check_interval: 1s
+    limit_mib: 512
+    spike_limit_mib: 128
+  resource:
+    attributes:
+      - key: deployment.environment
+        value: development
+        action: upsert
+
+exporters:
+  otlp/tempo:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+  prometheus:
+    endpoint: 0.0.0.0:8889
+  loki:
+    endpoint: http://loki:3100/loki/api/v1/push
+    tls:
+      insecure: true
+
+extensions:
+  health_check:
+    endpoint: 0.0.0.0:13133
+
+service:
+  extensions: [health_check]
+  pipelines:  # order matters: memory_limiter first, resource (deployment.environment upsert) next, batch last
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [otlp/tempo]
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [prometheus]
+    logs:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [loki]
diff --git a/api/deploy/observability/prometheus.yml b/api/deploy/observability/prometheus.yml
new file mode 100644
index 0000000..ea02974
--- /dev/null
+++ b/api/deploy/observability/prometheus.yml
@@ -0,0 +1,12 @@
+global:
+  scrape_interval: 15s  # how often targets are scraped
+  evaluation_interval: 15s  # how often rules are evaluated
+
+scrape_configs:
+  - job_name: prometheus  # Prometheus scraping its own endpoint
+    static_configs:
+      - targets: ["localhost:9090"]
+
+  - job_name: otel-collector  # application metrics re-exposed by the collector's prometheus exporter
+    static_configs:
+      - targets: ["otel-collector:8889"]  # compose service name + exporter port from otel-collector-config.yaml
diff --git a/api/deploy/observability/promtail-config.yaml b/api/deploy/observability/promtail-config.yaml
new file mode 100644
index 0000000..e09c808
--- /dev/null
+++ b/api/deploy/observability/promtail-config.yaml
@@ -0,0 +1,41 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /tmp/positions.yaml  # container-local path; read offsets are lost when the container is recreated
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: docker
+    docker_sd_configs:  # discover containers through the mounted Docker socket
+      - host: unix:///var/run/docker.sock
+        refresh_interval: 5s
+    relabel_configs:
+      - source_labels: ["__meta_docker_container_name"]
+        regex: "/(.*)"  # strip the leading slash from the Docker container name
+        target_label: container
+      - source_labels: ["__meta_docker_container_log_stream"]
+        target_label: stream
+      - source_labels: ["__meta_docker_container_label_com_docker_compose_service"]
+        target_label: compose_service
+    pipeline_stages:
+      - regex:  # human-readable lines: short `tid=` or full `trace_id=` hex id
+          expression: '(?:tid=|trace_id=)(?P<trace_id>[0-9a-f]{12,32})'
+      - regex:
+          expression: 'event=(?P<event>[a-zA-Z0-9_.-]+)'
+      - regex:
+          expression: 'duration_ms=(?P<duration_ms>[0-9.]+)'
+      - json:  # JSON lines: take the same fields from the parsed object
+          expressions:
+            trace_id: trace_id
+            span_id: span_id
+            request_id: request_id
+            event: event
+      - structured_metadata:  # per-request ids are unbounded, so keep them out of the label index
+          trace_id:
+          request_id:
+      - labels:  # only the bounded `event` name becomes a stream label
+          event:
diff --git a/api/deploy/observability/tempo.yaml b/api/deploy/observability/tempo.yaml
new file mode 100644
index 0000000..9c0d969
--- /dev/null
+++ b/api/deploy/observability/tempo.yaml
@@ -0,0 +1,29 @@
+server:
+  http_listen_port: 3200  # published to the host by docker-compose.observability.yml
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317  # target of the collector's otlp/tempo exporter (tempo:4317)
+
+ingester:
+  max_block_duration: 5m
+
+compactor:
+  compaction:
+    block_retention: 48h
+
+storage:
+  trace:
+    backend: local  # local filesystem storage
+    local:
+      path: /var/tempo/traces
+    wal:
+      path: /var/tempo/wal
+
+query_frontend:
+  search:
+    duration_slo: 5s
+    throughput_bytes_slo: 1.073741824e+09  # 2^30 bytes
diff --git a/api/development.sh b/api/development.sh
index d64c9c9..ceb3576 100755
--- a/api/development.sh
+++ b/api/development.sh
@@ -25,10 +25,19 @@ API_PORT="${API_PORT:-8000}"
 CELERY_POOL="${CELERY_POOL:-solo}"
 SKIP_INSTALL="${SKIP_INSTALL:-0}"
 SKIP_INFRA="${SKIP_INFRA:-0}"
+# 可观测性：空=若 .env 中 OTEL_ENABLED=true 则启动 compose；0=不启；1=强制启动
+START_OBSERVABILITY="${START_OBSERVABILITY:-}"
 SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}"
 
-# 由 internal-eval.sh 开启：在 main:app + Celery 之外再启 internal_main(:8001) 与 app-eval-web
-LIFE_ECHO_WITH_INTERNAL_EVAL="${LIFE_ECHO_WITH_INTERNAL_EVAL:-0}"
+# 与 docker-compose.observability.yml / .env.example 默认宿主机端口一致
+OTEL_GRPC_HOST_PORT="${OTEL_GRPC_HOST_PORT:-48317}"
+GRAFANA_HOST_PORT="${GRAFANA_HOST_PORT:-48300}"
+PROMETHEUS_HOST_PORT="${PROMETHEUS_HOST_PORT:-49090}"
+
+# 默认一并启动 internal_main + app-eval-web（设 0 可仅主站）
+LIFE_ECHO_WITH_INTERNAL_EVAL="${LIFE_ECHO_WITH_INTERNAL_EVAL:-1}"
+# 自动用 Google Chrome 打开 Grafana / 评测 Web（勿用 Vite --open，避免落到 Safari）
+OPEN_OBSERVABILITY_UI="${OPEN_OBSERVABILITY_UI:-1}"
 # 若 :8000 已由其他 development 实例占用，仅附加 :8001 + 前端（需自备同一份 Celery/主站）
 EVAL_ATTACH_ONLY="${EVAL_ATTACH_ONLY:-0}"
 INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}"
@@ -43,6 +52,9 @@ INTERNAL_EVAL_PID=""
 EVAL_WEB_PID=""
 CLEANED_UP=0
 INFRA_STARTED=0
+OBSERVABILITY_STARTED=0
+OBSERVABILITY_BROWSER_SCHEDULED=0
+EVAL_WEB_BROWSER_SCHEDULED=0
 
 print_header() {
   echo -e "\n${BLUE}========================================${NC}"
@@ -62,6 +74,64 @@ print_err() {
   echo -e "${RED}✗ $1${NC}"
 }
 
+open_browser_url() {
+  # Open $1 in Google Chrome (macOS) or the first Chrome/Chromium binary found on PATH.
+  # Returns 0 once a browser was launched, 1 (after a warning) when none is available.
+  local url="$1"
+  local candidate
+
+  if command -v open >/dev/null 2>&1 && [[ "$(uname -s)" == "Darwin" ]]; then
+    # macOS: `open -a` targets Chrome by application name; failure is reported as Chrome missing.
+    if ! open -a "Google Chrome" "${url}" >/dev/null 2>&1; then
+      print_warn "未找到 Google Chrome，请手动打开: ${url}"
+      return 1
+    fi
+    return 0
+  fi
+
+  for candidate in google-chrome chromium-browser chromium; do
+    if command -v "${candidate}" >/dev/null 2>&1; then
+      "${candidate}" "${url}" >/dev/null 2>&1 &
+      return 0
+    fi
+  done
+  print_warn "未找到 Chrome/Chromium，请手动打开: ${url}"
+  return 1
+}
+
+schedule_observability_browser() {
+  # Open Grafana in the background at most once per run; skipped unless OPEN_OBSERVABILITY_UI is "1".
+  if [[ "${OPEN_OBSERVABILITY_UI}" == "1" ]] && [[ "${OBSERVABILITY_BROWSER_SCHEDULED}" != "1" ]]; then
+    OBSERVABILITY_BROWSER_SCHEDULED=1
+    local grafana_url="http://127.0.0.1:${GRAFANA_HOST_PORT}"
+    (
+      sleep 4
+      open_browser_url "${grafana_url}"
+    ) &
+    print_ok "将自动打开 Grafana: ${grafana_url}"
+  fi
+}
+
+schedule_eval_web_browser() {
+  # Open the eval web UI at most once per run; skipped unless OPEN_EVAL_WEB is "1".
+  if [[ "${OPEN_EVAL_WEB}" == "1" ]] && [[ "${EVAL_WEB_BROWSER_SCHEDULED:-0}" != "1" ]]; then
+    EVAL_WEB_BROWSER_SCHEDULED=1
+    local eval_url="http://127.0.0.1:${EVAL_WEB_PORT}/"
+    (
+      # Poll the port once per second for up to 30 tries, then open the URL either way.
+      local waited
+      for (( waited = 0; waited < 30; waited++ )); do
+        if is_port_listening "${EVAL_WEB_PORT}"; then
+          break
+        fi
+        sleep 1
+      done
+      open_browser_url "${eval_url}"
+    ) &
+    print_ok "将自动打开评测 Web (Chrome): ${eval_url}"
+  fi
+}
+
 is_pid_alive() {
   local pid="$1"
   [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
@@ -147,11 +217,9 @@ cleanup() {
   fi
 
   if [[ "${INFRA_STARTED}" == "1" ]]; then
-    print_warn "正在停止 PostgreSQL / Redis 容器..."
-    (
-      cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop
-    ) >/dev/null 2>&1 || true
-    print_ok "PostgreSQL/Redis 容器已停止"
+    print_warn "正在停止 Docker 基础设施..."
+    docker_compose_cmd stop >/dev/null 2>&1 || true
+    print_ok "Docker 容器已停止"
   fi
 }
 
@@ -163,12 +231,107 @@ require_cmd() {
   fi
 }
 
+read_env_bool() { # Usage: read_env_bool KEY [DEFAULT=0]. Status 0 = true, 1 = false; a non-empty env var beats .env.
+  local key="$1" default="${2:-0}" line val
+
+  if [[ -n "${!key:-}" ]]; then
+    val="${!key}"
+    case "${val}" in
+      1 | true | TRUE | yes | YES | on | ON) return 0 ;;
+      *) return 1 ;;
+    esac
+  fi
+
+  if [[ ! -f "${ROOT_DIR}/.env" ]]; then
+    [[ "${default}" == "1" ]]
+    return
+  fi
+
+  # The last KEY= line in .env wins; `|| true` keeps a missing key from tripping errexit/pipefail.
+  line="$(grep -E "^${key}=" "${ROOT_DIR}/.env" | tail -1 | cut -d= -f2- | tr -d '\r' | sed 's/^"//;s/"$//' || true)"
+  case "${line}" in
+    1 | true | TRUE | yes | YES | on | ON) return 0 ;;
+    "") [[ "${default}" == "1" ]] ;; # key absent or empty: fall back to DEFAULT
+    *) return 1 ;;                   # any other explicit value is false, even when DEFAULT is 1
+  esac
+}
+
+should_start_observability() { # START_OBSERVABILITY forces the answer; otherwise OTEL_ENABLED decides.
+  case "${START_OBSERVABILITY}" in
+    0 | false | FALSE | no | NO | off | OFF) return 1 ;;
+    1 | true | TRUE | yes | YES | on | ON) return 0 ;;
+    *) read_env_bool "OTEL_ENABLED" "0" ;;
+  esac
+}
+
+docker_compose_cmd() {
+  # Run `docker compose` with one consistent set of -f files; kept compatible with macOS's stock bash 3.2 (no local -n / local arr=(-f ...)).
+  if should_start_observability; then
+    (cd "${ROOT_DIR}" && docker compose \
+      -f docker-compose.dev.yml \
+      -f docker-compose.observability.yml \
+      "$@")
+    return
+  fi
+  if [[ "${1:-}" == "up" ]]; then # ${1:-} avoids an unbound-variable abort when no subcommand is given
+    (cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml "$@" --remove-orphans)
+  else
+    (cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml "$@")
+  fi
+}
+
+wait_otel_collector_ready() { # Usage: wait_otel_collector_ready [RETRIES=30]; status 0 once the port listens.
+  local retries="${1:-30}"
+  local attempt
+  for (( attempt = 0; attempt < retries; attempt++ )); do
+    # One probe per second of the host-side OTLP gRPC port.
+    if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then
+      return 0
+    fi
+    sleep 1
+  done
+  return 1
+}
+
+check_otel_collector_ready() { # Status 0 when OTEL is disabled or the collector port listens, else 1 with hints.
+  read_env_bool "OTEL_ENABLED" "0" || return 0
+  local ready=0
+  if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then
+    ready=1
+  elif [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then
+    print_warn "等待 OTel Collector 端口 :${OTEL_GRPC_HOST_PORT} …"
+    if wait_otel_collector_ready 45; then
+      ready=1
+    fi
+  fi
+  if [[ "${ready}" == "1" ]]; then
+    print_ok "OTel Collector 端口已监听 (:${OTEL_GRPC_HOST_PORT})"
+    return 0
+  fi
+  print_warn "OTEL_ENABLED=true 但 :${OTEL_GRPC_HOST_PORT} 未监听"
+  print_warn "请确认本次启动日志中有「启动可观测性栈」；或手动执行:"
+  print_warn "  docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d"
+  print_warn "不需要可观测性时在 .env.development 设 OTEL_ENABLED=false"
+  return 1
+}
+
 start_infra() {
-  print_header "启动 PostgreSQL 和 Redis"
-  cd "${ROOT_DIR}"
-  docker compose -f docker-compose.dev.yml up -d
+  if should_start_observability; then
+    print_header "启动 PostgreSQL、Redis 与可观测性栈 (OTel / Grafana LGTM)"
+    OBSERVABILITY_STARTED=1
+  else
+    print_header "启动 PostgreSQL 和 Redis"
+  fi
+  docker_compose_cmd up -d
   INFRA_STARTED=1
   print_ok "PostgreSQL 127.0.0.1:48291，Redis 127.0.0.1:48307（见 docker-compose.dev.yml / .env.example）"
+  if [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then
+    print_ok "Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} （admin/admin）"
+    print_ok "Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}"
+    print_ok "OTLP gRPC 127.0.0.1:${OTEL_GRPC_HOST_PORT}（应用读 .env 中 OTEL_*，无需 export）"
+    print_ok "详见 docs/observability.md"
+    schedule_observability_browser
+  fi
   print_ok "基础设施已就绪"
 }
 
@@ -467,19 +630,15 @@ start_eval_web() {
     exit 1
   fi
 
-  local vite_extra=()
-  if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then
-    vite_extra+=(--open)
-  fi
-
   (
     cd "${EVAL_WEB_DIR}"
     VITE_EVAL_API_KEY="${api_key}" \
     VITE_EVAL_PROXY_TARGET="http://127.0.0.1:${INTERNAL_EVAL_PORT}" \
-      npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" "${vite_extra[@]}"
+      npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}"
   ) &
   EVAL_WEB_PID=$!
   print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/"
+  schedule_eval_web_browser
 }
 
 start_internal_eval_http() {
@@ -493,7 +652,8 @@ start_internal_eval_http() {
     exit 1
   fi
 
-  "${UVICORN_BIN}" app.internal_main:internal_app --reload \
+  OTEL_SERVICE_NAME="${INTERNAL_EVAL_OTEL_SERVICE_NAME:-life-echo-internal-api}" \
+    "${UVICORN_BIN}" app.internal_main:internal_app --reload \
     --reload-exclude 'alembic/**' \
     --reload-exclude 'alembic.ini' \
     --host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" &
@@ -547,7 +707,7 @@ start_services() {
   fi
 
   if [[ "${skip_main}" == "1" ]] && [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" != "1" ]]; then
-    print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测；请使用 ./internal-eval.sh 或导出 LIFE_ECHO_WITH_INTERNAL_EVAL=1"
+    print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测；请设置 LIFE_ECHO_WITH_INTERNAL_EVAL=1"
     exit 1
   fi
 
@@ -601,14 +761,27 @@ start_services() {
     echo "主站文档: http://localhost:${API_PORT}/docs"
     echo "健康检查: http://localhost:${API_PORT}/health"
   fi
+  if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then
+    echo "评测 Web UI:  http://127.0.0.1:${EVAL_WEB_PORT}/"
+    echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health"
+  fi
+  if read_env_bool "OTEL_ENABLED" "0"; then
+    echo "可观测性: Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} | Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}"
+    if is_port_listening "${GRAFANA_HOST_PORT}"; then
+      schedule_observability_browser
+    fi
+  fi
+  if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]] && is_pid_alive "${EVAL_WEB_PID}"; then
+    schedule_eval_web_browser
+  fi
   echo "按 Ctrl+C 停止所有进程"
 }
 
 main() {
   if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then
-    print_header "Life Echo 开发环境 + 内部评测（主站 + :${INTERNAL_EVAL_PORT} + Eval Web）"
+    print_header "Life Echo 开发环境（主站 + 内部评测 + 可观测性）"
   else
-    print_header "Life Echo 开发环境一键启动"
+    print_header "Life Echo 开发环境一键启动（无内部评测）"
   fi
 
   require_cmd "uv"
@@ -618,16 +791,22 @@ main() {
 
   trap cleanup EXIT INT TERM
 
+  ensure_venv
+  # 必须在 start_infra 之前同步，否则 should_start_observability 读不到 .env.development 里的 OTEL_ENABLED
+  ensure_dotenv_from_development
+
   if [[ "${SKIP_INFRA}" != "1" ]]; then
     start_infra
     wait_postgres_ready || true
   else
     print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)"
+    if should_start_observability; then
+      print_warn "SKIP_INFRA=1 未自动启动 observability；若需 LGTM 请手动 docker compose up observability overlay"
+    fi
   fi
 
-  ensure_venv
-  ensure_dotenv_from_development
   check_env_file
+  check_otel_collector_ready || true
   wait_host_infra_ready
   run_migrations
   start_services
diff --git a/api/docker-compose.observability.yml b/api/docker-compose.observability.yml
new file mode 100644
index 0000000..a251fb2
--- /dev/null
+++ b/api/docker-compose.observability.yml
@@ -0,0 +1,122 @@
+# 本地可观测性栈 overlay（与 docker-compose.dev.yml 一起使用）
+#
+#   docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d
+#
+# 宿主机端口刻意避开 3000/9090/4317 等常用口，与 .env.example 中 OTEL_* / *_HOST_PORT 对齐。
+# Grafana: http://127.0.0.1:${GRAFANA_HOST_PORT:-48300}  (admin / admin)
+# OTLP:    127.0.0.1:${OTEL_GRPC_HOST_PORT:-48317} (gRPC)  :${OTEL_HTTP_HOST_PORT:-48318} (HTTP)
+
+services:
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:0.120.0
+    container_name: life-echo-otel-collector
+    command: ["--config=/etc/otelcol/config.yaml"]
+    volumes:
+      - ./deploy/observability/otel-collector-config.yaml:/etc/otelcol/config.yaml:ro
+    ports:
+      - "127.0.0.1:${OTEL_GRPC_HOST_PORT:-48317}:4317"
+      - "127.0.0.1:${OTEL_HTTP_HOST_PORT:-48318}:4318"
+      - "127.0.0.1:${OTEL_COLLECTOR_HEALTH_HOST_PORT:-48333}:13133"
+    depends_on:
+      tempo:
+        condition: service_started
+      loki:
+        condition: service_started
+    networks:
+      - default
+    restart: unless-stopped
+
+  tempo:
+    image: grafana/tempo:2.7.2
+    container_name: life-echo-tempo
+    command: ["-config.file=/etc/tempo.yaml"]
+    volumes:
+      - ./deploy/observability/tempo.yaml:/etc/tempo.yaml:ro
+      - tempo_data:/var/tempo
+    ports:
+      - "127.0.0.1:${TEMPO_HTTP_HOST_PORT:-43200}:3200"
+    networks:
+      - default
+    restart: unless-stopped
+
+  loki:
+    image: grafana/loki:3.4.2
+    container_name: life-echo-loki
+    command: ["-config.file=/etc/loki/loki-config.yaml"]
+    volumes:
+      - ./deploy/observability/loki-config.yaml:/etc/loki/loki-config.yaml:ro
+      - loki_data:/loki
+    ports:
+      - "127.0.0.1:${LOKI_HTTP_HOST_PORT:-43100}:3100"
+    networks:
+      - default
+    restart: unless-stopped
+
+  promtail:
+    image: grafana/promtail:3.4.2
+    container_name: life-echo-promtail
+    command: ["-config.file=/etc/promtail/config.yml"]
+    volumes:
+      - ./deploy/observability/promtail-config.yaml:/etc/promtail/config.yml:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    depends_on:
+      loki:
+        condition: service_started
+    networks:
+      - default
+    restart: unless-stopped
+
+  prometheus:
+    image: prom/prometheus:v3.2.1
+    container_name: life-echo-prometheus
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --web.enable-lifecycle
+    volumes:
+      - ./deploy/observability/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus_data:/prometheus
+    ports:
+      - "127.0.0.1:${PROMETHEUS_HOST_PORT:-49090}:9090"
+    depends_on:
+      otel-collector:
+        condition: service_started
+    networks:
+      - default
+    restart: unless-stopped
+
+  grafana:  # Grafana UI for the local stack, provisioned from deploy/observability/grafana
+    image: grafana/grafana:11.5.2
+    container_name: life-echo-grafana
+    environment:
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: admin  # fixed default credentials; the port below is published on 127.0.0.1 only
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+    volumes:
+      - ./deploy/observability/grafana/provisioning:/etc/grafana/provisioning:ro
+      - ./deploy/observability/grafana/dashboards:/etc/grafana/dashboards:ro
+      - grafana_data:/var/lib/grafana
+    ports:
+      - "127.0.0.1:${GRAFANA_HOST_PORT:-48300}:3000"
+    depends_on:  # service_started waits for container start only, not for readiness
+      prometheus:
+        condition: service_started
+      tempo:
+        condition: service_started
+      loki:
+        condition: service_started
+    networks:
+      - default
+    restart: unless-stopped
+
+volumes:
+  tempo_data:
+  loki_data:
+  prometheus_data:
+  grafana_data:
+
+networks:
+  default:
+    name: life-echo-dev
+    external: true  # Compose never creates external networks, so life-echo-dev must already exist. NOTE(review): confirm a first `up` with both -f files works on a clean machine
diff --git a/api/docs/internal-eval.md b/api/docs/internal-eval.md
index 7b16fce..d6824e1 100644
--- a/api/docs/internal-eval.md
+++ b/api/docs/internal-eval.md
@@ -4,29 +4,30 @@
 
 ## 启动
 
-**推荐一条命令**：`internal-eval.sh` 实际调用 `development.sh`，在同一进程树里启动主站 `main:app`（**8000**）、**一份** Celery、内部评测 `internal_app`（默认 **8001**）以及 `app-eval-web`（默认 **5174**）。不需要再并行执行两份启动脚本。
+**推荐一条命令**：`./development.sh` 默认启动主站（**8000**）、Celery、内部评测 API（默认 **7999**）、评测 Web（**5174**）；`.env` 中 `OTEL_ENABLED=true` 时并起 Grafana 且自动打开浏览器。`./internal-eval.sh` 仅为兼容转发。
 
-| | 单一命令 `./internal-eval.sh` |
+| | `./development.sh`（默认） |
 |---|-------------------------------|
-| HTTP | 主站 **8000** + internal **8001** |
-| Celery | 仅 **一个** worker（与主站共用队列） |
-| 前端 | 默认启动 `app-eval-web`（`START_EVAL_WEB=0` 可关） |
+| HTTP | 主站 **8000** + internal **7999** |
+| Celery | 仅 **一个** worker |
+| 评测 UI | 自动用 Google Chrome / Chromium 打开 http://127.0.0.1:5174/（`OPEN_EVAL_WEB=0` 可关） |
+| 可观测性 | Grafana :48300（`OPEN_OBSERVABILITY_UI=0` 可关） |
 
 若 **主站 + Celery 已在其他终端** 由 `./development.sh` 跑起来了，只在同一台机器上多开评测 HTTP 与前端、且 **不再起第二份 Worker**：
 
 ```bash
 cd api
 # 确保 .env.development / .env 含 INTERNAL_EVAL_API_KEY；:8000 已被主站监听
-SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./internal-eval.sh
+SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./development.sh
 ```
 
 兼容旧写法：`SKIP_CELERY=1` 会映射为 `EVAL_ATTACH_ONLY=1`（仍要求 **8000 已在监听**）。
 
-仅主业务、不要评测台时照旧：`./development.sh`（不设置 `LIFE_ECHO_WITH_INTERNAL_EVAL`）。
+仅主业务、不要评测台：`LIFE_ECHO_WITH_INTERNAL_EVAL=0 ./development.sh`。
 
-若你只需要 **8001**、刻意不启主站 **8000**，请用下文「手动 uvicorn」配合既有 Celery，不要用 `./internal-eval.sh`（一键脚本会顺带拉起主站）。
+若只需 **7999**、不启主站 **8000**，见下文「手动 uvicorn」；不要用一键脚本。
 
-**默认会起 `app-eval-web`，并用 Vite `--open` 尝试打开浏览器**（`http://127.0.0.1:5174/`）。不要前端时设 `START_EVAL_WEB=0`；只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。
+**默认会起 `app-eval-web`，并用 Google Chrome / Chromium 打开评测台**（`http://127.0.0.1:5174/`，与 Grafana 的打开方式相同；未找到 Chrome/Chromium 时只打印提示，需手动打开）。不要前端时设 `START_EVAL_WEB=0`；只要前端但不要弹窗时设 `OPEN_EVAL_WEB=0`。
 
 数据库与主服务共用；需配置环境变量后启动专用进程：
 
diff --git a/api/docs/observability.md b/api/docs/observability.md
new file mode 100644
index 0000000..4c67c29
--- /dev/null
+++ b/api/docs/observability.md
@@ -0,0 +1,139 @@
+# 可观测性（OpenTelemetry + Grafana LGTM）
+
+本地开发使用 **OpenTelemetry** 采集 traces / metrics / logs，经 **OTel Collector** 写入 **Tempo / Prometheus / Loki**，在 **Grafana** 统一查看。
+
+配置写在 **`.env`**（由 `.env.development` 经 `development.sh` 同步，或从 [`.env.example`](../.env.example) 复制），`app.core.config.settings` 启动时自动读取，**无需**在 shell 里 `export OTEL_*`。
+
+## 启动栈
+
+在 `api/` 目录：
+
+```bash
+# 1. 数据库与 Redis
+docker compose -f docker-compose.dev.yml up -d
+
+# 2. 可观测性（需已存在 life-echo-dev 网络；端口来自 .env 或下列默认）
+docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d
+```
+
+| 服务 | 默认宿主机地址 | compose 变量 |
+|------|----------------|--------------|
+| Grafana | http://127.0.0.1:48300 （admin / admin） | `GRAFANA_HOST_PORT` |
+| Prometheus | http://127.0.0.1:49090 | `PROMETHEUS_HOST_PORT` |
+| OTLP gRPC | http://127.0.0.1:48317 | `OTEL_GRPC_HOST_PORT` |
+| OTLP HTTP | http://127.0.0.1:48318 | `OTEL_HTTP_HOST_PORT` |
+| Collector health | http://127.0.0.1:48333 | `OTEL_COLLECTOR_HEALTH_HOST_PORT` |
+
+容器**内部**仍使用标准端口（如 Collector `4317`）；仅宿主机映射使用 `48xxx` 段，与 Postgres `48291`、Redis `48307` 同一风格。
+
+预置 Dashboard（**Life Echo** 文件夹）：
+
+| Dashboard | 用途 |
+|-----------|------|
+| Life Echo Overview | API RED、LLM 摘要、依赖延迟 |
+| Life Echo LLM | `call_type` / agent / tokens、outcome 分布 |
+| Life Echo Business | 回忆录阶段、WS/ASR/TTS、Celery 业务 span |
+| Life Echo Logs | Loki 按 `event` / `trace_id` 检索 |
+
+## 启用应用导出
+
+在 [`.env.example`](../.env.example) 已给出本地默认值，同步到 `.env` 即可，例如：
+
+```env
+OTEL_ENABLED=true
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:48317
+OTEL_TRACES_SAMPLER=always_on
+OTEL_SERVICE_NAME=life-echo-api
+```
+
+推荐与全栈一并启动（`./development.sh` 在 `.env` 里 `OTEL_ENABLED=true` 时会起 observability compose，并默认打开 Grafana 浏览器标签）：
+
+```bash
+cd api
+./development.sh
+```
+
+仅手动起 API（不自动开 Grafana）：
+
+```bash
+cd api
+uv run uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
+```
+
+Celery worker 同一 `.env`；未设 `OTEL_SERVICE_NAME` 时 worker 默认为 `life-echo-celery-worker`。
+
+若 API 跑在 **Docker compose** 里，应设 `OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317`（服务名 + 容器内端口），而不是 `localhost`。
+
+不需要可观测性时：`.env` 中 `OTEL_ENABLED=false`（或未启动 observability compose）。
+
+## 采集内容
+
+| 类型 | 来源 |
+|------|------|
+| HTTP | FastAPI 自动 instrumentation（`/health` 排除） |
+| DB | SQLAlchemy |
+| Redis | redis-py |
+| 出站 HTTP | httpx（DeepSeek 等） |
+| Celery | 任务 span + W3C trace 传播 |
+| LLM | `llm_telemetry`（LangChain / DeepSeek / `llm_call`）+ `llm.call.*` / `llm.tokens.*` metrics |
+| 业务 | `business_telemetry`：WS 回合、回忆录 phase、ASR/TTS、支付等子 span |
+| 日志 | loguru patcher 注入 `trace_id`；Promtail 解析 `event` / `tid=`；可选 `LOG_JSON_FILE` JSON sink |
+
+日志字段：`request_id`、`trace_id`、`span_id`。HTTP 由中间件 `contextualize`；**Celery / 后台**由 loguru **patcher** 从当前 OTel span 合并，无需经过 HTTP 中间件。
+
+## 常用排查
+
+1. **API 慢**：Grafana → Tempo，按 `service.name=life-echo-api` 查 trace；看 DB / httpx / `llm.*` / `conversation.ws.*` 子 span。
+2. **LLM 慢**：**Life Echo LLM** Dashboard，或 Loki：`{compose_service=~".+"} |= "event=llm_json_call"`。
+3. **回忆录卡阶段**：Tempo 搜 `memoir.phase1` / `memoir.phase2` / `memoir.story_pipeline.*`；**Life Echo Business** Dashboard 看 `business_operation_duration_milliseconds`。
+4. **日志 ↔ Trace**：在 Tempo 复制 `trace_id` → Loki：`{compose_service=~".+"} |= "tid=<前12位>"`（控制台短格式）；Promtail 将 `trace_id` 写入 **structured metadata**（非高基数 label）。
+5. **Celery 堆积**：Tempo 过滤 `life-echo-celery-worker`；Loki `event=celery_task_failed`。
+6. **无数据**：`.env` 中 `OTEL_ENABLED=true`、`OTEL_EXPORTER_OTLP_ENDPOINT` 端口与 `OTEL_GRPC_HOST_PORT` 一致；Collector health `http://127.0.0.1:48333`；Prometheus target `otel-collector:8889` UP。
+
+### LOG_JSON_FILE 与 Promtail
+
+- **默认**：loguru 人类可读行 → Docker stdout → Promtail **regex** 提取 `tid` / `event` / `duration_ms`；`trace_id` 进 structured metadata，**不作为 Loki label**。
+- **可选**：`LOG_JSON_FILE=/path/to/app.jsonl` 开启 JSON sink（`serialize=true`），便于与 OTLP logs 或自建采集对齐；与 Promtail 可**并存**（同一容器 stdout 仍走 regex）。
+
+## 采样（staging/prod 第二阶段）
+
+| 环境 | 建议 |
+|------|------|
+| development | `OTEL_TRACES_SAMPLER=always_on` |
+| staging/production | `OTEL_TRACES_SAMPLER=parentbased_traceidratio`，`OTEL_TRACES_SAMPLER_ARG=0.1` |
+
+关闭 telemetry：`OTEL_ENABLED=false`，无 exporter 开销。
+
+## Prometheus 指标名（OTel → Prometheus）
+
+| OTel 仪器 | Prometheus 系列（histogram） |
+|-----------|------------------------------|
+| `llm.call.duration` (ms) | `llm_call_duration_milliseconds_bucket` |
+| `business.operation.duration` (ms) | `business_operation_duration_milliseconds_bucket` |
+| `http.server.request.duration` (s) | `http_server_request_duration_seconds_bucket` |
+| `db.client.operation.duration` (s) | `db_client_operation_duration_seconds_bucket` |
+| `http.client.request.duration` (s) | `http_client_request_duration_seconds_bucket` |
+
+Counter 示例：`llm_call_total`、`llm_tokens_input_total`。
+
+校验脚本（需 observability compose + 有流量）：
+
+```bash
+chmod +x scripts/verify_observability_metrics.sh
+./scripts/verify_observability_metrics.sh
+```
+
+## 验收清单（本地 E2E）
+
+- [ ] `OTEL_ENABLED=true`，启动 compose + API + Celery worker
+- [ ] 跑一条 WS 对话；Tempo 可见 `conversation.ws.process_turn`、`llm.chat_invoke`
+- [ ] 触发 memoir phase1；Tempo 可见 `memoir.phase1.*`、`memoir.story_pipeline.*`
+- [ ] Prometheus：`call_type` label 存在；真实 LLM 后 `llm_tokens_input_total` > 0
+- [ ] Loki：`|= "tid=<trace前12位>"` 能查到同次请求日志
+- [ ] `./scripts/verify_observability_metrics.sh` 通过
+- [ ] Grafana Alerting 页无 provisioning 错误（通知渠道可空）
+
+## 配置目录
+
+- [`deploy/observability/`](../deploy/observability/)：Collector、Tempo、Loki、Prometheus、Grafana provisioning
+- [`docker-compose.observability.yml`](../docker-compose.observability.yml)：本地 overlay
diff --git a/api/docs/部署指南.md b/api/docs/部署指南.md
index dda4786..75f3a1d 100644
--- a/api/docs/部署指南.md
+++ b/api/docs/部署指南.md
@@ -305,11 +305,13 @@ sudo journalctl -u life-echo-api -f
 
 ### 8. 监控与告警
 
+本地开发与预发可观测性栈（OpenTelemetry + Grafana LGTM）见 **[可观测性指南](observability.md)**。staging/production 全量接入为第二阶段（`docker-compose` profile）。
+
 #### 8.1 配置日志监控
 
 建议使用以下工具：
+- **Grafana + Loki + Tempo + Prometheus**（仓库内 `deploy/observability/`，推荐）
 - ELK Stack (Elasticsearch + Logstash + Kibana)
-- Grafana + Loki
 - 云服务商的日志服务
 
 #### 8.2 配置性能监控
diff --git a/api/internal-eval.sh b/api/internal-eval.sh
index 32130e2..3557df7 100755
--- a/api/internal-eval.sh
+++ b/api/internal-eval.sh
@@ -1,22 +1,18 @@
 #!/usr/bin/env bash
-# 在 development.sh 全栈之上附加 internal_main（默认 :8001）与 app-eval-web。
-# 只需一条命令，无需再并行跑两份脚本；共用同一份 Postgres/Redis/Celery（本脚本只起一个 Worker）。
+# 已合并入 development.sh（默认启动评测台 + 自动打开 Grafana / 评测 UI）。
+# 本脚本保留为兼容入口，行为与 ./development.sh 相同。
 #
-# 用法：cd api && ./internal-eval.sh
+# 若主站已在其他终端占用 :8000，仅附加评测 HTTP + 前端（不再起 Celery）：
+#   SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./development.sh
 #
-# 若主站已在其他终端由 ./development.sh 占用 :8000，仅多开评测 HTTP + 前端（不再起第二份 Celery）：
-#   SKIP_INFRA=1 SKIP_INSTALL=1 EVAL_ATTACH_ONLY=1 ./internal-eval.sh
-#
-# 兼容旧环境变量：SKIP_CELERY=1 等价于 EVAL_ATTACH_ONLY=1（仍要求 :8000 已有监听）。
-#
-# 其他可选变量与 development.sh 一致，例如：
-#   SKIP_INFRA=1  SKIP_INSTALL=1  START_EVAL_WEB=0  OPEN_EVAL_WEB=0
-#   INTERNAL_EVAL_PORT  EVAL_WEB_PORT  INTERNAL_EVAL_API_KEY
+# 兼容旧变量：SKIP_CELERY=1 等价于 EVAL_ATTACH_ONLY=1
 
 set -euo pipefail
 
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
+echo -e "\033[1;33m⚠ internal-eval.sh 已并入 development.sh，正在转发…\033[0m" >&2
+
 export LIFE_ECHO_WITH_INTERNAL_EVAL=1
 
 if [[ "${SKIP_CELERY:-}" == "1" ]]; then
diff --git a/api/pyproject.toml b/api/pyproject.toml
index cf056f0..95d2598 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -17,6 +17,15 @@ dependencies = [
     "langchain-openai>=1.1.11",
     "loguru>=0.7.3",
     "openai>=2.26.0",
+    "opentelemetry-api>=1.42.0",
+    "opentelemetry-exporter-otlp-proto-grpc>=1.42.0",
+    "opentelemetry-instrumentation-celery>=0.63b0",
+    "opentelemetry-instrumentation-fastapi>=0.63b0",
+    "opentelemetry-instrumentation-httpx>=0.63b0",
+    "opentelemetry-instrumentation-logging>=0.63b0",
+    "opentelemetry-instrumentation-redis>=0.63b0",
+    "opentelemetry-instrumentation-sqlalchemy>=0.63b0",
+    "opentelemetry-sdk>=1.42.0",
     "pgvector>=0.4.2",
     "pillow>=12.1.1",
     "psycopg[binary]>=3.2.0",
diff --git a/api/scripts/verify_observability_metrics.sh b/api/scripts/verify_observability_metrics.sh
new file mode 100755
index 0000000..1ea22c5
--- /dev/null
+++ b/api/scripts/verify_observability_metrics.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# 校验本地 Prometheus 是否已暴露 OTel 导出指标（需 observability compose 运行中）。
+set -euo pipefail
+
+PROM_URL="${PROMETHEUS_URL:-http://127.0.0.1:49090}"
+QUERY_ENDPOINT="${PROM_URL}/api/v1/query"
+
+check_metric() { # Usage: check_metric NAME; status 0 when Prometheus returns at least one series for NAME.
+  local name="$1" result
+  # -G with --data-urlencode sends the query URL-encoded; stderr is dropped so an unreachable Prometheus reports MISSING instead of a Python traceback.
+  result="$(curl -sfG --data-urlencode "query=${name}" "${QUERY_ENDPOINT}" | python3 -c "
+import json, sys
+data = json.load(sys.stdin)
+r = data.get('data', {}).get('result', [])
+print('ok' if r else 'missing')
+" 2>/dev/null || true)"
+  if [[ "${result}" != "ok" ]]; then
+    echo "MISSING: ${name}"
+    return 1
+  fi
+  echo "OK: ${name}"
+}
+
+echo "Checking Prometheus at ${PROM_URL} ..."
+fail=0
+for m in \
+  "llm_call_duration_milliseconds_bucket" \
+  "llm_call_total" \
+  "business_operation_duration_milliseconds_bucket" \
+  "http_server_request_duration_seconds_bucket"
+do
+  check_metric "${m}" || fail=1
+done
+
+if [[ "${fail}" -ne 0 ]]; then
+  echo ""
+  echo "Some metrics missing. Ensure OTEL_ENABLED=true, API/worker running, and traffic generated."
+  exit 1
+fi
+echo "All required metrics present."
diff --git a/api/tests/core/test_business_telemetry.py b/api/tests/core/test_business_telemetry.py
new file mode 100644
index 0000000..9ed81da
--- /dev/null
+++ b/api/tests/core/test_business_telemetry.py
@@ -0,0 +1,64 @@
+"""Business telemetry helpers (no real Collector required)."""
+
+from __future__ import annotations
+
+import pytest
+from opentelemetry import trace
+
+from app.core.business_telemetry import business_span
+from app.core.config import settings
+
+
+class TestBusinessSpan:
+    """Behaviour of ``business_span`` with telemetry disabled and enabled."""
+
+    @staticmethod
+    def _install_in_memory_tracer(monkeypatch: pytest.MonkeyPatch):
+        """Enable OTel, patch ``get_tracer`` to an in-memory provider and return its exporter."""
+        from opentelemetry.sdk.trace import TracerProvider
+        from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+        from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+            InMemorySpanExporter,
+        )
+
+        monkeypatch.setattr(settings, "otel_enabled", True)
+        memory_exporter = InMemorySpanExporter()
+        tracer_provider = TracerProvider()
+        tracer_provider.add_span_processor(SimpleSpanProcessor(memory_exporter))
+        # Replace the module-level get_tracer so spans land in the in-memory exporter.
+        monkeypatch.setattr(
+            "app.core.business_telemetry.get_tracer",
+            lambda _name: tracer_provider.get_tracer("test"),
+        )
+        return memory_exporter
+
+    def test_disabled_is_noop(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """With OTel disabled the context manager yields the invalid span."""
+        monkeypatch.setattr(settings, "otel_enabled", False)
+        with business_span("memoir.phase1", user_id="u1") as span:
+            assert span == trace.INVALID_SPAN
+
+    def test_filters_high_cardinality_attrs(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """``chapter_category`` is recorded as an attribute; ``user_id`` is not."""
+        memory_exporter = self._install_in_memory_tracer(monkeypatch)
+
+        with business_span(
+            "memoir.phase2",
+            user_id="user-123",
+            chapter_category="childhood",
+        ):
+            pass
+
+        # Only the first finished span is inspected.
+        finished = memory_exporter.get_finished_spans()
+        assert finished
+        recorded = dict(finished[0].attributes or {})
+        assert recorded.get("business.chapter_category") == "childhood"
+        assert "business.user_id" not in recorded
+
+    def test_enabled_yields_span(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """With OTel enabled the yielded span is recording."""
+        self._install_in_memory_tracer(monkeypatch)
+
+        with business_span("conversation.ws.process_turn") as span:
+            assert span.is_recording()
diff --git a/api/tests/core/test_llm_telemetry.py b/api/tests/core/test_llm_telemetry.py
new file mode 100644
index 0000000..05a3658
--- /dev/null
+++ b/api/tests/core/test_llm_telemetry.py
@@ -0,0 +1,118 @@
+"""LLM telemetry helpers (no real Collector required)."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import patch
+
+import pytest
+
+from app.core import llm_telemetry
+from app.core.config import settings
+
+
+class TestExtractTokenUsage:
+    """``extract_token_usage`` returns an ``(input, output)`` pair for each message shape."""
+
+    def test_usage_metadata_object(self) -> None:
+        usage = SimpleNamespace(input_tokens=10, output_tokens=4)
+        assert llm_telemetry.extract_token_usage(SimpleNamespace(usage_metadata=usage)) == (10, 4)
+
+    def test_response_metadata_dict(self) -> None:
+        meta = {"token_usage": {"prompt_tokens": 3, "completion_tokens": 7}}
+        reply = SimpleNamespace(usage_metadata=None, response_metadata=meta)
+        assert llm_telemetry.extract_token_usage(reply) == (3, 7)
+
+    def test_missing_usage_returns_zero(self) -> None:
+        assert llm_telemetry.extract_token_usage(SimpleNamespace()) == (0, 0)
+
+
+class TestOtelDisabledNoOp:
+    """Telemetry helpers must stay callable while ``settings.otel_enabled`` is False."""
+
+    def test_record_llm_completion_disabled(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Smoke check: recording a completion does not raise with OTel off."""
+        monkeypatch.setattr(settings, "otel_enabled", False)
+        llm_telemetry.record_llm_completion(
+            agent="Test",
+            provider="mock",
+            model="m",
+            duration_ms=1.0,
+            input_tokens=5,
+            output_tokens=2,
+        )
+
+    def test_langchain_invoke_span_disabled(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """The span context is still usable and reports outcome "ok" with OTel off."""
+        monkeypatch.setattr(settings, "otel_enabled", False)
+        with llm_telemetry.langchain_invoke_span(
+            agent="Test", provider="mock", model="m", call_type="chat"
+        ) as ctx:
+            token_usage = SimpleNamespace(input_tokens=1, output_tokens=1)
+            ctx["response"] = SimpleNamespace(usage_metadata=token_usage)
+            assert ctx["outcome"] == "ok"
+
+
+class TestLangchainInvokeSpanRecordsTokens:
+    def test_records_completion_with_tokens(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Token counts on ``ctx["response"]`` reach ``record_llm_completion``."""
+        monkeypatch.setattr(settings, "otel_enabled", True)
+        captured: list[dict] = []
+        spy = patch.object(
+            llm_telemetry, "record_llm_completion", side_effect=lambda **kw: captured.append(kw)
+        )
+
+        with (
+            spy,
+            llm_telemetry.langchain_invoke_span(
+                agent="TestAgent", provider="mock", model="m1", call_type="chat"
+            ) as ctx,
+        ):
+            usage = SimpleNamespace(input_tokens=11, output_tokens=5)
+            ctx["response"] = SimpleNamespace(usage_metadata=usage)
+
+        assert len(captured) == 1
+        kwargs = captured[0]
+        assert kwargs["input_tokens"] == 11
+        assert kwargs["output_tokens"] == 5
+        assert kwargs["agent"] == "TestAgent"
+
+
+class TestObserveAinvokeExtraAttributes:
+    @pytest.mark.asyncio
+    async def test_response_latency_on_span(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """The exported span has ``llm.response_latency_ms`` and the extra attributes."""
+        monkeypatch.setattr(settings, "otel_enabled", True)
+        from opentelemetry.sdk.trace import TracerProvider
+        from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+        from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+            InMemorySpanExporter,
+        )
+
+        exporter = InMemorySpanExporter()
+        provider = TracerProvider()
+        provider.add_span_processor(SimpleSpanProcessor(exporter))
+        monkeypatch.setattr(
+            "app.core.llm_telemetry.get_tracer",
+            lambda _name: provider.get_tracer("test"),
+        )
+
+        class _LLM:
+            async def ainvoke(self, messages: list) -> SimpleNamespace:
+                return SimpleNamespace(
+                    usage_metadata=SimpleNamespace(input_tokens=1, output_tokens=1)
+                )
+
+        await llm_telemetry.observe_ainvoke(
+            _LLM(),
+            [],
+            agent="Test",
+            provider="mock",
+            model="m",
+            extra_span_attributes={"llm.custom": "x"},
+        )
+        spans = exporter.get_finished_spans()
+        assert spans
+        attrs = dict(spans[-1].attributes or {})  # inspect the last exported span
+        assert "llm.response_latency_ms" in attrs
+        assert attrs.get("llm.custom") == "x"
diff --git a/api/uv.lock b/api/uv.lock
index 38daf95..1645a62 100644
--- a/api/uv.lock
+++ b/api/uv.lock
@@ -88,6 +88,15 @@ dependencies = [
     { name = "langchain-openai" },
     { name = "loguru" },
     { name = "openai" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-grpc" },
+    { name = "opentelemetry-instrumentation-celery" },
+    { name = "opentelemetry-instrumentation-fastapi" },
+    { name = "opentelemetry-instrumentation-httpx" },
+    { name = "opentelemetry-instrumentation-logging" },
+    { name = "opentelemetry-instrumentation-redis" },
+    { name = "opentelemetry-instrumentation-sqlalchemy" },
+    { name = "opentelemetry-sdk" },
     { name = "pgvector" },
     { name = "pillow" },
     { name = "psycopg", extra = ["binary"] },
@@ -129,6 +138,15 @@ requires-dist = [
     { name = "langchain-openai", specifier = ">=1.1.11" },
     { name = "loguru", specifier = ">=0.7.3" },
     { name = "openai", specifier = ">=2.26.0" },
+    { name = "opentelemetry-api", specifier = ">=1.42.0" },
+    { name = "opentelemetry-exporter-otlp-proto-grpc", specifier = ">=1.42.0" },
+    { name = "opentelemetry-instrumentation-celery", specifier = ">=0.63b0" },
+    { name = "opentelemetry-instrumentation-fastapi", specifier = ">=0.63b0" },
+    { name = "opentelemetry-instrumentation-httpx", specifier = ">=0.63b0" },
+    { name = "opentelemetry-instrumentation-logging", specifier = ">=0.63b0" },
+    { name = "opentelemetry-instrumentation-redis", specifier = ">=0.63b0" },
+    { name = "opentelemetry-instrumentation-sqlalchemy", specifier = ">=0.63b0" },
+    { name = "opentelemetry-sdk", specifier = ">=1.42.0" },
     { name = "pgvector", specifier = ">=0.4.2" },
     { name = "pillow", specifier = ">=12.1.1" },
     { name = "psycopg", extras = ["binary"], specifier = ">=3.2.0" },
@@ -156,6 +174,15 @@ dev = [
     { name = "ruff", specifier = ">=0.15.6" },
 ]
 
+[[package]]
+name = "asgiref"
+version = "3.11.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/63/40/f03da1264ae8f7cfdbf9146542e5e7e8100a4c66ab48e791df9a03d3f6c0/asgiref-3.11.1.tar.gz", hash = "sha256:5f184dc43b7e763efe848065441eac62229c9f7b0475f41f80e207a114eda4ce", size = 38550, upload-time = "2026-02-03T13:30:14.33Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/0a/a72d10ed65068e115044937873362e6e32fab1b7dce0046aeb224682c989/asgiref-3.11.1-py3-none-any.whl", hash = "sha256:e8667a091e69529631969fd45dc268fa79b99c92c5fcdda727757e52146ec133", size = 24345, upload-time = "2026-02-03T13:30:13.039Z" },
+]
+
 [[package]]
 name = "av"
 version = "16.1.0"
@@ -920,6 +947,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" },
 ]
 
+[[package]]
+name = "googleapis-common-protos"
+version = "1.75.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b5/c8/f439cffde755cffa462bfbb156278fa6f9d09119719af9814b858fd4f81f/googleapis_common_protos-1.75.0.tar.gz", hash = "sha256:53a062ff3c32552fbd62c11fe23768b78e4ddf0494d5e5fd97d3f4689c75fbbd", size = 151035, upload-time = "2026-05-07T08:04:49.423Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e7/c8/e2645aa8ed02fd4c7a2f59d68783b65b1f3cbdfe39a6308e156509d1fee8/googleapis_common_protos-1.75.0-py3-none-any.whl", hash = "sha256:961ed60399c457ceb0ee8f285a84c870aabc9c6a832b9d37bb281b5bebde43ed", size = 300631, upload-time = "2026-05-07T08:03:30.345Z" },
+]
+
 [[package]]
 name = "greenlet"
 version = "3.3.2"
@@ -954,6 +993,37 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/29/4b/45d90626aef8e65336bed690106d1382f7a43665e2249017e9527df8823b/greenlet-3.3.2-cp314-cp314t-win_amd64.whl", hash = "sha256:c04c5e06ec3e022cbfe2cd4a846e1d4e50087444f875ff6d2c2ad8445495cf1a", size = 237086, upload-time = "2026-02-20T20:20:45.786Z" },
 ]
 
+[[package]]
+name = "grpcio"
+version = "1.80.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2f/3a/7c3c25789e3f069e581dc342e03613c5b1cb012c4e8c7d9d5cf960a75856/grpcio-1.80.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:e9e408fc016dffd20661f0126c53d8a31c2821b5c13c5d67a0f5ed5de93319ad", size = 6017243, upload-time = "2026-03-30T08:47:40.075Z" },
+    { url = "https://files.pythonhosted.org/packages/04/19/21a9806eb8240e174fd1ab0cd5b9aa948bb0e05c2f2f55f9d5d7405e6d08/grpcio-1.80.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:92d787312e613754d4d8b9ca6d3297e69994a7912a32fa38c4c4e01c272974b0", size = 12010840, upload-time = "2026-03-30T08:47:43.11Z" },
+    { url = "https://files.pythonhosted.org/packages/18/3a/23347d35f76f639e807fb7a36fad3068aed100996849a33809591f26eca6/grpcio-1.80.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac393b58aa16991a2f1144ec578084d544038c12242da3a215966b512904d0f", size = 6567644, upload-time = "2026-03-30T08:47:46.806Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/40/96e07ecb604a6a67ae6ab151e3e35b132875d98bc68ec65f3e5ab3e781d7/grpcio-1.80.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:68e5851ac4b9afe07e7f84483803ad167852570d65326b34d54ca560bfa53fb6", size = 7277830, upload-time = "2026-03-30T08:47:49.643Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/e2/da1506ecea1f34a5e365964644b35edef53803052b763ca214ba3870c856/grpcio-1.80.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:873ff5d17d68992ef6605330127425d2fc4e77e612fa3c3e0ed4e668685e3140", size = 6783216, upload-time = "2026-03-30T08:47:52.817Z" },
+    { url = "https://files.pythonhosted.org/packages/44/83/3b20ff58d0c3b7f6caaa3af9a4174d4023701df40a3f39f7f1c8e7c48f9d/grpcio-1.80.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2bea16af2750fd0a899bf1abd9022244418b55d1f37da2202249ba4ba673838d", size = 7385866, upload-time = "2026-03-30T08:47:55.687Z" },
+    { url = "https://files.pythonhosted.org/packages/47/45/55c507599c5520416de5eefecc927d6a0d7af55e91cfffb2e410607e5744/grpcio-1.80.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba0db34f7e1d803a878284cd70e4c63cb6ae2510ba51937bf8f45ba997cefcf7", size = 8391602, upload-time = "2026-03-30T08:47:58.303Z" },
+    { url = "https://files.pythonhosted.org/packages/10/bb/dd06f4c24c01db9cf11341b547d0a016b2c90ed7dbbb086a5710df7dd1d7/grpcio-1.80.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8eb613f02d34721f1acf3626dfdb3545bd3c8505b0e52bf8b5710a28d02e8aa7", size = 7826752, upload-time = "2026-03-30T08:48:01.311Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/1e/9d67992ba23371fd63d4527096eb8c6b76d74d52b500df992a3343fd7251/grpcio-1.80.0-cp313-cp313-win32.whl", hash = "sha256:93b6f823810720912fd131f561f91f5fed0fda372b6b7028a2681b8194d5d294", size = 4142310, upload-time = "2026-03-30T08:48:04.594Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/e6/283326a27da9e2c3038bc93eeea36fb118ce0b2d03922a9cda6688f53c5b/grpcio-1.80.0-cp313-cp313-win_amd64.whl", hash = "sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50", size = 4882833, upload-time = "2026-03-30T08:48:07.363Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/6d/e65307ce20f5a09244ba9e9d8476e99fb039de7154f37fb85f26978b59c3/grpcio-1.80.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:3d4147a97c8344d065d01bbf8b6acec2cf86fb0400d40696c8bdad34a64ffc0e", size = 6017376, upload-time = "2026-03-30T08:48:10.005Z" },
+    { url = "https://files.pythonhosted.org/packages/69/10/9cef5d9650c72625a699c549940f0abb3c4bfdb5ed45a5ce431f92f31806/grpcio-1.80.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8e11f167935b3eb089ac9038e1a063e6d7dbe995c0bb4a661e614583352e76f", size = 12018133, upload-time = "2026-03-30T08:48:12.927Z" },
+    { url = "https://files.pythonhosted.org/packages/04/82/983aabaad82ba26113caceeb9091706a0696b25da004fe3defb5b346e15b/grpcio-1.80.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f14b618fc30de822681ee986cfdcc2d9327229dc4c98aed16896761cacd468b9", size = 6574748, upload-time = "2026-03-30T08:48:16.386Z" },
+    { url = "https://files.pythonhosted.org/packages/07/d7/031666ef155aa0bf399ed7e19439656c38bbd143779ae0861b038ce82abd/grpcio-1.80.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4ed39fbdcf9b87370f6e8df4e39ca7b38b3e5e9d1b0013c7b6be9639d6578d14", size = 7277711, upload-time = "2026-03-30T08:48:19.627Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/43/f437a78f7f4f1d311804189e8f11fb311a01049b2e08557c1068d470cb2e/grpcio-1.80.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2dcc70e9f0ba987526e8e8603a610fb4f460e42899e74e7a518bf3c68fe1bf05", size = 6785372, upload-time = "2026-03-30T08:48:22.373Z" },
+    { url = "https://files.pythonhosted.org/packages/93/3d/f6558e9c6296cb4227faa5c43c54a34c68d32654b829f53288313d16a86e/grpcio-1.80.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:448c884b668b868562b1bda833c5fce6272d26e1926ec46747cda05741d302c1", size = 7395268, upload-time = "2026-03-30T08:48:25.638Z" },
+    { url = "https://files.pythonhosted.org/packages/06/21/0fdd77e84720b08843c371a2efa6f2e19dbebf56adc72df73d891f5506f0/grpcio-1.80.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a1dc80fe55685b4a543555e6eef975303b36c8db1023b1599b094b92aa77965f", size = 8392000, upload-time = "2026-03-30T08:48:28.974Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/68/67f4947ed55d2e69f2cc199ab9fd85e0a0034d813bbeef84df6d2ba4d4b7/grpcio-1.80.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:31b9ac4ad1aa28ffee5503821fafd09e4da0a261ce1c1281c6c8da0423c83b6e", size = 7828477, upload-time = "2026-03-30T08:48:32.054Z" },
+    { url = "https://files.pythonhosted.org/packages/44/b6/8d4096691b2e385e8271911a0de4f35f0a6c7d05aff7098e296c3de86939/grpcio-1.80.0-cp314-cp314-win32.whl", hash = "sha256:367ce30ba67d05e0592470428f0ec1c31714cab9ef19b8f2e37be1f4c7d32fae", size = 4218563, upload-time = "2026-03-30T08:48:34.538Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/8c/bbe6baf2557262834f2070cf668515fa308b2d38a4bbf771f8f7872a7036/grpcio-1.80.0-cp314-cp314-win_amd64.whl", hash = "sha256:3b01e1f5464c583d2f567b2e46ff0d516ef979978f72091fd81f5ab7fa6e2e7f", size = 5019457, upload-time = "2026-03-30T08:48:37.308Z" },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -1523,6 +1593,218 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c6/2e/3f73e8ca53718952222cacd0cf7eecc9db439d020f0c1fe7ae717e4e199a/openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f", size = 1136409, upload-time = "2026-03-05T23:17:34.072Z" },
 ]
 
+[[package]]
+name = "opentelemetry-api"
+version = "1.42.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/86/ca/25288069c399be6769159d9fb7b1190b603537d82aad2fa2746a0cc2c8c6/opentelemetry_api-1.42.0.tar.gz", hash = "sha256:ea84c893ad177791d138e0349d6ceebd8d3bf006440900400ce220008dafc372", size = 72300, upload-time = "2026-05-19T09:46:29.885Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1b/0b/be5daf659b82b525338fde371dfcfab09b606a19bb5620c37076964710ec/opentelemetry_api-1.42.0-py3-none-any.whl", hash = "sha256:558d88f88192a973579910ef6f2c13db47a268d5ec2e53e83e50e74a39a02922", size = 61310, upload-time = "2026-05-19T09:46:06.561Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-common"
+version = "1.42.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-proto" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/76/a9/1496f27ecdfc7d504eac80f5e16474ee9d47cd08cda1f2917b58cf1c299c/opentelemetry_exporter_otlp_proto_common-1.42.0.tar.gz", hash = "sha256:c7a1a61f3a4c4dfa83127353edb1c75b873289d9ee42379db46eb835963b72e3", size = 21430, upload-time = "2026-05-19T09:46:32.838Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8b/7b/1542eb6e3d941a7dd93648d485b7c8495bc2841a2bb7dd5f394f370cf607/opentelemetry_exporter_otlp_proto_common-1.42.0-py3-none-any.whl", hash = "sha256:92de67f096c9200770f16fbdb63b96fb6061d604b4bc266726d8355caeb864e8", size = 17328, upload-time = "2026-05-19T09:46:11.291Z" },
+]
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-grpc"
+version = "1.42.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "grpcio" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-exporter-otlp-proto-common" },
+    { name = "opentelemetry-proto" },
+    { name = "opentelemetry-sdk" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/01/6a/63812e4f67d3658b21e94bc890b67296951f3aa8f6950fdf735f763500e5/opentelemetry_exporter_otlp_proto_grpc-1.42.0.tar.gz", hash = "sha256:75eac4e9d0bd69bea8199d75dfeb585cce05a9baa8215d1f7aad9e3583bf5ef9", size = 27136, upload-time = "2026-05-19T09:46:33.594Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4d/e9/308c4c03b536005a1443bee0d9f06de38aad8b94f59f58ac688ead7a8cf9/opentelemetry_exporter_otlp_proto_grpc-1.42.0-py3-none-any.whl", hash = "sha256:5d6d1691586f2e656fd14187f2f2f5fa06e94834e1acdce71edcbbe35730b31d", size = 19614, upload-time = "2026-05-19T09:46:12.331Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "packaging" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2a/2d/322d464f4105966fb8555f871a84f43e821ce9aaf64ecae9586e9691c6a2/opentelemetry_instrumentation-0.63b0.tar.gz", hash = "sha256:80a339ef030a8d0fd1962375a9801dd31954e5063d74c00bc3d4e6581f43bab1", size = 41083, upload-time = "2026-05-19T09:47:06.194Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/45/a38e74da3f1b5c82c97289da91d978caa04321877f0ab170fc620a0753f2/opentelemetry_instrumentation-0.63b0-py3-none-any.whl", hash = "sha256:984b18763b652a881ac5a596098d89923f74cf53a658c2dde660387e018147ca", size = 35574, upload-time = "2026-05-19T09:46:07.257Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-asgi"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "asgiref" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "opentelemetry-util-http" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b2/ba/dd540189230d211898ccc4df899874bc0d84f5c54a1e07a13a2bde606a57/opentelemetry_instrumentation_asgi-0.63b0.tar.gz", hash = "sha256:e201eed7616f7da0840adf8ab8c5ea64db7ab19b920373b38983e2bac8d3645d", size = 26154, upload-time = "2026-05-19T09:47:10.023Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ad/4f/caa793347febb9dae45f3d03d8bac04bf0752170a19c53016a0a91a214a0/opentelemetry_instrumentation_asgi-0.63b0-py3-none-any.whl", hash = "sha256:4e89555c110677226b9ca1734eda248360916bccf0ebadf8db8baf0015c9efca", size = 15907, upload-time = "2026-05-19T09:46:13.675Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-celery"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/67/a7/82e696152b65178d13f9ee2241cadb72b7b908603c692a8519f0c0295e35/opentelemetry_instrumentation_celery-0.63b0.tar.gz", hash = "sha256:c02371fe46073b57ecf1287d833bfe00c02f79ba600549752ae7bd4fbcd8f06a", size = 15520, upload-time = "2026-05-19T09:47:15.445Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f7/9a/03f9168c0a07a0441129a9a426405f6b0efc3804f4c0c1e200f0a3a7c568/opentelemetry_instrumentation_celery-0.63b0-py3-none-any.whl", hash = "sha256:732d3a0b883cb777d8e0213ebbfa49fe8a8ee987ea49a6d45ec1351cb09e8b93", size = 13170, upload-time = "2026-05-19T09:46:21.78Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-fastapi"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-instrumentation-asgi" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "opentelemetry-util-http" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/51/73/6e44cd21b17d4affd41a621804421d476940b1dab352254b1a9c08a08df6/opentelemetry_instrumentation_fastapi-0.63b0.tar.gz", hash = "sha256:5117df842d0ce47e1fb9eb3c2ad2a7594bd139b129de9f3fa1ce5b28e970c046", size = 25387, upload-time = "2026-05-19T09:47:20.726Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/2d/f869b45eddbb7332cce7a863a4d1e758d58a9c890db6dbf0fe6aedd3eda1/opentelemetry_instrumentation_fastapi-0.63b0-py3-none-any.whl", hash = "sha256:ed43d2358164df83d811a8d69a7578cad3ab66fde4db027296c1ee20f703e3f0", size = 12797, upload-time = "2026-05-19T09:46:28.885Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-httpx"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "opentelemetry-util-http" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d3/22/21c1d745b82eb28c41c4f0635be1d7b9d9d77bbe0b6c718d7e7d7fcc6f20/opentelemetry_instrumentation_httpx-0.63b0.tar.gz", hash = "sha256:aafb9e336be48b4c0c19ae1f003621e23d75b3560797d42baa656dcc3a555266", size = 23556, upload-time = "2026-05-19T09:47:22.997Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/af/f1/0c9ba71e48129390a9db60ec92ab0149cf97d1a983c11a77e1a04ec5dc7b/opentelemetry_instrumentation_httpx-0.63b0-py3-none-any.whl", hash = "sha256:e4359d317a3313fa8607b7ab4c47088a428856349363c754013fbd595f60fb23", size = 16338, upload-time = "2026-05-19T09:46:32.015Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-logging"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/11/a8/e5ae9bf71babc3589252d826ffd212c004582a42699ab24245ecf8004f4a/opentelemetry_instrumentation_logging-0.63b0.tar.gz", hash = "sha256:c4b875cdd712e01e2a0b904d9c9248f4f03a8f41a8acd64000984359841b98d8", size = 19824, upload-time = "2026-05-19T09:47:24.771Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d7/1c/f61d7aa67ecf4ecc04bba5a276f6dc67f0803f6d0a61eceb585f3bb2fcb9/opentelemetry_instrumentation_logging-0.63b0-py3-none-any.whl", hash = "sha256:8fe17ed310de42683dc585f1bf6af6ccaa3192c997c431c57177e15bee6885f5", size = 15992, upload-time = "2026-05-19T09:46:35.553Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-redis"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9c/bc/98f3355db9dd0f2885f168a2544739783349df7ed495cba2c06dddb3c183/opentelemetry_instrumentation_redis-0.63b0.tar.gz", hash = "sha256:a369c140eb7cdd8b59192255eb4e361755dc5353be5aa0ff25a2cbf964fb993c", size = 16713, upload-time = "2026-05-19T09:47:32.264Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9c/6a/b9955b1e659793e9e5e787e90d6b203b17fcf2b88811794fe1efa584ee94/opentelemetry_instrumentation_redis-0.63b0-py3-none-any.whl", hash = "sha256:61e1c18f1f87d2ebec1ed69dd187e233c4482ae528e02929150ef2699d15120a", size = 14538, upload-time = "2026-05-19T09:46:46.242Z" },
+]
+
+[[package]]
+name = "opentelemetry-instrumentation-sqlalchemy"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "packaging" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/36/72/0def937531c0e7a423af06cbffaf235caea7af0275082c6bca13a25701ec/opentelemetry_instrumentation_sqlalchemy-0.63b0.tar.gz", hash = "sha256:b854ac9fd5707a8f79dc9b252cdec6873217e5a6e7e5fdb43dca6858a26342cb", size = 18007, upload-time = "2026-05-19T09:47:34.518Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/91/98/eb7430900f683fd6cec4745736bc69ca7260442b6b20ad05194abe97a187/opentelemetry_instrumentation_sqlalchemy-0.63b0-py3-none-any.whl", hash = "sha256:6a31bf004798f8eabb74f75e1d90cf081c7d470933867be6a5c8c985925ddb3e", size = 14410, upload-time = "2026-05-19T09:46:49.328Z" },
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "1.42.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/71/2c/7c56a19498b46da4c54dc4e765c95d17f8fec2ba86bec1817b41ae635360/opentelemetry_proto-1.42.0.tar.gz", hash = "sha256:5d56a9067b631ea931a135d7b86428ae99649f591d4db69b9fc8c8e0465fce65", size = 45841, upload-time = "2026-05-19T09:46:42.058Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2d/ad/ff5f619a04cddb4936ead0dd8f590c5b373c5b4b9f2eef555e9d3d951ccb/opentelemetry_proto-1.42.0-py3-none-any.whl", hash = "sha256:2c0716a37e5c12efef37cbd01906d649b7fb85c85ac687518d0bd28527c6498e", size = 71779, upload-time = "2026-05-19T09:46:24.536Z" },
+]
+
+[[package]]
+name = "opentelemetry-sdk"
+version = "1.42.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/c9/dabaaf1c754a57b82b5a36aeca3806d92c1877ccfb12a697b65f88bf027c/opentelemetry_sdk-1.42.0.tar.gz", hash = "sha256:2479e462cc69357825c2c847ce4a601bc1b17e1279aa7f80d3490f0ae614d0e5", size = 239072, upload-time = "2026-05-19T09:46:42.992Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/7d/16bf9a9d42ebbd1679e0cda018d57a0712f3b6f6f1e7ae5ef3c7ee5927c0/opentelemetry_sdk-1.42.0-py3-none-any.whl", hash = "sha256:ec4a4f69e15220b3d7bccd93217aac745682bb6435b9381f7bb44cb7e07b4f2b", size = 170879, upload-time = "2026-05-19T09:46:25.871Z" },
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/20/f8/be4625838aae098c2f9fbdc062a1b3128ebb9e799b891b654ee8cad94897/opentelemetry_semantic_conventions-0.63b0.tar.gz", hash = "sha256:cfea295264654fa324fcef24aa56fb1836fdc0da27db128645dc6aa76115cc6c", size = 148333, upload-time = "2026-05-19T09:46:44.01Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8f/6f/8d0ce225b8fdbb72c97cf4130107d861eafcb3d8e5c3f5891e8556177316/opentelemetry_semantic_conventions-0.63b0-py3-none-any.whl", hash = "sha256:1f3962732b04f43e4fef28173c9a3615b8847b4b2d6386fdc085361b29875ab9", size = 203712, upload-time = "2026-05-19T09:46:27.569Z" },
+]
+
+[[package]]
+name = "opentelemetry-util-http"
+version = "0.63b0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/cf/0b53c5fe1113fb01e23c6c88b66d8289f979e61cece444576b286a3415fd/opentelemetry_util_http-0.63b0.tar.gz", hash = "sha256:401ddd686cd943ef801b9384b0722b904250f6bf3906951ce4f27bb6b63b04a3", size = 11101, upload-time = "2026-05-19T09:47:42.885Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6e/8c/7fd6f06139cca88a6341bebf2b01f3e97bb8fd8d12e7d3ad3d2ad88b8c49/opentelemetry_util_http-0.63b0-py3-none-any.whl", hash = "sha256:80536361b6348e57503cdae8c1b1be79574d14c30e879367336c5a076fd4f673", size = 8209, upload-time = "2026-05-19T09:47:01.712Z" },
+]
+
 [[package]]
 name = "orjson"
 version = "3.11.7"
@@ -1693,17 +1975,17 @@ wheels = [
 
 [[package]]
 name = "protobuf"
-version = "7.34.0"
+version = "6.33.6"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f2/00/04a2ab36b70a52d0356852979e08b44edde0435f2115dc66e25f2100f3ab/protobuf-7.34.0.tar.gz", hash = "sha256:3871a3df67c710aaf7bb8d214cc997342e63ceebd940c8c7fc65c9b3d697591a", size = 454726, upload-time = "2026-02-27T00:30:25.421Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/13/c4/6322ab5c8f279c4c358bc14eb8aefc0550b97222a39f04eb3c1af7a830fa/protobuf-7.34.0-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:8e329966799f2c271d5e05e236459fe1cbfdb8755aaa3b0914fa60947ddea408", size = 429248, upload-time = "2026-02-27T00:30:14.924Z" },
-    { url = "https://files.pythonhosted.org/packages/45/99/b029bbbc61e8937545da5b79aa405ab2d9cf307a728f8c9459ad60d7a481/protobuf-7.34.0-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:9d7a5005fb96f3c1e64f397f91500b0eb371b28da81296ae73a6b08a5b76cdd6", size = 325753, upload-time = "2026-02-27T00:30:17.247Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/79/09f02671eb75b251c5550a1c48e7b3d4b0623efd7c95a15a50f6f9fc1e2e/protobuf-7.34.0-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:4a72a8ec94e7a9f7ef7fe818ed26d073305f347f8b3b5ba31e22f81fd85fca02", size = 340200, upload-time = "2026-02-27T00:30:18.672Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/57/89727baef7578897af5ed166735ceb315819f1c184da8c3441271dbcfde7/protobuf-7.34.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:964cf977e07f479c0697964e83deda72bcbc75c3badab506fb061b352d991b01", size = 324268, upload-time = "2026-02-27T00:30:20.088Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/3e/38ff2ddee5cc946f575c9d8cc822e34bde205cf61acf8099ad88ef19d7d2/protobuf-7.34.0-cp310-abi3-win32.whl", hash = "sha256:f791ec509707a1d91bd02e07df157e75e4fb9fbdad12a81b7396201ec244e2e3", size = 426628, upload-time = "2026-02-27T00:30:21.555Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/71/7c32eaf34a61a1bae1b62a2ac4ffe09b8d1bb0cf93ad505f42040023db89/protobuf-7.34.0-cp310-abi3-win_amd64.whl", hash = "sha256:9f9079f1dde4e32342ecbd1c118d76367090d4aaa19da78230c38101c5b3dd40", size = 437901, upload-time = "2026-02-27T00:30:22.836Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/e7/14dc9366696dcb53a413449881743426ed289d687bcf3d5aee4726c32ebb/protobuf-7.34.0-py3-none-any.whl", hash = "sha256:e3b914dd77fa33fa06ab2baa97937746ab25695f389869afdf03e81f34e45dc7", size = 170716, upload-time = "2026-02-27T00:30:23.994Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" },
+    { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" },
+    { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" },
 ]
 
 [[package]]
@@ -2869,6 +3151,59 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
 ]
 
+[[package]]
+name = "wrapt"
+version = "2.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/7a/d936840735c828b38d26a854e85d5338894cda544cb7a85a9d5b8b9c4df7/wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b", size = 61259, upload-time = "2026-03-06T02:53:41.922Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/88/9a9b9a90ac8ca11c2fdb6a286cb3a1fc7dd774c00ed70929a6434f6bc634/wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e", size = 61851, upload-time = "2026-03-06T02:52:48.672Z" },
+    { url = "https://files.pythonhosted.org/packages/03/a9/5b7d6a16fd6533fed2756900fc8fc923f678179aea62ada6d65c92718c00/wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb", size = 121446, upload-time = "2026-03-06T02:54:14.013Z" },
+    { url = "https://files.pythonhosted.org/packages/45/bb/34c443690c847835cfe9f892be78c533d4f32366ad2888972c094a897e39/wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca", size = 123056, upload-time = "2026-03-06T02:54:10.829Z" },
+    { url = "https://files.pythonhosted.org/packages/93/b9/ff205f391cb708f67f41ea148545f2b53ff543a7ac293b30d178af4d2271/wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267", size = 117359, upload-time = "2026-03-06T02:53:03.623Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/3d/1ea04d7747825119c3c9a5e0874a40b33594ada92e5649347c457d982805/wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f", size = 121479, upload-time = "2026-03-06T02:53:45.844Z" },
+    { url = "https://files.pythonhosted.org/packages/78/cc/ee3a011920c7a023b25e8df26f306b2484a531ab84ca5c96260a73de76c0/wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8", size = 116271, upload-time = "2026-03-06T02:54:46.356Z" },
+    { url = "https://files.pythonhosted.org/packages/98/fd/e5ff7ded41b76d802cf1191288473e850d24ba2e39a6ec540f21ae3b57cb/wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413", size = 120573, upload-time = "2026-03-06T02:52:50.163Z" },
+    { url = "https://files.pythonhosted.org/packages/47/c5/242cae3b5b080cd09bacef0591691ba1879739050cc7c801ff35c8886b66/wrapt-2.1.2-cp313-cp313-win32.whl", hash = "sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6", size = 58205, upload-time = "2026-03-06T02:53:47.494Z" },
+    { url = "https://files.pythonhosted.org/packages/12/69/c358c61e7a50f290958809b3c61ebe8b3838ea3e070d7aac9814f95a0528/wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1", size = 60452, upload-time = "2026-03-06T02:53:30.038Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/66/c8a6fcfe321295fd8c0ab1bd685b5a01462a9b3aa2f597254462fc2bc975/wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf", size = 58842, upload-time = "2026-03-06T02:52:52.114Z" },
+    { url = "https://files.pythonhosted.org/packages/da/55/9c7052c349106e0b3f17ae8db4b23a691a963c334de7f9dbd60f8f74a831/wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b", size = 63075, upload-time = "2026-03-06T02:53:19.108Z" },
+    { url = "https://files.pythonhosted.org/packages/09/a8/ce7b4006f7218248dd71b7b2b732d0710845a0e49213b18faef64811ffef/wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18", size = 63719, upload-time = "2026-03-06T02:54:33.452Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/e5/2ca472e80b9e2b7a17f106bb8f9df1db11e62101652ce210f66935c6af67/wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d", size = 152643, upload-time = "2026-03-06T02:52:42.721Z" },
+    { url = "https://files.pythonhosted.org/packages/36/42/30f0f2cefca9d9cbf6835f544d825064570203c3e70aa873d8ae12e23791/wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015", size = 158805, upload-time = "2026-03-06T02:54:25.441Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/67/d08672f801f604889dcf58f1a0b424fe3808860ede9e03affc1876b295af/wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92", size = 145990, upload-time = "2026-03-06T02:53:57.456Z" },
+    { url = "https://files.pythonhosted.org/packages/68/a7/fd371b02e73babec1de6ade596e8cd9691051058cfdadbfd62a5898f3295/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf", size = 155670, upload-time = "2026-03-06T02:54:55.309Z" },
+    { url = "https://files.pythonhosted.org/packages/86/2d/9fe0095dfdb621009f40117dcebf41d7396c2c22dca6eac779f4c007b86c/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67", size = 144357, upload-time = "2026-03-06T02:54:24.092Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/b6/ec7b4a254abbe4cde9fa15c5d2cca4518f6b07d0f1b77d4ee9655e30280e/wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a", size = 150269, upload-time = "2026-03-06T02:53:31.268Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/6b/2fabe8ebf148f4ee3c782aae86a795cc68ffe7d432ef550f234025ce0cfa/wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd", size = 59894, upload-time = "2026-03-06T02:54:15.391Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/fb/9ba66fc2dedc936de5f8073c0217b5d4484e966d87723415cc8262c5d9c2/wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f", size = 63197, upload-time = "2026-03-06T02:54:41.943Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/1c/012d7423c95d0e337117723eb8ecf73c622ce15a97847e84cf3f8f26cd7e/wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = "sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679", size = 60363, upload-time = "2026-03-06T02:54:48.093Z" },
+    { url = "https://files.pythonhosted.org/packages/39/25/e7ea0b417db02bb796182a5316398a75792cd9a22528783d868755e1f669/wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9", size = 61418, upload-time = "2026-03-06T02:53:55.706Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/0f/fa539e2f6a770249907757eaeb9a5ff4deb41c026f8466c1c6d799088a9b/wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9", size = 61914, upload-time = "2026-03-06T02:52:53.37Z" },
+    { url = "https://files.pythonhosted.org/packages/53/37/02af1867f5b1441aaeda9c82deed061b7cd1372572ddcd717f6df90b5e93/wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e", size = 120417, upload-time = "2026-03-06T02:54:30.74Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/b7/0138a6238c8ba7476c77cf786a807f871672b37f37a422970342308276e7/wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c", size = 122797, upload-time = "2026-03-06T02:54:51.539Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/ad/819ae558036d6a15b7ed290d5b14e209ca795dd4da9c58e50c067d5927b0/wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a", size = 117350, upload-time = "2026-03-06T02:54:37.651Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/2d/afc18dc57a4600a6e594f77a9ae09db54f55ba455440a54886694a84c71b/wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90", size = 121223, upload-time = "2026-03-06T02:54:35.221Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/5b/5ec189b22205697bc56eb3b62aed87a1e0423e9c8285d0781c7a83170d15/wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586", size = 116287, upload-time = "2026-03-06T02:54:19.654Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/2d/f84939a7c9b5e6cdd8a8d0f6a26cabf36a0f7e468b967720e8b0cd2bdf69/wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19", size = 119593, upload-time = "2026-03-06T02:54:16.697Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/fe/ccd22a1263159c4ac811ab9374c061bcb4a702773f6e06e38de5f81a1bdc/wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508", size = 58631, upload-time = "2026-03-06T02:53:06.498Z" },
+    { url = "https://files.pythonhosted.org/packages/65/0a/6bd83be7bff2e7efaac7b4ac9748da9d75a34634bbbbc8ad077d527146df/wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04", size = 60875, upload-time = "2026-03-06T02:53:50.252Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/c0/0b3056397fe02ff80e5a5d72d627c11eb885d1ca78e71b1a5c1e8c7d45de/wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575", size = 59164, upload-time = "2026-03-06T02:53:59.128Z" },
+    { url = "https://files.pythonhosted.org/packages/71/ed/5d89c798741993b2371396eb9d4634f009ff1ad8a6c78d366fe2883ea7a6/wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb", size = 63163, upload-time = "2026-03-06T02:52:54.873Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/8c/05d277d182bf36b0a13d6bd393ed1dec3468a25b59d01fba2dd70fe4d6ae/wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22", size = 63723, upload-time = "2026-03-06T02:52:56.374Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/27/6c51ec1eff4413c57e72d6106bb8dec6f0c7cdba6503d78f0fa98767bcc9/wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596", size = 152652, upload-time = "2026-03-06T02:53:23.79Z" },
+    { url = "https://files.pythonhosted.org/packages/db/4c/d7dd662d6963fc7335bfe29d512b02b71cdfa23eeca7ab3ac74a67505deb/wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044", size = 158807, upload-time = "2026-03-06T02:53:35.742Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/4d/1e5eea1a78d539d346765727422976676615814029522c76b87a95f6bcdd/wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b", size = 146061, upload-time = "2026-03-06T02:52:57.574Z" },
+    { url = "https://files.pythonhosted.org/packages/89/bc/62cabea7695cd12a288023251eeefdcb8465056ddaab6227cb78a2de005b/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf", size = 155667, upload-time = "2026-03-06T02:53:39.422Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/99/6f2888cd68588f24df3a76572c69c2de28287acb9e1972bf0c83ce97dbc1/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2", size = 144392, upload-time = "2026-03-06T02:54:22.41Z" },
+    { url = "https://files.pythonhosted.org/packages/40/51/1dfc783a6c57971614c48e361a82ca3b6da9055879952587bc99fe1a7171/wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3", size = 150296, upload-time = "2026-03-06T02:54:07.848Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/38/cbb8b933a0201076c1f64fc42883b0023002bdc14a4964219154e6ff3350/wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7", size = 60539, upload-time = "2026-03-06T02:54:00.594Z" },
+    { url = "https://files.pythonhosted.org/packages/82/dd/e5176e4b241c9f528402cebb238a36785a628179d7d8b71091154b3e4c9e/wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5", size = 63969, upload-time = "2026-03-06T02:54:39Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/99/79f17046cf67e4a95b9987ea129632ba8bcec0bc81f3fb3d19bdb0bd60cd/wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00", size = 60554, upload-time = "2026-03-06T02:53:14.132Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993, upload-time = "2026-03-06T02:53:12.905Z" },
+]
+
 [[package]]
 name = "xmltodict"
 version = "1.0.4"
diff --git a/app-expo/.env.example b/app-expo/.env.example
index f525d74..f952c6c 100644
--- a/app-expo/.env.example
+++ b/app-expo/.env.example
@@ -20,8 +20,12 @@
 # EXPO_PUBLIC_API_URL=http://127.0.0.1:8000
 # EXPO_PUBLIC_WS_URL=ws://127.0.0.1:8000
 
-# --- staging ---
+# --- staging（必填，无默认值；示例见 env/staging）---
 # APP_VARIANT=staging
 # EXPO_PUBLIC_APP_VARIANT=staging
-EXPO_PUBLIC_API_URL=https://your-api.example.com
-EXPO_PUBLIC_WS_URL=wss://your-api.example.com
+# EXPO_PUBLIC_API_URL=http://your-staging-host:8000
+# EXPO_PUBLIC_WS_URL=ws://your-staging-host:8000
+
+# --- production ---
+# EXPO_PUBLIC_API_URL=https://your-api.example.com
+# EXPO_PUBLIC_WS_URL=wss://your-api.example.com
diff --git a/app-expo/app.config.ts b/app-expo/app.config.ts
index 3931de8..602cfe1 100644
--- a/app-expo/app.config.ts
+++ b/app-expo/app.config.ts
@@ -28,7 +28,16 @@ const LOCALES: Record<string, LocaleMessages> = {
 
 const SUPPORTED_LOCALES = ['zh', 'en'] as const;
 const PRIMARY_LOCALE = process.env.EXPO_PUBLIC_PRIMARY_LOCALE ?? 'zh';
-const API_BASE_URL = process.env.EXPO_PUBLIC_API_URL ?? '';
+const API_BASE_URL = process.env.EXPO_PUBLIC_API_URL?.trim() ?? '';
+const WS_BASE_URL = process.env.EXPO_PUBLIC_WS_URL?.trim() ?? '';
+
+if (!API_BASE_URL || !WS_BASE_URL) {
+  throw new Error(
+    '[app.config] Missing EXPO_PUBLIC_API_URL or EXPO_PUBLIC_WS_URL. ' +
+      'Run `npm run use-env -- <development|staging|production>` in app-expo before prebuild or Metro.',
+  );
+}
+
 const ALLOW_INSECURE_HTTP = API_BASE_URL.startsWith('http://');
 
 const APP_VARIANT =
@@ -176,7 +185,14 @@ export default ({ config }: ConfigContext): ExpoConfig => {
         './plugins/withAndroidCleartextTraffic',
         { enabled: ALLOW_INSECURE_HTTP },
       ],
-      ['./plugins/withIosInsecureHttp', { enabled: ALLOW_INSECURE_HTTP }],
+      [
+        './plugins/withIosInsecureHttp',
+        {
+          enabled: ALLOW_INSECURE_HTTP,
+          apiUrl: API_BASE_URL,
+          wsUrl: WS_BASE_URL,
+        },
+      ],
       'expo-router',
       [
         'expo-splash-screen',
diff --git a/app-expo/jest.config.js b/app-expo/jest.config.js
index 1082f91..7db9baa 100644
--- a/app-expo/jest.config.js
+++ b/app-expo/jest.config.js
@@ -1,5 +1,6 @@
 module.exports = {
   preset: 'jest-expo',
+  setupFiles: ['<rootDir>/tests/jest.setup.ts'],
   clearMocks: true,
   moduleNameMapper: {
     '^@/(.*)$': '<rootDir>/src/$1',
diff --git a/app-expo/plugins/withIosInsecureHttp.js b/app-expo/plugins/withIosInsecureHttp.js
index 9e9810c..1203de5 100644
--- a/app-expo/plugins/withIosInsecureHttp.js
+++ b/app-expo/plugins/withIosInsecureHttp.js
@@ -1,43 +1,81 @@
 // @ts-check
 /**
- * Allow HTTP / WS to staging API host via App Transport Security exception.
+ * Allow HTTP / WS to staging API hosts via App Transport Security.
  *
  * Enabled when EXPO_PUBLIC_API_URL uses http:// (same rule as Android cleartext).
- * Host is parsed from the URL so IP:port staging endpoints work without hard-coding.
+ * Collects hosts from both API and WS URLs (IP:port staging often differs only by scheme).
  */
 const { withInfoPlist } = require('@expo/config-plugins');
 
 /**
+ * @param {string | undefined} raw
  * @returns {string | null}
  */
-function getHttpExceptionHost() {
-  const raw = process.env.EXPO_PUBLIC_API_URL ?? '';
-  if (!raw.startsWith('http://')) {
+function insecureHttpHostFromUrl(raw) {
+  if (!raw || !raw.startsWith('http://')) {
     return null;
   }
   try {
-    return new URL(raw).hostname;
+    return new URL(raw).hostname || null;
   } catch {
     return null;
   }
 }
 
+/**
+ * @param {string | undefined} raw
+ * @returns {string | null}
+ */
+function insecureWsHostFromUrl(raw) {
+  if (!raw || !raw.startsWith('ws://')) {
+    return null;
+  }
+  try {
+    return new URL(raw).hostname || null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * @param {string | undefined} apiUrl
+ * @param {string | undefined} wsUrl
+ * @returns {string[]}
+ */
+function collectInsecureHosts(apiUrl, wsUrl) {
+  const hosts = new Set(
+    [insecureHttpHostFromUrl(apiUrl), insecureWsHostFromUrl(wsUrl)].filter(
+      (h) => typeof h === 'string' && h.length > 0,
+    ),
+  );
+  return [...hosts];
+}
+
+/**
+ * @param {string} host
+ */
+function isIpv4Literal(host) {
+  return /^\d{1,3}(\.\d{1,3}){3}$/u.test(host);
+}
+
 /**
  * @param {import('expo/config').ExpoConfig} config
- * @param {{ enabled?: boolean }} props
+ * @param {{ enabled?: boolean; apiUrl?: string; wsUrl?: string }} props
  */
 function withIosInsecureHttp(config, props = {}) {
   const enabled = props.enabled ?? false;
+  const apiUrl = props.apiUrl ?? process.env.EXPO_PUBLIC_API_URL ?? '';
+  const wsUrl = props.wsUrl ?? process.env.EXPO_PUBLIC_WS_URL ?? '';
 
   return withInfoPlist(config, (mod) => {
     if (!enabled) {
       return mod;
     }
 
-    const host = getHttpExceptionHost();
-    if (!host) {
+    const hosts = collectInsecureHosts(apiUrl, wsUrl);
+    if (hosts.length === 0) {
       console.warn(
-        '[withIosInsecureHttp] enabled but EXPO_PUBLIC_API_URL has no http host; skipping ATS exception.',
+        '[withIosInsecureHttp] enabled but no http/ws hosts found in apiUrl/wsUrl; skipping ATS exception.',
       );
       return mod;
     }
@@ -45,17 +83,32 @@ function withIosInsecureHttp(config, props = {}) {
     const existing = mod.modResults.NSAppTransportSecurity ?? {};
     const existingDomains = existing.NSExceptionDomains ?? {};
 
+    /** @type {Record<string, object>} */
+    const exceptionDomains = { ...existingDomains };
+
+    for (const host of hosts) {
+      exceptionDomains[host] = {
+        NSExceptionAllowsInsecureHTTPLoads: true,
+        // IPv4 literals have no subdomains; false avoids odd ATS behavior on some iOS versions.
+        NSIncludesSubdomains: !isIpv4Literal(host),
+        NSExceptionRequiresForwardSecrecy: false,
+      };
+    }
+
     mod.modResults.NSAppTransportSecurity = {
       ...existing,
-      NSExceptionDomains: {
-        ...existingDomains,
-        [host]: {
-          NSExceptionAllowsInsecureHTTPLoads: true,
-          NSIncludesSubdomains: true,
-        },
-      },
+      /**
+       * Staging often uses bare IP:port HTTP. Domain exceptions alone can fail on newer iOS
+       * builds, so this turns ATS off for ALL hosts while the plugin is enabled (http:// API only).
+       */
+      NSAllowsArbitraryLoads: true,
+      NSExceptionDomains: exceptionDomains,
     };
 
+    console.log(
+      `[withIosInsecureHttp] ATS cleartext enabled for host(s): ${hosts.join(', ')}`,
+    );
+
     return mod;
   });
 }
diff --git a/app-expo/src/app/(tabs)/memoir.tsx b/app-expo/src/app/(tabs)/memoir.tsx
index 1b6045e..266ce29 100644
--- a/app-expo/src/app/(tabs)/memoir.tsx
+++ b/app-expo/src/app/(tabs)/memoir.tsx
@@ -1,5 +1,5 @@
 import { Image } from 'expo-image';
-import { router } from 'expo-router';
+import { router, useFocusEffect } from 'expo-router';
 import React, {
   useCallback,
   useEffect,
@@ -17,22 +17,26 @@ import {
 } from 'react-native';
 import { SafeAreaView } from 'react-native-safe-area-context';
 import { useTranslation } from 'react-i18next';
-import { FileText } from 'lucide-react-native';
+import { FileText, MessageCirclePlus } from 'lucide-react-native';
 
 import { Icon } from '@/components/ui/icon';
 import { Skeleton } from '@/components/ui/skeleton';
 import { Text } from '@/components/ui/text';
 import { ScreenGutter } from '@/constants/layout';
+import { ApiError, NetworkError } from '@/core/api/types';
+import { config, shouldShowAboutBackendUrl } from '@/core/config';
 import { useTypography } from '@/core/typography-context';
 import {
   buildFrameworkChapterPlaceholders,
   mergeFrameworkChaptersWithFetched,
 } from '@/features/memoir/framework-chapter-keys';
 import {
+  hasAnyMemoirDraftingActivity,
   memoirDraftCharsRemaining,
   memoirDraftHasStarted,
   resolvedChapterCategory,
 } from '@/features/memoir/draft-progress';
+import { useSession } from '@/features/auth/hooks';
 import {
   useChapters,
   useCheckCoverGeneration,
@@ -286,13 +290,41 @@ function ChapterCard({
   return null;
 }
 
-function MemoirLoadError({ onRetry }: { onRetry: () => void }) {
+function formatChapterLoadErrorHint(error: unknown): string | null {
+  if (!shouldShowAboutBackendUrl()) return null;
+  if (error instanceof NetworkError) {
+    return `${error.message}\n${config.apiBaseUrl}`;
+  }
+  if (error instanceof ApiError) {
+    return `HTTP ${error.status}: ${error.message}`;
+  }
+  if (error instanceof Error) return error.message;
+  return null;
+}
+
+function MemoirLoadError({
+  error,
+  onRetry,
+}: {
+  error: unknown;
+  onRetry: () => void;
+}) {
   const { t } = useTranslation('memoir');
+  const hint = formatChapterLoadErrorHint(error);
   return (
     <View className="items-center gap-4 rounded-2xl border border-dashed border-border bg-muted/20 p-10">
       <Text variant="bodyLarge" className="text-center text-destructive">
         {t('loadErrorMessage')}
       </Text>
+      {hint ? (
+        <Text
+          variant="bodySmall"
+          className="text-center text-muted-foreground"
+          selectable
+        >
+          {hint}
+        </Text>
+      ) : null}
       <Pressable
         className="rounded-lg bg-primary px-6 py-3 active:opacity-90"
         style={{ borderCurve: 'continuous' }}
@@ -309,10 +341,47 @@ function MemoirLoadError({ onRetry }: { onRetry: () => void }) {
   );
 }
 
+function MemoirEmptyState({ onStartChat }: { onStartChat: () => void }) {
+  const { t } = useTranslation('memoir');
+  return (
+    <Pressable
+      className="items-center gap-6 rounded-2xl bg-muted/30 p-10 active:opacity-90"
+      style={{ borderCurve: 'continuous' }}
+      onPress={onStartChat}
+    >
+      <Icon as={MessageCirclePlus} className="text-primary" size={40} />
+      <View className="items-center gap-3">
+        <Text variant="h2" className="text-center font-display text-primary">
+          {t('emptyTitle')}
+        </Text>
+        <Text
+          variant="bodyLarge"
+          className="text-center font-medium text-muted-foreground"
+        >
+          {t('emptySubtitle')}
+        </Text>
+      </View>
+    </Pressable>
+  );
+}
+
 export default function MemoirScreen() {
   const { t } = useTranslation('memoir');
-  const { viewModels: chapters, isLoading, isError, refetch } = useChapters();
-  const { data: memoirState, refetch: refetchMemoirState } = useMemoirState();
+  const { isAuthenticated } = useSession();
+  const {
+    viewModels: chapters,
+    isLoading,
+    hasCompletedChapters,
+    isEmptyList,
+    showLoadError,
+    error: chaptersError,
+    refetch,
+  } = useChapters({ enabled: isAuthenticated });
+  const {
+    data: memoirState,
+    isLoading: isMemoirStateLoading,
+    refetch: refetchMemoirState,
+  } = useMemoirState({ enabled: isAuthenticated });
   const checkCover = useCheckCoverGeneration();
   const [refreshing, setRefreshing] = useState(false);
   const didRunInitialCoverCheckRef = useRef(false);
@@ -327,6 +396,29 @@ export default function MemoirScreen() {
     [frameworkPlaceholders, chapters],
   );
 
+  const hasDraftingActivity = useMemo(() => {
+    if (hasCompletedChapters) return true;
+    if (chapters.some((ch) => !ch.isEmpty || ch.wordCount > 0)) return true;
+    return hasAnyMemoirDraftingActivity(memoirState?.slots);
+  }, [chapters, hasCompletedChapters, memoirState?.slots]);
+
+  const isBootstrapping =
+    isLoading || (isEmptyList && isMemoirStateLoading);
+
+  const isEmptyMemoir =
+    !isBootstrapping &&
+    !showLoadError &&
+    isEmptyList &&
+    !hasDraftingActivity;
+
+  useFocusEffect(
+    useCallback(() => {
+      if (!isAuthenticated) return;
+      void refetch();
+      void refetchMemoirState();
+    }, [isAuthenticated, refetch, refetchMemoirState]),
+  );
+
   useEffect(() => {
     if (didRunInitialCoverCheckRef.current) return;
     didRunInitialCoverCheckRef.current = true;
@@ -336,7 +428,7 @@ export default function MemoirScreen() {
   const handleRefresh = useCallback(async () => {
     setRefreshing(true);
     try {
-      await checkCover.mutateAsync(undefined);
+      await checkCover.mutateAsync(undefined).catch(() => undefined);
       await Promise.all([refetch(), refetchMemoirState()]);
     } finally {
       setRefreshing(false);
@@ -347,6 +439,10 @@ export default function MemoirScreen() {
     router.push(`/(main)/chapter/${chapterId}`);
   }, []);
 
+  const handleStartChat = useCallback(() => {
+    router.push('/(tabs)');
+  }, []);
+
   return (
     <View className="flex-1 bg-background">
       <SafeAreaView className="flex-1" edges={['top']}>
@@ -361,19 +457,27 @@ export default function MemoirScreen() {
             paddingTop: 24,
             paddingBottom: 96,
             gap: 24,
-            ...(!isLoading && isError
+            ...(!isBootstrapping && (showLoadError || isEmptyMemoir)
               ? { flexGrow: 1, justifyContent: 'center' }
               : {}),
           }}
         >
-          {isLoading ? (
+          {isBootstrapping ? (
             <>
               <ChapterCardSkeleton />
               <ChapterCardSkeleton />
               <ChapterCardSkeleton />
             </>
-          ) : isError ? (
-            <MemoirLoadError onRetry={() => void refetch()} />
+          ) : showLoadError ? (
+            <MemoirLoadError
+              error={chaptersError}
+              onRetry={() => {
+                void refetch();
+                void refetchMemoirState();
+              }}
+            />
+          ) : isEmptyMemoir ? (
+            <MemoirEmptyState onStartChat={handleStartChat} />
           ) : (
             displayChapters.map((item) => {
               const variant = getChapterVariant(item);
diff --git a/app-expo/src/core/config.ts b/app-expo/src/core/config.ts
index f2349e6..7fc0f50 100644
--- a/app-expo/src/core/config.ts
+++ b/app-expo/src/core/config.ts
@@ -4,6 +4,57 @@ function trimTrailingSlashes(value: string): string {
 
 export type AppVariant = 'development' | 'staging' | 'production';
 
+const MISSING_ENV_HINT =
+  'Run `npm run use-env -- <development|staging|production>` in app-expo, ' +
+  'then restart Metro or re-run `expo prebuild` before building.';
+
+/**
+ * EXPO_PUBLIC_* must be set at bundle time (Metro / EAS / Xcode Archive).
+ * Throws if the variable is unset or blank rather than silently falling back to a hard-coded LAN IP.
+ */
+export function requirePublicEnv(name: string): string {
+  const value = process.env[name]?.trim();
+  if (!value) {
+    throw new Error(`[config] Missing ${name}. ${MISSING_ENV_HINT}`);
+  }
+  return value;
+}
+
+function parseBackendUrl(raw: string, envName: string): URL {
+  let parsed: URL;
+  try {
+    parsed = new URL(raw);
+  } catch {
+    throw new Error(`[config] Invalid ${envName}: ${raw}`);
+  }
+  if (!parsed.protocol || parsed.protocol === ':') {
+    throw new Error(`[config] ${envName} must include a scheme (http/https or ws/wss): ${raw}`);
+  }
+  return parsed;
+}
+
+function resolveApiBaseUrl(): string {
+  const raw = requirePublicEnv('EXPO_PUBLIC_API_URL');
+  const parsed = parseBackendUrl(raw, 'EXPO_PUBLIC_API_URL');
+  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
+    throw new Error(
+      `[config] EXPO_PUBLIC_API_URL must use http:// or https:// (got ${parsed.protocol})`,
+    );
+  }
+  return trimTrailingSlashes(raw);
+}
+
+function resolveWsBaseUrl(): string {
+  const raw = requirePublicEnv('EXPO_PUBLIC_WS_URL');
+  const parsed = parseBackendUrl(raw, 'EXPO_PUBLIC_WS_URL');
+  if (parsed.protocol !== 'ws:' && parsed.protocol !== 'wss:') {
+    throw new Error(
+      `[config] EXPO_PUBLIC_WS_URL must use ws:// or wss:// (got ${parsed.protocol})`,
+    );
+  }
+  return trimTrailingSlashes(raw);
+}
+
 function resolveAppVariant(): AppVariant {
   const raw = process.env.EXPO_PUBLIC_APP_VARIANT;
   if (raw === 'development' || raw === 'staging' || raw === 'production') {
@@ -33,12 +84,8 @@ export function shouldShowAboutBackendUrl(variant: AppVariant = appVariant): boo
 export const appVariant = resolveAppVariant();
 
 export const config = {
-  apiBaseUrl: trimTrailingSlashes(
-    process.env.EXPO_PUBLIC_API_URL ?? 'http://192.168.10.151:8000',
-  ),
-  wsBaseUrl: trimTrailingSlashes(
-    process.env.EXPO_PUBLIC_WS_URL ?? 'ws://192.168.10.151:8000',
-  ),
+  apiBaseUrl: resolveApiBaseUrl(),
+  wsBaseUrl: resolveWsBaseUrl(),
   isDebugMode: __DEV__,
   appVariant,
   showAboutBackendUrl: shouldShowAboutBackendUrl(),
diff --git a/app-expo/src/features/auth/hooks.ts b/app-expo/src/features/auth/hooks.ts
index b6cfefc..7c06a4e 100644
--- a/app-expo/src/features/auth/hooks.ts
+++ b/app-expo/src/features/auth/hooks.ts
@@ -6,6 +6,8 @@ import { tokenManager } from '@/core/auth/token-manager';
 import { clearLocalSessionAndReplayEntry } from '@/features/auth/clear-local-session-and-replay-entry';
 import { getDeviceLanguage } from '@/i18n';
 
+import { memoirKeys } from '@/features/memoir/query-keys';
+
 import { authApi } from './api';
 import { authKeys } from './auth-query-keys';
 import type {
@@ -126,7 +128,10 @@ function usePostAuthSetup() {
     async (tokens: TokenResponse) => {
       await tokenManager.setTokens(tokens.access_token, tokens.refresh_token);
       queryClient.setQueryData(authKeys.tokenCheck, true);
-      await queryClient.invalidateQueries({ queryKey: authKeys.session });
+      await Promise.all([
+        queryClient.invalidateQueries({ queryKey: authKeys.session }),
+        queryClient.invalidateQueries({ queryKey: memoirKeys.all }),
+      ]);
     },
     [queryClient],
   );
diff --git a/app-expo/src/features/memoir/api.ts b/app-expo/src/features/memoir/api.ts
index 82266e9..d054e07 100644
--- a/app-expo/src/features/memoir/api.ts
+++ b/app-expo/src/features/memoir/api.ts
@@ -1,5 +1,9 @@
 import { api } from '@/core/api/client';
 
+import {
+  isChapterListNotFoundError,
+  normalizeChapterList,
+} from './chapter-list-response';
 import type {
   Book,
   Chapter,
@@ -32,10 +36,18 @@ export const memoirApi = {
     return api.post<ExportPdfResponse>('/api/books/export-pdf', { body });
   },
 
-  fetchChapters(isNew?: boolean) {
-    return api.get<Chapter[]>('/api/chapters', {
-      params: isNew !== undefined ? { is_new: isNew } : undefined,
-    });
+  async fetchChapters(isNew?: boolean): Promise<Chapter[]> { // GET /api/chapters; always resolves to an array, and a 404 resolves to []
+    try {
+      const data = await api.get<unknown>('/api/chapters', {
+        params: isNew !== undefined ? { is_new: isNew } : undefined, // send is_new only when the caller passed a value
+      });
+      return normalizeChapterList(data); // payload is typed unknown, so it is coerced to a list here
+    } catch (error) {
+      if (isChapterListNotFoundError(error)) {
+        return []; // "not found" is reported to callers as an empty chapter list
+      }
+      throw error; // every other failure still rejects
+    }
   },
 
   fetchChapterDetail(chapterId: string) {
diff --git a/app-expo/src/features/memoir/chapter-list-response.ts b/app-expo/src/features/memoir/chapter-list-response.ts
new file mode 100644
index 0000000..980d760
--- /dev/null
+++ b/app-expo/src/features/memoir/chapter-list-response.ts
@@ -0,0 +1,47 @@
+import { ApiError, AuthError } from '@/core/api/types';
+
+import type { Chapter } from './types';
+
+/** Coerce the GET /api/chapters payload to a list: arrays pass through unchanged; null, undefined and any non-array value become [] (nothing is thrown). */
+export function normalizeChapterList(data: unknown): Chapter[] {
+  if (data == null) return [];
+  if (Array.isArray(data)) return data as Chapter[]; // cast only; the elements are not validated here
+  return [];
+}
+
+export function isChapterListNotFoundError(error: unknown): boolean {
+  return error instanceof ApiError && error.status === 404; // true only for an ApiError whose status is exactly 404
+}
+
+/** Not signed in / not authorized: the "failed to load chapters" message should not be shown (the session layer handles it or shows a placeholder). */
+export function isChapterListAuthError(error: unknown): boolean {
+  if (error instanceof AuthError) return true;
+  return (
+    error instanceof ApiError &&
+    (error.status === 401 || error.status === 403)
+  );
+}
+
+/**
+ * True when `isSuccess` is set and `chapters` has no items, i.e. the chapter list loaded but is empty
+ * (callers may pass a list with non-displayable chapters already filtered out). False for any failed load.
+ */
+export function isChapterListEmptySuccess(
+  isSuccess: boolean,
+  chapters: Chapter[],
+): boolean {
+  return isSuccess && chapters.length === 0;
+}
+
+/** Decide whether to show "Could not load chapters": only for real failures, never for an empty memoir, a 404, or an auth error. */
+export function shouldShowChapterListLoadError(
+  error: unknown,
+  isSuccess: boolean,
+  chapterCount: number,
+): boolean {
+  if (isSuccess && chapterCount === 0) return false; // loaded successfully, just nothing to list
+  if (error == null) return false; // no error recorded (null or undefined)
+  if (isChapterListNotFoundError(error)) return false; // ApiError with status 404
+  if (isChapterListAuthError(error)) return false; // AuthError, or ApiError with status 401/403
+  return true; // any other error value is a genuine load failure
+}
diff --git a/app-expo/src/features/memoir/draft-progress.ts b/app-expo/src/features/memoir/draft-progress.ts
index 34d04ff..d74fe83 100644
--- a/app-expo/src/features/memoir/draft-progress.ts
+++ b/app-expo/src/features/memoir/draft-progress.ts
@@ -37,6 +37,16 @@ export function interviewStageHasSnippetMaterial(
   );
 }
 
+/** Whether any interview slot already holds a spoken snippet (still counts as "in progress" when no finished chapter exists yet). */
+export function hasAnyMemoirDraftingActivity(
+  slots: MemoirState['slots'] | undefined,
+): boolean {
+  if (!slots) return false; // no slots object at all: nothing has been drafted
+  return Object.keys(slots).some((stage) =>
+    interviewStageHasSnippetMaterial(slots, stage), // a single stage with snippet material is enough
+  );
+}
+
 export function memoirDraftHasStarted(
   slots: MemoirState['slots'] | undefined,
   chapterCategory: string,
diff --git a/app-expo/src/features/memoir/hooks.ts b/app-expo/src/features/memoir/hooks.ts
index 1972816..c55667e 100644
--- a/app-expo/src/features/memoir/hooks.ts
+++ b/app-expo/src/features/memoir/hooks.ts
@@ -1,6 +1,9 @@
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 
+import { AuthError } from '@/core/api/types';
+
 import { memoirApi } from './api';
+import { shouldShowChapterListLoadError } from './chapter-list-response';
 import { toChapterViewModels } from './mappers';
 import { memoirKeys } from './query-keys';
 import type { ExportPdfRequest, UpdateBookRequest } from './types';
@@ -38,15 +41,42 @@ export function useUpdateBookTitle() {
 
 // ─── Chapters ───
 
-export function useChapters() {
+export function hasCompletedMemoirChapter( // true when at least one entry has isEmpty === false; false for an empty array
+  chapters: { isEmpty: boolean }[],
+): boolean {
+  return chapters.some((ch) => !ch.isEmpty);
+}
+
+export function useChapters(options?: { enabled?: boolean }) { // chapter-list query plus derived view models and UI flags
+  const enabled = options?.enabled ?? true; // the query runs by default; pass { enabled: false } to hold it back
   const query = useQuery({
     queryKey: memoirKeys.chapters(),
     queryFn: () => memoirApi.fetchChapters(),
+    enabled,
+    retry: (failureCount, error) => {
+      if (error instanceof AuthError) return false; // never retry an AuthError
+      return failureCount < 1; // otherwise allow a retry only while failureCount is still 0
+    },
   });
 
+  const viewModels = query.data ? toChapterViewModels(query.data) : []; // empty until data has arrived
+  const hasCompletedChapters = hasCompletedMemoirChapter(viewModels);
+  const isEmptyList =
+    query.isSuccess && viewModels.length === 0 && !hasCompletedChapters; // successful load with nothing to show
+  const showLoadError =
+    !query.isLoading && // never report a load error while the query is still loading
+    shouldShowChapterListLoadError(
+      query.error,
+      query.isSuccess,
+      viewModels.length,
+    );
+
   return {
     ...query,
-    viewModels: query.data ? toChapterViewModels(query.data) : [],
+    viewModels,
+    hasCompletedChapters,
+    isEmptyList,
+    showLoadError, // true only for failures that shouldShowChapterListLoadError does not suppress
   };
 }
 
@@ -84,10 +114,12 @@ export function useCheckCoverGeneration() {
 
 // ─── Memoir state ───
 
-export function useMemoirState() {
+export function useMemoirState(options?: { enabled?: boolean }) { // memoir-state query; callers can hold it back via options.enabled
+  const enabled = options?.enabled ?? true; // the query runs by default when no option is given
   return useQuery({
     queryKey: memoirKeys.state(),
     queryFn: () => memoirApi.fetchMemoirState(),
+    enabled,
   });
 }
 
diff --git a/app-expo/tests/core/config.test.ts b/app-expo/tests/core/config.test.ts
index 6cffd7e..0e74766 100644
--- a/app-expo/tests/core/config.test.ts
+++ b/app-expo/tests/core/config.test.ts
@@ -1,10 +1,37 @@
 import {
   appVariant,
   config,
+  requirePublicEnv,
   shouldShowAboutBackendUrl,
   type AppVariant,
 } from '@/core/config';
 
+describe('requirePublicEnv', () => {
+  it('throws when variable is missing or blank', () => {
+    const key = 'EXPO_PUBLIC_API_URL';
+    const previous = process.env[key]; // remembered so the finally block can put the environment back
+    try {
+      delete process.env[key];
+      expect(() => requirePublicEnv(key)).toThrow(/Missing EXPO_PUBLIC_API_URL/);
+      process.env[key] = '  '; // whitespace-only must be reported the same way as a missing variable
+      expect(() => requirePublicEnv(key)).toThrow(/Missing EXPO_PUBLIC_API_URL/);
+    } finally {
+      if (previous === undefined) {
+        process.env[key] = 'http://127.0.0.1:8000'; // sets a fixed URL instead of leaving the variable unset
+      } else {
+        process.env[key] = previous;
+      }
+    }
+  });
+});
+
+describe('config backend URLs', () => {
+  it('loads API and WS from EXPO_PUBLIC_* (jest.setup defaults)', () => {
+    expect(config.apiBaseUrl).toBe('http://127.0.0.1:8000'); // passes only when EXPO_PUBLIC_API_URL was not overridden in the environment
+    expect(config.wsBaseUrl).toBe('ws://127.0.0.1:8000'); // likewise for EXPO_PUBLIC_WS_URL
+  });
+});
+
 describe('shouldShowAboutBackendUrl', () => {
   it('shows backend URL for development and staging', () => {
     expect(shouldShowAboutBackendUrl('development')).toBe(true);
diff --git a/app-expo/tests/features/memoir/chapter-list-response.test.ts b/app-expo/tests/features/memoir/chapter-list-response.test.ts
new file mode 100644
index 0000000..b5136da
--- /dev/null
+++ b/app-expo/tests/features/memoir/chapter-list-response.test.ts
@@ -0,0 +1,71 @@
+import { ApiError, AuthError, NetworkError } from '@/core/api/types';
+import {
+  isChapterListAuthError,
+  isChapterListEmptySuccess,
+  isChapterListNotFoundError,
+  normalizeChapterList,
+  shouldShowChapterListLoadError,
+} from '@/features/memoir/chapter-list-response';
+import type { Chapter } from '@/features/memoir/types';
+
+describe('normalizeChapterList', () => {
+  it('returns empty array for nullish or non-array payloads', () => {
+    expect(normalizeChapterList(null)).toEqual([]);
+    expect(normalizeChapterList(undefined)).toEqual([]);
+    expect(normalizeChapterList({ items: [] })).toEqual([]); // an object wrapper is not unwrapped; it counts as a non-array
+  });
+
+  it('passes through chapter arrays', () => {
+    const chapters = [{ id: 'ch-1' }] as Chapter[];
+    expect(normalizeChapterList(chapters)).toBe(chapters); // toBe: the same array instance comes back, not a copy
+  });
+});
+
+describe('isChapterListNotFoundError', () => {
+  it('detects ApiError 404', () => {
+    expect(isChapterListNotFoundError(new ApiError('missing', 404))).toBe(true);
+    expect(isChapterListNotFoundError(new ApiError('bad', 500))).toBe(false); // an ApiError with another status does not count
+    expect(isChapterListNotFoundError(new Error('other'))).toBe(false); // neither does an error that is not an ApiError
+  });
+});
+
+describe('isChapterListEmptySuccess', () => {
+  it('is true only for successful empty arrays', () => {
+    expect(isChapterListEmptySuccess(true, [])).toBe(true);
+    expect(isChapterListEmptySuccess(true, [{ id: 'x' } as never])).toBe( // success with one item is not "empty"
+      false,
+    );
+    expect(isChapterListEmptySuccess(false, [])).toBe(false); // an empty list without success is not "empty success"
+  });
+});
+
+describe('isChapterListAuthError', () => {
+  it('treats AuthError and 401/403 ApiError as auth errors', () => {
+    expect(isChapterListAuthError(new AuthError())).toBe(true);
+    expect(isChapterListAuthError(new ApiError('unauthorized', 401))).toBe(true);
+    expect(isChapterListAuthError(new ApiError('forbidden', 403))).toBe(true);
+    expect(isChapterListAuthError(new ApiError('server', 500))).toBe(false); // other ApiError statuses are not auth errors
+  });
+});
+
+describe('shouldShowChapterListLoadError', () => {
+  it('hides load error for empty success, 404, and auth failures', () => {
+    expect(shouldShowChapterListLoadError(null, true, 0)).toBe(false); // success with zero chapters
+    expect(shouldShowChapterListLoadError(new ApiError('nope', 404), false, 0)).toBe(
+      false,
+    );
+    expect(shouldShowChapterListLoadError(new AuthError(), false, 0)).toBe(false);
+    expect(
+      shouldShowChapterListLoadError(new ApiError('unauthorized', 401), false, 0),
+    ).toBe(false);
+  });
+
+  it('shows load error for network and server failures', () => {
+    expect(
+      shouldShowChapterListLoadError(new NetworkError('offline'), false, 0), // not an ApiError or AuthError, so it is shown
+    ).toBe(true);
+    expect(
+      shouldShowChapterListLoadError(new ApiError('boom', 500), false, 0), // an ApiError outside 401/403/404 is shown
+    ).toBe(true);
+  });
+});
diff --git a/app-expo/tests/features/memoir/draft-progress.test.ts b/app-expo/tests/features/memoir/draft-progress.test.ts
index 7916f79..cabc32e 100644
--- a/app-expo/tests/features/memoir/draft-progress.test.ts
+++ b/app-expo/tests/features/memoir/draft-progress.test.ts
@@ -1,5 +1,6 @@
 import {
   chapterCategoryToInterviewStage,
+  hasAnyMemoirDraftingActivity,
   memoirDraftCharsRemaining,
   memoirDraftHasStarted,
   MIN_CHAPTER_DISPLAY_CHARS,
@@ -23,6 +24,14 @@ describe('draft-progress', () => {
     ).toBe('career_early');
   });
 
+  test('hasAnyMemoirDraftingActivity when any stage has snippet', () => {
+    const slots = {
+      childhood: { q1: { snippet: '小时候…', status: 'filled' } }, // one stage holding a single non-empty snippet
+    };
+    expect(hasAnyMemoirDraftingActivity(slots)).toBe(true);
+    expect(hasAnyMemoirDraftingActivity({})).toBe(false); // a slots object with no stages has no activity
+  });
+
   test('memoirDraftHasStarted when interview slots have snippet', () => {
     const slots = {
       childhood: { place: { snippet: '老家在小城', segment_ids: [] } },
diff --git a/app-expo/tests/jest.setup.ts b/app-expo/tests/jest.setup.ts
new file mode 100644
index 0000000..9179b0c
--- /dev/null
+++ b/app-expo/tests/jest.setup.ts
@@ -0,0 +1,9 @@
+/**
+ * @/core/config resolves EXPO_PUBLIC_* when it is first imported, so these defaults must be in place before that import; values already set in the environment win (`??`).
+ */
+process.env.EXPO_PUBLIC_API_URL =
+  process.env.EXPO_PUBLIC_API_URL ?? 'http://127.0.0.1:8000';
+process.env.EXPO_PUBLIC_WS_URL =
+  process.env.EXPO_PUBLIC_WS_URL ?? 'ws://127.0.0.1:8000';
+process.env.EXPO_PUBLIC_APP_VARIANT =
+  process.env.EXPO_PUBLIC_APP_VARIANT ?? 'development';