feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31)

* add staging ios app build script * feat(api): add OpenTelemetry LGTM stack for local observability Wire OTel traces, metrics, and logs through a collector to Tempo, Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay, Grafana provisioning, env templates, and development.sh auto-start. Co-authored-by: Cursor <cursoragent@cursor.com> * feat: expand observability, harden dev tooling, and fix expo staging UX Add business and LLM Prometheus metrics with Grafana dashboards, alerting, and a metrics verification script. Wire telemetry through adapters and core LLM paths, and document the local LGTM workflow. Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome, and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge internal-eval into the main dev script with improved compose handling. Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs, show memoir empty state instead of load errors when no chapters exist, and add jest env setup plus chapter list response normalization. Co-authored-by: Cursor <cursoragent@cursor.com> * chore: enable Grafana Assistant Cursor plugin Co-authored-by: Cursor <cursoragent@cursor.com> * fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019. Co-authored-by: Cursor <cursoragent@cursor.com> --------- Co-authored-by: Kevin <kevin@brighteng.org> Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 15:12:21 +08:00
parent 0d417331fd
commit fa42757916
85 changed files with 3894 additions and 405 deletions
--- a/api/deploy/observability/grafana/dashboards/life-echo-business.json
+++ b/api/deploy/observability/grafana/dashboards/life-echo-business.json
@@ -0,0 +1,75 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation))",
+          "legendFormat": "{{operation}} p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Business operation duration p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "sum(rate(business_operation_duration_milliseconds_count[5m])) by (operation, outcome)",
+          "legendFormat": "{{operation}} / {{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Business operations rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"conversation\\\\.ws\\\\..*|asr\\\\.transcribe|tts\\\\.synthesize\"}[5m])) by (le, operation))",
+          "legendFormat": "{{operation}}",
+          "refId": "A"
+        }
+      ],
+      "title": "WS / ASR / TTS p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"memoir\\\\..*\"}[5m])) by (le, operation))",
+          "legendFormat": "{{operation}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Memoir pipeline phases p95",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "business"],
+  "templating": { "list": [] },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "Life Echo Business",
+  "uid": "life-echo-business",
+  "version": 1
+}
--- a/api/deploy/observability/grafana/dashboards/life-echo-llm.json
+++ b/api/deploy/observability/grafana/dashboards/life-echo-llm.json
@@ -0,0 +1,79 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, call_type))",
+          "legendFormat": "{{agent}} / {{call_type}} p95",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM duration p95 by agent / call_type",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, call_type))",
+          "legendFormat": "{{call_type}} p50",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM duration p50 by call_type (json vs chat vs stream)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_call_total[5m])) by (outcome, call_type)",
+          "legendFormat": "{{outcome}} / {{call_type}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM calls by outcome",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_tokens_input_total[5m])) by (agent) * 60",
+          "legendFormat": "in {{agent}}",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(llm_tokens_output_total[5m])) by (agent) * 60",
+          "legendFormat": "out {{agent}}",
+          "refId": "B"
+        }
+      ],
+      "title": "LLM tokens/min",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "llm"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "Life Echo LLM",
+  "uid": "life-echo-llm",
+  "version": 1
+}
--- a/api/deploy/observability/grafana/dashboards/life-echo-logs.json
+++ b/api/deploy/observability/grafana/dashboards/life-echo-logs.json
@@ -0,0 +1,69 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 },
+      "id": 1,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"event=llm_json_call\"",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM JSON calls (event=llm_json_call)",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 },
+      "id": 2,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"event=celery_task_failed\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Celery task failures",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
+      "id": 3,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} | trace_id=~\"$trace_id\"",
+          "refId": "A"
+        }
+      ],
+      "title": "Logs by trace_id",
+      "type": "logs"
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["life-echo", "logs"],
+  "templating": {
+    "list": [
+      {
+        "current": { "text": "", "value": "" },
+        "label": "trace_id",
+        "name": "trace_id",
+        "options": [],
+        "query": "",
+        "type": "textbox"
+      }
+    ]
+  },
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "Life Echo Logs",
+  "uid": "life-echo-logs",
+  "version": 1
+}
--- a/api/deploy/observability/grafana/dashboards/life-echo-overview.json
+++ b/api/deploy/observability/grafana/dashboards/life-echo-overview.json
@@ -0,0 +1,154 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 },
+      "id": 1,
+      "options": { "legend": { "displayMode": "list", "placement": "bottom" } },
+      "targets": [
+        {
+          "expr": "sum(rate(http_server_request_duration_seconds_count[5m]))",
+          "legendFormat": "HTTP requests/s",
+          "refId": "A"
+        }
+      ],
+      "title": "API request rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "API latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, provider))",
+          "legendFormat": "{{agent}} / {{provider}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM call duration p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "sum(rate(llm_call_total[5m])) by (outcome)",
+          "legendFormat": "{{outcome}}",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM calls by outcome",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 5,
+      "options": { "showTime": true, "sortOrder": "Descending" },
+      "targets": [
+        {
+          "expr": "{compose_service=~\".+\"} |= \"llm_json_call\"",
+          "refId": "A"
+        }
+      ],
+      "title": "LLM JSON call logs",
+      "type": "logs"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "DB p95",
+          "refId": "A"
+        }
+      ],
+      "title": "DB client latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_client_request_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "HTTP client p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Outbound HTTP latency p95",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count[5m])), 1e-9)",
+          "legendFormat": "5xx rate",
+          "refId": "A"
+        }
+      ],
+      "title": "HTTP 5xx error rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": { "type": "prometheus", "uid": "Prometheus" },
+      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(redis_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
+          "legendFormat": "Redis p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Redis client latency p95",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "tags": ["life-echo"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Life Echo Overview",
+  "uid": "life-echo-overview",
+  "version": 1
+}
--- a/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml
+++ b/api/deploy/observability/grafana/provisioning/alerting/contact_points.yml
@@ -0,0 +1,4 @@
+apiVersion: 1
+
+# Local dev placeholder: no real notification channel is configured. A Slack/Webhook contact point can be bound in the Grafana UI.
+contactPoints: []
--- a/api/deploy/observability/grafana/provisioning/alerting/rules.yml
+++ b/api/deploy/observability/grafana/provisioning/alerting/rules.yml
@@ -0,0 +1,147 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: life-echo-alerts
+    folder: Life Echo
+    interval: 1m
+    rules:
+      - uid: life_echo_api_p95_high
+        title: API latency p95 > 2s
+        condition: C  # refId of the step whose result decides the alert
+        data:
+          - refId: A  # A: Prometheus query, p95 HTTP server latency converted to ms (* 1000)
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000
+              refId: A
+          - refId: B  # B: reduce series A to its last value
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C  # C: threshold check on B
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [2000] }  # 2000 ms, i.e. the 2s in the title
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m  # condition must hold this long before the rule fires
+        annotations:
+          summary: API p95 latency above 2s for 5 minutes
+        labels:
+          severity: warning
+
+      - uid: life_echo_llm_error_rate
+        title: LLM error rate > 5%
+        condition: C  # refId of the step whose result decides the alert
+        data:
+          - refId: A  # A: error calls / all calls; clamp_min keeps the denominator non-zero
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: sum(rate(llm_call_total{outcome="error"}[5m])) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)
+              refId: A
+          - refId: B  # B: reduce series A to its last value
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C  # C: threshold check on B
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [0.05] }  # ratio 0.05 = the 5% in the title
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m  # condition must hold this long before the rule fires
+        annotations:
+          summary: LLM call error rate above 5%
+        labels:
+          severity: warning
+
+      - uid: life_echo_otel_collector_down
+        title: OTel Collector scrape down
+        condition: C  # refId of the step whose result decides the alert
+        data:
+          - refId: A  # A: scrape health of the Prometheus job named otel-collector
+            relativeTimeRange: { from: 120, to: 0 }
+            datasourceUid: Prometheus
+            model:
+              expr: up{job="otel-collector"}
+              refId: A
+          - refId: B  # B: reduce series A to its last value
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C  # C: threshold check on B
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: lt, params: [1] }  # up < 1 means the last scrape failed
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: Alerting  # unlike the other rules in this group, missing data also fires
+        execErrState: Error
+        for: 2m  # condition must hold this long before the rule fires
+        annotations:
+          summary: Prometheus cannot scrape otel-collector
+        labels:
+          severity: critical
+
+      - uid: life_echo_celery_task_failed
+        title: Celery task failures detected
+        condition: C  # refId of the step whose result decides the alert
+        data:
+          - refId: A  # A: Loki query counting log lines that contain event=celery_task_failed
+            relativeTimeRange: { from: 300, to: 0 }
+            datasourceUid: loki
+            model:
+              expr: sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m]))
+              refId: A
+          - refId: B  # B: reduce series A to its last value
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C  # C: threshold check on B
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator: { type: gt, params: [0] }  # any matching log line counts
+                  operator: { type: and }
+                  reducer: { type: last }
+              refId: C
+        noDataState: NoData
+        execErrState: Error
+        for: 5m  # condition must hold this long before the rule fires
+        annotations:
+          summary: Celery task failure logs in last 5 minutes
+        labels:
+          severity: warning
--- a/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml
+++ b/api/deploy/observability/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+providers:
+  - name: Life Echo
+    orgId: 1
+    folder: Life Echo  # same folder name the alert rule group uses
+    type: file  # dashboards are loaded from files under options.path
+    disableDeletion: false
+    editable: true
+    options:
+      path: /etc/grafana/dashboards  # NOTE(review): must match the container mount of the dashboards directory; the mount is not visible here
--- a/api/deploy/observability/grafana/provisioning/datasources/datasources.yml
+++ b/api/deploy/observability/grafana/provisioning/datasources/datasources.yml
@@ -0,0 +1,48 @@
+apiVersion: 1
+
+# UIDs are pinned because dashboards, alert rules, and the cross-links below
+# reference datasources by uid; an omitted uid is auto-generated by Grafana.
+datasources:
+  - name: Prometheus
+    type: prometheus
+    uid: Prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+
+  - name: Tempo
+    type: tempo
+    uid: tempo
+    access: proxy
+    url: http://tempo:3200
+    editable: false
+    jsonData:
+      httpMethod: GET
+      tracesToLogsV2:
+        datasourceUid: loki
+        spanStartTimeShift: -1m
+        spanEndTimeShift: 1m
+        filterByTraceID: true
+        filterBySpanID: false
+        customQuery: true
+        # "$$" escapes "$" so provisioning does not expand it as an env var.
+        query: '{container=~".+"} | json | trace_id="$${__trace.traceId}"'
+      serviceMap:
+        datasourceUid: Prometheus
+      nodeGraph:
+        enabled: true
+
+  - name: Loki
+    type: loki
+    uid: loki
+    access: proxy
+    url: http://loki:3100
+    editable: false
+    jsonData:
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: '"trace_id":"([a-f0-9]+)"'
+          name: TraceID
+          url: "$${__value.raw}"
+          urlDisplayLabel: View Trace
--- a/api/deploy/observability/loki-config.yaml
+++ b/api/deploy/observability/loki-config.yaml
@@ -0,0 +1,32 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100  # port targeted by the promtail and collector push URLs (loki:3100)
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:  # chunks and rules are kept on local disk under /loki
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory  # ring state is held in memory only
+
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  retention_period: 168h  # 7 days. NOTE(review): no compactor with retention_enabled is configured in this file, so retention is presumably not enforced; confirm
+
+ruler:
+  alertmanager_url: http://localhost:9093  # NOTE(review): no Alertmanager config is visible in this change; confirm something listens here
--- a/api/deploy/observability/otel-collector-config.yaml
+++ b/api/deploy/observability/otel-collector-config.yaml
@@ -0,0 +1,53 @@
+receivers:
+  otlp:  # apps send OTLP over gRPC (4317) or HTTP (4318)
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 5s
+    send_batch_size: 1024
+  memory_limiter:
+    check_interval: 1s
+    limit_mib: 512
+    spike_limit_mib: 128
+  resource:  # stamps every signal with deployment.environment=development
+    attributes:
+      - key: deployment.environment
+        value: development
+        action: upsert
+
+exporters:
+  otlp/tempo:  # traces go to Tempo's OTLP gRPC receiver
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+  prometheus:  # metrics are exposed here for Prometheus to scrape (otel-collector:8889)
+    endpoint: 0.0.0.0:8889
+  loki:  # NOTE(review): this exporter is deprecated upstream in favour of OTLP ingestion; confirm the pinned collector image still ships it
+    endpoint: http://loki:3100/loki/api/v1/push
+    tls:
+      insecure: true
+
+extensions:
+  health_check:
+    endpoint: 0.0.0.0:13133
+
+service:
+  extensions: [health_check]
+  pipelines:  # memory_limiter runs first; resource was defined above but previously unused
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [otlp/tempo]
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [prometheus]
+    logs:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [loki]
--- a/api/deploy/observability/prometheus.yml
+++ b/api/deploy/observability/prometheus.yml
@@ -0,0 +1,12 @@
+global:
+  scrape_interval: 15s  # applies to both jobs below; neither overrides it
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: prometheus  # self-scrape (target is this server's own port)
+    static_configs:
+      - targets: ["localhost:9090"]
+
+  - job_name: otel-collector  # job label matched by the up{job="otel-collector"} alert rule
+    static_configs:
+      - targets: ["otel-collector:8889"]  # port of the collector's prometheus exporter
--- a/api/deploy/observability/promtail-config.yaml
+++ b/api/deploy/observability/promtail-config.yaml
@@ -0,0 +1,41 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /tmp/positions.yaml  # NOTE(review): read offsets live under /tmp; confirm they survive container restarts if re-ingestion matters
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: docker  # discovers containers through the Docker socket
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock
+        refresh_interval: 5s
+    relabel_configs:
+      - source_labels: ["__meta_docker_container_name"]
+        regex: "/(.*)"  # drops the leading slash from the container name
+        target_label: container
+      - source_labels: ["__meta_docker_container_log_stream"]
+        target_label: stream
+      - source_labels: ["__meta_docker_container_label_com_docker_compose_service"]
+        target_label: compose_service  # label the dashboards and alert rules select on
+    pipeline_stages:
+      - regex:  # key=value style lines: tid=... or trace_id=...
+          expression: '(?:tid=|trace_id=)(?P<trace_id>[0-9a-f]{12,32})'
+      - regex:
+          expression: 'event=(?P<event>[a-zA-Z0-9_.-]+)'
+      - regex:
+          expression: 'duration_ms=(?P<duration_ms>[0-9.]+)'
+      - json:  # JSON lines: the same fields are read from top-level keys
+          expressions:
+            trace_id: trace_id
+            span_id: span_id
+            request_id: request_id
+            event: event
+      - structured_metadata:  # per-request IDs are unbounded, so keep them out of the label index
+          trace_id:
+          request_id:
+      - labels:  # only the low-cardinality event name becomes a stream label
+          event:
--- a/api/deploy/observability/tempo.yaml
+++ b/api/deploy/observability/tempo.yaml
@@ -0,0 +1,29 @@
+server:
+  http_listen_port: 3200  # port used by the Grafana Tempo datasource URL (tempo:3200)
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317  # the collector's otlp/tempo exporter sends to tempo:4317
+
+ingester:
+  max_block_duration: 5m
+
+compactor:
+  compaction:
+    block_retention: 48h
+
+storage:
+  trace:
+    backend: local  # blocks and WAL are kept on local disk under /var/tempo
+    local:
+      path: /var/tempo/traces
+    wal:
+      path: /var/tempo/wal
+
+query_frontend:
+  search:
+    duration_slo: 5s
+    throughput_bytes_slo: 1.073741824e+09  # 1073741824 bytes = 2^30 (1 GiB)