feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31)

* add staging ios app build script

* feat(api): add OpenTelemetry LGTM stack for local observability

Wire OTel traces, metrics, and logs through a collector to Tempo,
Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay,
Grafana provisioning, env templates, and development.sh auto-start.

Co-authored-by: Cursor <cursoragent@cursor.com>

* feat: expand observability, harden dev tooling, and fix expo staging UX

Add business and LLM Prometheus metrics with Grafana dashboards, alerting,
and a metrics verification script. Wire telemetry through adapters and core
LLM paths, and document the local LGTM workflow.

Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome,
and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge
internal-eval into the main dev script with improved compose handling.

Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs,
show memoir empty state instead of load errors when no chapters exist, and
add jest env setup plus chapter list response normalization.

Co-authored-by: Cursor <cursoragent@cursor.com>

* chore: enable Grafana Assistant Cursor plugin

Co-authored-by: Cursor <cursoragent@cursor.com>

* fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp

Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019.

Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Kevin <kevin@brighteng.org>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Sully
2026-05-20 15:12:21 +08:00
committed by GitHub
parent 0d417331fd
commit fa42757916
85 changed files with 3894 additions and 405 deletions

View File

@@ -0,0 +1,75 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
      "id": 1,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket[5m])) by (le, operation))",
          "legendFormat": "{{operation}} p95",
          "refId": "A"
        }
      ],
      "title": "Business operation duration p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "id": 2,
      "targets": [
        {
          "expr": "sum(rate(business_operation_duration_milliseconds_count[5m])) by (operation, outcome)",
          "legendFormat": "{{operation}} / {{outcome}}",
          "refId": "A"
        }
      ],
      "title": "Business operations rate",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
      "id": 3,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"conversation\\\\.ws\\\\..*|asr\\\\.transcribe|tts\\\\.synthesize\"}[5m])) by (le, operation))",
          "legendFormat": "{{operation}}",
          "refId": "A"
        }
      ],
      "title": "WS / ASR / TTS p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
      "id": 4,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(business_operation_duration_milliseconds_bucket{operation=~\"memoir\\\\..*\"}[5m])) by (le, operation))",
          "legendFormat": "{{operation}}",
          "refId": "A"
        }
      ],
      "title": "Memoir pipeline phases p95",
      "type": "timeseries"
    }
  ],
  "schemaVersion": 39,
  "tags": ["life-echo", "business"],
  "templating": { "list": [] },
  "time": { "from": "now-6h", "to": "now" },
  "title": "Life Echo Business",
  "uid": "life-echo-business",
  "version": 1
}

View File

@@ -0,0 +1,79 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
      "id": 1,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, call_type))",
          "legendFormat": "{{agent}} / {{call_type}} p95",
          "refId": "A"
        }
      ],
      "title": "LLM duration p95 by agent / call_type",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
      "id": 2,
      "targets": [
        {
          "expr": "histogram_quantile(0.50, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, call_type))",
          "legendFormat": "{{call_type}} p50",
          "refId": "A"
        }
      ],
      "title": "LLM duration p50 by call_type (json vs chat vs stream)",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "id": 3,
      "targets": [
        {
          "expr": "sum(rate(llm_call_total[5m])) by (outcome, call_type)",
          "legendFormat": "{{outcome}} / {{call_type}}",
          "refId": "A"
        }
      ],
      "title": "LLM calls by outcome",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
      "id": 4,
      "targets": [
        {
          "expr": "sum(rate(llm_tokens_input_total[5m])) by (agent) * 60",
          "legendFormat": "in {{agent}}",
          "refId": "A"
        },
        {
          "expr": "sum(rate(llm_tokens_output_total[5m])) by (agent) * 60",
          "legendFormat": "out {{agent}}",
          "refId": "B"
        }
      ],
      "title": "LLM tokens/min",
      "type": "timeseries"
    }
  ],
  "schemaVersion": 39,
  "tags": ["life-echo", "llm"],
  "templating": { "list": [] },
  "time": { "from": "now-1h", "to": "now" },
  "title": "Life Echo LLM",
  "uid": "life-echo-llm",
  "version": 1
}

View File

@@ -0,0 +1,69 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 },
      "id": 1,
      "options": { "showTime": true, "sortOrder": "Descending" },
      "targets": [
        {
          "expr": "{compose_service=~\".+\"} |= \"event=llm_json_call\"",
          "refId": "A"
        }
      ],
      "title": "LLM JSON calls (event=llm_json_call)",
      "type": "logs"
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 },
      "id": 2,
      "options": { "showTime": true, "sortOrder": "Descending" },
      "targets": [
        {
          "expr": "{compose_service=~\".+\"} |= \"event=celery_task_failed\"",
          "refId": "A"
        }
      ],
      "title": "Celery task failures",
      "type": "logs"
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
      "id": 3,
      "options": { "showTime": true, "sortOrder": "Descending" },
      "targets": [
        {
          "expr": "{compose_service=~\".+\"} | trace_id=\"$trace_id\"",
          "refId": "A"
        }
      ],
      "title": "Logs by trace_id",
      "type": "logs"
    }
  ],
  "schemaVersion": 39,
  "tags": ["life-echo", "logs"],
  "templating": {
    "list": [
      {
        "current": { "text": "", "value": "" },
        "label": "trace_id",
        "name": "trace_id",
        "options": [],
        "query": "",
        "type": "textbox"
      }
    ]
  },
  "time": { "from": "now-1h", "to": "now" },
  "title": "Life Echo Logs",
  "uid": "life-echo-logs",
  "version": 1
}

View File

@@ -0,0 +1,154 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "reqps" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 8, "x": 0, "y": 0 },
      "id": 1,
      "options": { "legend": { "displayMode": "list", "placement": "bottom" } },
      "targets": [
        {
          "expr": "sum(rate(http_server_request_duration_seconds_count[5m]))",
          "legendFormat": "HTTP requests/s",
          "refId": "A"
        }
      ],
      "title": "API request rate",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 8, "x": 8, "y": 0 },
      "id": 2,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000",
          "legendFormat": "p95",
          "refId": "A"
        }
      ],
      "title": "API latency p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 8, "x": 16, "y": 0 },
      "id": 3,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(llm_call_duration_milliseconds_bucket[5m])) by (le, agent, provider))",
          "legendFormat": "{{agent}} / {{provider}}",
          "refId": "A"
        }
      ],
      "title": "LLM call duration p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
      "id": 4,
      "targets": [
        {
          "expr": "sum(rate(llm_call_total[5m])) by (outcome)",
          "legendFormat": "{{outcome}}",
          "refId": "A"
        }
      ],
      "title": "LLM calls by outcome",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
      "id": 5,
      "options": { "showTime": true, "sortOrder": "Descending" },
      "targets": [
        {
          "expr": "{compose_service=~\".+\"} |= \"llm_json_call\"",
          "refId": "A"
        }
      ],
      "title": "LLM JSON call logs",
      "type": "logs"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
      "id": 6,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(db_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
          "legendFormat": "DB p95",
          "refId": "A"
        }
      ],
      "title": "DB client latency p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
      "id": 7,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(http_client_request_duration_seconds_bucket[5m])) by (le)) * 1000",
          "legendFormat": "HTTP client p95",
          "refId": "A"
        }
      ],
      "title": "Outbound HTTP latency p95",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "percentunit" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
      "id": 8,
      "targets": [
        {
          "expr": "sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count[5m])), 1e-9)",
          "legendFormat": "5xx rate",
          "refId": "A"
        }
      ],
      "title": "HTTP 5xx error rate",
      "type": "timeseries"
    },
    {
      "datasource": { "type": "prometheus", "uid": "Prometheus" },
      "fieldConfig": { "defaults": { "unit": "ms" }, "overrides": [] },
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
      "id": 9,
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum(rate(redis_client_operation_duration_seconds_bucket[5m])) by (le)) * 1000",
          "legendFormat": "Redis p95",
          "refId": "A"
        }
      ],
      "title": "Redis client latency p95",
      "type": "timeseries"
    }
  ],
  "refresh": "30s",
  "schemaVersion": 39,
  "tags": ["life-echo"],
  "templating": { "list": [] },
  "time": { "from": "now-1h", "to": "now" },
  "timepicker": {},
  "timezone": "browser",
  "title": "Life Echo Overview",
  "uid": "life-echo-overview",
  "version": 1
}

View File

@@ -0,0 +1,4 @@
apiVersion: 1
# Local dev placeholder: no real notification channel is configured.
# Slack/Webhook contact points can be bound in the Grafana UI.
contactPoints: []

View File

@@ -0,0 +1,147 @@
apiVersion: 1
# Grafana-managed alert rules for the Life Echo stack.
# Every rule follows the same three-step shape:
#   A = datasource query, B = reduce A to its last value, C = threshold on B.
groups:
  - orgId: 1
    name: life-echo-alerts
    folder: Life Echo
    interval: 1m
    rules:
      # API p95 latency (seconds histogram converted to ms) above 2000 ms.
      - uid: life_echo_api_p95_high
        title: API latency p95 > 2s
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [2000] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: API p95 latency above 2s for 5 minutes
        labels:
          severity: warning
      # Share of LLM calls with outcome="error"; clamp_min avoids dividing by
      # zero when there is no traffic.
      - uid: life_echo_llm_error_rate
        title: LLM error rate > 5%
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: sum(rate(llm_call_total{outcome="error"}[5m])) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0.05] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: LLM call error rate above 5%
        labels:
          severity: warning
      # Fires when the otel-collector scrape target is down (up < 1). Missing
      # data is treated as alerting, since no `up` series also means no scrape.
      - uid: life_echo_otel_collector_down
        title: OTel Collector scrape down
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 120, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: up{job="otel-collector"}
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: lt, params: [1] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: Alerting
        execErrState: Error
        for: 2m
        annotations:
          summary: Prometheus cannot scrape otel-collector
        labels:
          severity: critical
      # Any "event=celery_task_failed" log line within the last 5 minutes.
      - uid: life_echo_celery_task_failed
        title: Celery task failures detected
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: loki
            model:
              expr: sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m]))
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        # An empty result means no failure lines were logged, which is the
        # healthy state, so it must not surface as a NoData alert.
        noDataState: OK
        execErrState: Error
        # The query already looks back 5 minutes. A pending period of the same
        # length would let a single failure slide out of the window before the
        # rule could fire, so alert on the first breaching evaluation.
        for: 0s
        annotations:
          summary: Celery task failure logs in last 5 minutes
        labels:
          severity: warning

View File

@@ -0,0 +1,11 @@
apiVersion: 1
# File-based dashboard provider: dashboards are read from the path below and
# placed in the "Life Echo" folder of org 1.
providers:
  - name: Life Echo
    orgId: 1
    folder: Life Echo
    type: file
    disableDeletion: false
    editable: true
    options:
      # NOTE(review): presumably the dashboard JSON files are mounted here by
      # the compose overlay; confirm against the volume definition.
      path: /etc/grafana/dashboards

View File

@@ -0,0 +1,43 @@
apiVersion: 1
# Provisioned data sources. Each one pins an explicit `uid` because dashboards,
# alert rules and cross-datasource links refer to data sources by uid; when
# `uid` is omitted Grafana generates one, and those references may not resolve.
datasources:
  - name: Prometheus
    type: prometheus
    # Referenced as "Prometheus" by the dashboards and alert rules.
    uid: Prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
  - name: Tempo
    type: tempo
    # Referenced as "tempo" by the Loki derived field below.
    uid: tempo
    access: proxy
    url: http://tempo:3200
    editable: false
    jsonData:
      httpMethod: GET
      tracesToLogsV2:
        datasourceUid: loki
        spanStartTimeShift: '-1m'
        spanEndTimeShift: '1m'
        filterByTraceID: true
        filterBySpanID: false
        customQuery: true
        # `$$` escapes `$` so provisioning does not expand it as an env var.
        query: '{container=~".+"} | json | trace_id="$${__trace.traceId}"'
      serviceMap:
        # Must equal the Prometheus data source uid above (case-sensitive).
        datasourceUid: Prometheus
      nodeGraph:
        enabled: true
  - name: Loki
    type: loki
    uid: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      derivedFields:
        # Turns a JSON "trace_id" in a log line into a link to the Tempo trace.
        - datasourceUid: tempo
          matcherRegex: '"trace_id":"([a-f0-9]+)"'
          name: TraceID
          url: "$${__value.raw}"
          urlDisplayLabel: View Trace

View File

@@ -0,0 +1,32 @@
# Single-instance Loki configuration: auth disabled, filesystem storage,
# in-memory ring.
auth_enabled: false
server:
  http_listen_port: 3100
common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
schema_config:
  configs:
    # TSDB index with schema v13, one index table per 24h.
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h
limits_config:
  # 168h = 7 days.
  # NOTE(review): no compactor section with retention enabled is visible in
  # this file; confirm that retention is actually enforced.
  retention_period: 168h
ruler:
  # NOTE(review): no Alertmanager service is visible in this stack; confirm
  # this URL is intentional.
  alertmanager_url: http://localhost:9093

View File

@@ -0,0 +1,53 @@
# OpenTelemetry Collector: receives OTLP (gRPC 4317 / HTTP 4318) and fans out
# traces to Tempo, metrics to a Prometheus scrape endpoint, and logs to Loki.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
processors:
  batch:
    timeout: 5s
    send_batch_size: 1024
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128
  # Stamps every signal with deployment.environment=development.
  resource:
    attributes:
      - key: deployment.environment
        value: development
        action: upsert
exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  # Scrape endpoint exposed for Prometheus.
  prometheus:
    endpoint: 0.0.0.0:8889
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    tls:
      insecure: true
extensions:
  health_check:
    endpoint: 0.0.0.0:13133
service:
  extensions: [health_check]
  # A processor only runs if a pipeline lists it. `resource` was defined above
  # but never referenced, so the environment attribute was never applied.
  # memory_limiter stays first so it can shed load before other work is done.
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, resource, batch]
      exporters: [otlp/tempo]
    metrics:
      receivers: [otlp]
      processors: [memory_limiter, resource, batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [memory_limiter, resource, batch]
      exporters: [loki]

View File

@@ -0,0 +1,12 @@
# Prometheus configuration: scrape and evaluate rules every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]
  # NOTE(review): port 8889 presumably matches the collector's Prometheus
  # exporter endpoint; confirm against the collector config.
  - job_name: otel-collector
    static_configs:
      - targets: ["otel-collector:8889"]

View File

@@ -0,0 +1,41 @@
# Promtail: discovers Docker containers through the Docker socket and pushes
# their logs to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0
positions:
  filename: /tmp/positions.yaml
clients:
  - url: http://loki:3100/loki/api/v1/push
scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Container names are reported with a leading "/"; keep the rest.
      - source_labels: ["__meta_docker_container_name"]
        regex: "/(.*)"
        target_label: container
      - source_labels: ["__meta_docker_container_log_stream"]
        target_label: stream
      - source_labels: ["__meta_docker_container_label_com_docker_compose_service"]
        target_label: compose_service
    pipeline_stages:
      # key=value extraction for plain-text log lines.
      - regex:
          expression: '(?:tid=|trace_id=)(?P<trace_id>[0-9a-f]{12,32})'
      - regex:
          expression: 'event=(?P<event>[a-zA-Z0-9_.-]+)'
      - regex:
          expression: 'duration_ms=(?P<duration_ms>[0-9.]+)'
      # Field extraction for JSON log lines.
      - json:
          expressions:
            trace_id: trace_id
            span_id: span_id
            request_id: request_id
            event: event
      # Per-request identifiers are unbounded. As indexed labels they would
      # create one Loki stream per request, so they are attached as structured
      # metadata instead and remain filterable with `| request_id="..."`.
      - structured_metadata:
          trace_id:
          request_id:
      # Only the event name is promoted to an indexed label.
      - labels:
          event:

View File

@@ -0,0 +1,29 @@
# Tempo configuration: HTTP API on 3200, OTLP gRPC ingest on 4317, local disk
# storage for trace blocks and the WAL.
server:
  http_listen_port: 3200
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
ingester:
  max_block_duration: 5m
compactor:
  compaction:
    # 48h = 2 days.
    block_retention: 48h
storage:
  trace:
    backend: local
    local:
      path: /var/tempo/traces
    wal:
      path: /var/tempo/wal
query_frontend:
  search:
    duration_slo: 5s
    # 1.073741824e+09 bytes = 1 GiB.
    throughput_bytes_slo: 1.073741824e+09