feat: OpenTelemetry LGTM observability, dev tooling, and memoir UX fixes (#31)

* add staging iOS app build script

* feat(api): add OpenTelemetry LGTM stack for local observability

Wire OTel traces, metrics, and logs through a collector to Tempo,
Prometheus, and Loki, with custom LLM instrumentation, dev compose overlay,
Grafana provisioning, env templates, and development.sh auto-start.

Co-authored-by: Cursor <cursoragent@cursor.com>

* feat: expand observability, harden dev tooling, and fix expo staging UX

Add business and LLM Prometheus metrics with Grafana dashboards, alerting,
and a metrics verification script. Wire telemetry through adapters and core
LLM paths, and document the local LGTM workflow.

Fix development.sh for macOS bash 3.2, open Grafana and eval-web in Chrome,
and repair eval-web auto-open (unbound EVAL_WEB_BROWSER_SCHEDULED). Merge
internal-eval into the main dev script with improved compose handling.

Require EXPO_PUBLIC_* at build time, improve iOS HTTP ATS for staging IPs,
show memoir empty state instead of load errors when no chapters exist, and
add jest env setup plus chapter list response normalization.

Co-authored-by: Cursor <cursoragent@cursor.com>

* chore: enable Grafana Assistant Cursor plugin

Co-authored-by: Cursor <cursoragent@cursor.com>

* fix: memoir empty state and repair withdrawn 0020_chapters_book_id stamp

Show empty memoir UI when the chapter list succeeds with no items; treat auth/404 as non-fatal. Extend alembic revision repair so local dev DBs stamped with the removed 0020_chapters_book_id migration can roll back and upgrade to 0019.

Co-authored-by: Cursor <cursoragent@cursor.com>

---------

Co-authored-by: Kevin <kevin@brighteng.org>
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Sully
2026-05-20 15:12:21 +08:00
committed by GitHub
parent 0d417331fd
commit fa42757916
85 changed files with 3894 additions and 405 deletions

View File

@@ -25,10 +25,19 @@ API_PORT="${API_PORT:-8000}"
# Developer-tunable settings. Each one keeps a value already present in the
# environment and otherwise falls back to the default shown here.
CELERY_POOL="${CELERY_POOL:-solo}"
SKIP_INSTALL="${SKIP_INSTALL:-0}"
SKIP_INFRA="${SKIP_INFRA:-0}"
# Observability stack: empty = start the compose overlay when OTEL_ENABLED is
# truthy in .env; 0 = never start it; 1 = always start it.
START_OBSERVABILITY="${START_OBSERVABILITY:-}"
SHUTDOWN_TIMEOUT="${SHUTDOWN_TIMEOUT:-12}"
# Host ports; keep in sync with the defaults in
# docker-compose.observability.yml / .env.example.
OTEL_GRPC_HOST_PORT="${OTEL_GRPC_HOST_PORT:-48317}"
GRAFANA_HOST_PORT="${GRAFANA_HOST_PORT:-48300}"
PROMETHEUS_HOST_PORT="${PROMETHEUS_HOST_PORT:-49090}"
# internal_main + app-eval-web are started alongside the main site by default;
# set to 0 to run the main site only.
# (A leftover `:-0` assignment used to precede this line and made the `:-1`
# default unreachable; it has been dropped.)
LIFE_ECHO_WITH_INTERNAL_EVAL="${LIFE_ECHO_WITH_INTERNAL_EVAL:-1}"
# Open Grafana / the eval web UI in Google Chrome automatically. Vite's --open
# is deliberately not used because it may land in Safari.
OPEN_OBSERVABILITY_UI="${OPEN_OBSERVABILITY_UI:-1}"
# When :8000 is already held by another development instance, attach only
# :8001 + the frontend (the same Celery / main site must be provided separately).
EVAL_ATTACH_ONLY="${EVAL_ATTACH_ONLY:-0}"
INTERNAL_EVAL_HOST="${INTERNAL_EVAL_HOST:-0.0.0.0}"
@@ -43,6 +52,9 @@ INTERNAL_EVAL_PID=""
# PID of the background eval-web dev server; empty until start_eval_web sets it.
EVAL_WEB_PID=""
# NOTE(review): presumably a guard so cleanup runs only once; its readers are
# outside this view, confirm before relying on it.
CLEANED_UP=0
# Set to 1 by start_infra once the compose stack has been brought up; cleanup
# stops the containers only when this is 1.
INFRA_STARTED=0
# Set to 1 by start_infra when the observability overlay is part of the stack.
OBSERVABILITY_STARTED=0
# One-shot guards: each browser tab is scheduled at most once per run.
OBSERVABILITY_BROWSER_SCHEDULED=0
EVAL_WEB_BROWSER_SCHEDULED=0
print_header() {
echo -e "\n${BLUE}========================================${NC}"
@@ -62,6 +74,64 @@ print_err() {
echo -e "${RED}$1${NC}"
}
#######################################
# Open a URL in Google Chrome (macOS) or the first Chrome/Chromium binary
# found (other platforms).
# Arguments: $1 - URL to open
# Outputs:   a warning via print_warn when no suitable browser is available
# Returns:   0 when a browser was launched, 1 otherwise
#######################################
open_browser_url() {
  local target="$1"
  local candidate

  # macOS: go through `open -a` so the URL lands in Chrome rather than the
  # system default browser.
  if command -v open >/dev/null 2>&1 && [[ "$(uname -s)" == "Darwin" ]]; then
    open -a "Google Chrome" "${target}" >/dev/null 2>&1 && return 0
    print_warn "未找到 Google Chrome请手动打开: ${target}"
    return 1
  fi

  # Elsewhere: first matching binary wins; it is launched detached with its
  # output discarded.
  for candidate in google-chrome chromium-browser chromium; do
    command -v "${candidate}" >/dev/null 2>&1 || continue
    "${candidate}" "${target}" >/dev/null 2>&1 &
    return 0
  done

  print_warn "未找到 Chrome/Chromium请手动打开: ${target}"
  return 1
}
#######################################
# Schedule a one-time, delayed opening of the Grafana UI in the browser.
# Globals:   OPEN_OBSERVABILITY_UI (read), GRAFANA_HOST_PORT (read),
#            OBSERVABILITY_BROWSER_SCHEDULED (read, written)
# Returns:   0 when nothing is scheduled; otherwise the status of print_ok
#######################################
schedule_observability_browser() {
  # Nothing to do when the feature is off or a tab was already scheduled.
  [[ "${OPEN_OBSERVABILITY_UI}" == "1" ]] || return 0
  [[ "${OBSERVABILITY_BROWSER_SCHEDULED}" != "1" ]] || return 0

  OBSERVABILITY_BROWSER_SCHEDULED=1
  local dashboard="http://127.0.0.1:${GRAFANA_HOST_PORT}"

  # Background job: wait a few seconds, then open the page.
  {
    sleep 4
    open_browser_url "${dashboard}"
  } &
  print_ok "将自动打开 Grafana: ${dashboard}"
}
#######################################
# Schedule a one-time opening of the eval web UI once its port is listening.
# Globals:   OPEN_EVAL_WEB (read), EVAL_WEB_PORT (read),
#            EVAL_WEB_BROWSER_SCHEDULED (read, written)
# Returns:   0 when nothing is scheduled; otherwise the status of print_ok
#######################################
schedule_eval_web_browser() {
  [[ "${OPEN_EVAL_WEB}" == "1" ]] || return 0
  # The :-0 default keeps this safe when the flag was never initialised.
  [[ "${EVAL_WEB_BROWSER_SCHEDULED:-0}" != "1" ]] || return 0

  EVAL_WEB_BROWSER_SCHEDULED=1
  local page="http://127.0.0.1:${EVAL_WEB_PORT}/"

  (
    # Poll once per second, at most 30 times; the page is opened afterwards
    # whether or not the port came up.
    local attempt=0
    while ((attempt < 30)) && ! is_port_listening "${EVAL_WEB_PORT}"; do
      sleep 1
      attempt=$((attempt + 1))
    done
    open_browser_url "${page}"
  ) &
  print_ok "将自动打开评测 Web (Chrome): ${page}"
}
is_pid_alive() {
local pid="$1"
[[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null
@@ -147,11 +217,9 @@ cleanup() {
fi
if [[ "${INFRA_STARTED}" == "1" ]]; then
print_warn "正在停止 PostgreSQL / Redis 容器..."
(
cd "${ROOT_DIR}" && docker compose -f docker-compose.dev.yml stop
) >/dev/null 2>&1 || true
print_ok "PostgreSQL/Redis 容器已停止"
print_warn "正在停止 Docker 基础设施..."
docker_compose_cmd stop >/dev/null 2>&1 || true
print_ok "Docker 容器已停止"
fi
}
@@ -163,12 +231,107 @@ require_cmd() {
fi
}
#######################################
# Evaluate a boolean setting. A non-empty process environment variable wins;
# otherwise the last `KEY=value` line of ${ROOT_DIR}/.env is used.
#
# Truthy values: 1, true, TRUE, yes, YES, on, ON. Any other explicit value is
# false. The default applies only when the key is absent or empty everywhere,
# so an explicit `KEY=false` in .env is not overridden by a default of 1.
#
# Globals:   ROOT_DIR (read)
# Arguments: $1 - variable name
#            $2 - default, "1" for true (optional, defaults to "0")
# Returns:   0 when the setting is true, 1 otherwise
#######################################
read_env_bool() {
  local key="$1"
  local default="${2:-0}"
  local env_file="${ROOT_DIR}/.env"
  local val=""

  if [[ -n "${!key:-}" ]]; then
    # ${!key} is indirect expansion (works on bash 3.2).
    val="${!key}"
  elif [[ -f "${env_file}" ]]; then
    # Last assignment wins. CRs (CRLF files) and surrounding whitespace are
    # dropped. `|| true`: grep exits non-zero when the key is absent, which
    # must not abort the caller when errexit/pipefail are active.
    val="$(
      grep -E "^${key}=" "${env_file}" \
        | tail -1 \
        | cut -d= -f2- \
        | tr -d '\r' \
        | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
    )" || true
    # Strip one leading/trailing double or single quote.
    val=${val#\"}
    val=${val%\"}
    val=${val#\'}
    val=${val%\'}
  fi

  if [[ -z "${val}" ]]; then
    [[ "${default}" == "1" ]]
    return
  fi

  case "${val}" in
    1 | true | TRUE | yes | YES | on | ON) return 0 ;;
    *) return 1 ;;
  esac
}
#######################################
# Decide whether the observability compose overlay should be started.
# An explicit START_OBSERVABILITY value wins; any other value (including
# empty) defers to OTEL_ENABLED via read_env_bool with a default of "0".
# Globals:   START_OBSERVABILITY (read)
# Returns:   0 to start the overlay, 1 otherwise
#######################################
should_start_observability() {
  local override="${START_OBSERVABILITY}"

  case "${override}" in
    1 | true | TRUE | yes | YES | on | ON)
      return 0
      ;;
    0 | false | FALSE | no | NO | off | OFF)
      return 1
      ;;
    *)
      read_env_bool "OTEL_ENABLED" "0"
      ;;
  esac
}
#######################################
# Run `docker compose` from ROOT_DIR with the right set of -f files.
# With observability enabled both compose files are passed; otherwise only
# the dev file is used and `up` additionally gets --remove-orphans.
# Kept free of arrays / `local -n` so it runs on macOS's stock bash 3.2.
# Globals:   ROOT_DIR (read)
# Arguments: the compose sub-command and its arguments
# Returns:   the exit status of `cd` or of `docker compose`
#######################################
docker_compose_cmd() {
  # Decide before changing directory, exactly once per call.
  local overlay=0
  if should_start_observability; then
    overlay=1
  fi

  # Subshell: the caller's working directory is left untouched.
  (
    cd "${ROOT_DIR}" || exit
    if [[ "${overlay}" == "1" ]]; then
      docker compose \
        -f docker-compose.dev.yml \
        -f docker-compose.observability.yml \
        "$@"
    elif [[ "$1" == "up" ]]; then
      docker compose -f docker-compose.dev.yml "$@" --remove-orphans
    else
      docker compose -f docker-compose.dev.yml "$@"
    fi
  )
}
#######################################
# Poll the OTel Collector gRPC host port once per second until it listens.
# Globals:   OTEL_GRPC_HOST_PORT (read)
# Arguments: $1 - maximum number of polls (optional, defaults to 30)
# Returns:   0 as soon as the port is listening, 1 when all polls failed
#######################################
wait_otel_collector_ready() {
  local max_attempts="${1:-30}"
  local attempt

  for ((attempt = 0; attempt < max_attempts; attempt++)); do
    if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then
      return 0
    fi
    sleep 1
  done
  return 1
}
#######################################
# Verify that the OTel Collector port is reachable when OTEL_ENABLED is on.
# If this run started the observability stack, the port gets up to 45 polls
# to come up; otherwise a single check is made. Failure prints hints.
# Globals:   OTEL_GRPC_HOST_PORT (read), OBSERVABILITY_STARTED (read)
# Returns:   0 when OTEL is disabled or the port is listening, 1 otherwise
#######################################
check_otel_collector_ready() {
  # Nothing to verify when telemetry is switched off.
  read_env_bool "OTEL_ENABLED" "0" || return 0

  local reachable=0
  if is_port_listening "${OTEL_GRPC_HOST_PORT}"; then
    reachable=1
  elif [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then
    # The containers were started by this run; give the collector time.
    print_warn "等待 OTel Collector 端口 :${OTEL_GRPC_HOST_PORT}"
    if wait_otel_collector_ready 45; then
      reachable=1
    fi
  fi

  if [[ "${reachable}" == "1" ]]; then
    print_ok "OTel Collector 端口已监听 (:${OTEL_GRPC_HOST_PORT})"
    return 0
  fi

  print_warn "OTEL_ENABLED=true 但 :${OTEL_GRPC_HOST_PORT} 未监听"
  print_warn "请确认本次启动日志中有「启动可观测性栈」;或手动执行:"
  print_warn " docker compose -f docker-compose.dev.yml -f docker-compose.observability.yml up -d"
  print_warn "不需要可观测性时在 .env.development 设 OTEL_ENABLED=false"
  return 1
}
#######################################
# Bring up the Docker infrastructure (PostgreSQL, Redis and, when enabled,
# the observability stack) through docker_compose_cmd.
#
# The earlier unconditional header and direct `docker compose ... up -d`
# call are gone: they duplicated the header and started the stack a first
# time without the observability overlay.
#
# Globals:   ROOT_DIR (read), INFRA_STARTED (written),
#            OBSERVABILITY_STARTED (written), GRAFANA_HOST_PORT,
#            PROMETHEUS_HOST_PORT, OTEL_GRPC_HOST_PORT (read)
# Returns:   1 when ROOT_DIR cannot be entered; otherwise the status of the
#            final print_ok
#######################################
start_infra() {
  # Kept from the earlier implementation so the working directory after this
  # call stays the repository root.
  # NOTE(review): docker_compose_cmd changes directory in its own subshell,
  # so this may be removable if no later step relies on the cwd. Confirm.
  cd "${ROOT_DIR}" || return 1

  if should_start_observability; then
    print_header "启动 PostgreSQL、Redis 与可观测性栈 (OTel / Grafana LGTM)"
    OBSERVABILITY_STARTED=1
  else
    print_header "启动 PostgreSQL 和 Redis"
  fi

  docker_compose_cmd up -d
  INFRA_STARTED=1

  print_ok "PostgreSQL 127.0.0.1:48291Redis 127.0.0.1:48307见 docker-compose.dev.yml / .env.example"
  if [[ "${OBSERVABILITY_STARTED}" == "1" ]]; then
    print_ok "Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} admin/admin"
    print_ok "Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}"
    print_ok "OTLP gRPC 127.0.0.1:${OTEL_GRPC_HOST_PORT}(应用读 .env 中 OTEL_*,无需 export"
    print_ok "详见 docs/observability.md"
    schedule_observability_browser
  fi
  print_ok "基础设施已就绪"
}
@@ -467,19 +630,15 @@ start_eval_web() {
exit 1
fi
local vite_extra=()
if [[ "${OPEN_EVAL_WEB}" == "1" ]]; then
vite_extra+=(--open)
fi
(
cd "${EVAL_WEB_DIR}"
VITE_EVAL_API_KEY="${api_key}" \
VITE_EVAL_PROXY_TARGET="http://127.0.0.1:${INTERNAL_EVAL_PORT}" \
npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}" "${vite_extra[@]}"
npm run dev -- --host 127.0.0.1 --port "${EVAL_WEB_PORT}"
) &
EVAL_WEB_PID=$!
print_ok "eval-web 已启动 (PID: ${EVAL_WEB_PID}) → http://127.0.0.1:${EVAL_WEB_PORT}/"
schedule_eval_web_browser
}
start_internal_eval_http() {
@@ -493,7 +652,8 @@ start_internal_eval_http() {
exit 1
fi
"${UVICORN_BIN}" app.internal_main:internal_app --reload \
OTEL_SERVICE_NAME="${INTERNAL_EVAL_OTEL_SERVICE_NAME:-life-echo-internal-api}" \
"${UVICORN_BIN}" app.internal_main:internal_app --reload \
--reload-exclude 'alembic/**' \
--reload-exclude 'alembic.ini' \
--host "${INTERNAL_EVAL_HOST}" --port "${INTERNAL_EVAL_PORT}" &
@@ -547,7 +707,7 @@ start_services() {
fi
if [[ "${skip_main}" == "1" ]] && [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" != "1" ]]; then
print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测;请使用 ./internal-eval.sh 或导出 LIFE_ECHO_WITH_INTERNAL_EVAL=1"
print_err "EVAL_ATTACH_ONLY=1 仅用于在已有主站时附加内部评测;请设置 LIFE_ECHO_WITH_INTERNAL_EVAL=1"
exit 1
fi
@@ -601,14 +761,27 @@ start_services() {
echo "主站文档: http://localhost:${API_PORT}/docs"
echo "健康检查: http://localhost:${API_PORT}/health"
fi
if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then
echo "评测 Web UI: http://127.0.0.1:${EVAL_WEB_PORT}/"
echo "内部评测 API: http://127.0.0.1:${INTERNAL_EVAL_PORT}/health"
fi
if read_env_bool "OTEL_ENABLED" "0"; then
echo "可观测性: Grafana http://127.0.0.1:${GRAFANA_HOST_PORT} | Prometheus http://127.0.0.1:${PROMETHEUS_HOST_PORT}"
if is_port_listening "${GRAFANA_HOST_PORT}"; then
schedule_observability_browser
fi
fi
if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]] && is_pid_alive "${EVAL_WEB_PID}"; then
schedule_eval_web_browser
fi
echo "按 Ctrl+C 停止所有进程"
}
main() {
if [[ "${LIFE_ECHO_WITH_INTERNAL_EVAL}" == "1" ]]; then
print_header "Life Echo 开发环境 + 内部评测(主站 + :${INTERNAL_EVAL_PORT} + Eval Web"
print_header "Life Echo 开发环境(主站 + 内部评测 + 可观测性)"
else
print_header "Life Echo 开发环境一键启动"
print_header "Life Echo 开发环境一键启动(无内部评测)"
fi
require_cmd "uv"
@@ -618,16 +791,22 @@ main() {
trap cleanup EXIT INT TERM
ensure_venv
# 必须在 start_infra 之前同步,否则 should_start_observability 读不到 .env.development 里的 OTEL_ENABLED
ensure_dotenv_from_development
if [[ "${SKIP_INFRA}" != "1" ]]; then
start_infra
wait_postgres_ready || true
else
print_warn "已跳过 docker 基础设施 (SKIP_INFRA=1)"
if should_start_observability; then
print_warn "SKIP_INFRA=1 未自动启动 observability若需 LGTM 请手动 docker compose up observability overlay"
fi
fi
ensure_venv
ensure_dotenv_from_development
check_env_file
check_otel_collector_ready || true
wait_host_infra_ready
run_migrations
start_services