---
# Grafana alert-rule provisioning for the "life-echo-alerts" rule group.
#
# Every rule below uses the same three-step pipeline:
#   A - datasource query
#   B - reduce A to its last value
#   C - threshold on B
# `condition: C` makes the threshold result decide the alert state.
apiVersion: 1

groups:
  - orgId: 1
    name: life-echo-alerts
    folder: Life Echo
    # Evaluation interval shared by every rule in this group.
    interval: 1m
    rules:
      # ----------------------------------------------------------------
      # API latency: p95 of the HTTP server request duration histogram.
      # ----------------------------------------------------------------
      - uid: life_echo_api_p95_high
        title: 'API latency p95 > 2s'
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: Prometheus
            model:
              # The histogram is in seconds; `* 1000` converts to
              # milliseconds so the threshold below (2000) means 2s.
              expr: >-
                histogram_quantile(0.95,
                sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params: [2000]
                  operator:
                    type: and
                  reducer:
                    type: last
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: API p95 latency above 2s for 5 minutes
        labels:
          severity: warning

      # ----------------------------------------------------------------
      # LLM error ratio: errored calls / all calls over 5 minutes.
      # ----------------------------------------------------------------
      - uid: life_echo_llm_error_rate
        title: 'LLM error rate > 5%'
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: Prometheus
            model:
              # `or vector(0)` keeps the ratio at 0 when no
              # outcome="error" series is present (presumably the case
              # until the first error is recorded - confirm against the
              # instrumentation), instead of yielding no data.
              # `clamp_min(..., 1e-9)` guards against division by zero
              # when there is no call traffic.
              expr: >-
                (sum(rate(llm_call_total{outcome="error"}[5m])) or vector(0))
                / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    # 0.05 == 5% expressed as a ratio.
                    params: [0.05]
                  operator:
                    type: and
                  reducer:
                    type: last
              refId: C
        # Still NoData when llm_call_total itself is absent.
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: LLM call error rate above 5%
        labels:
          severity: warning

      # ----------------------------------------------------------------
      # OTel Collector scrape health, based on the `up` series.
      # ----------------------------------------------------------------
      - uid: life_echo_otel_collector_down
        title: OTel Collector scrape down
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 120
              to: 0
            datasourceUid: Prometheus
            model:
              expr: up{job="otel-collector"}
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    # Fires when the last `up` value is below 1.
                    type: lt
                    params: [1]
                  operator:
                    type: and
                  reducer:
                    type: last
              refId: C
        # A missing `up` series is treated the same as a failed scrape.
        noDataState: Alerting
        execErrState: Error
        for: 2m
        annotations:
          summary: Prometheus cannot scrape otel-collector
        labels:
          severity: critical

      # ----------------------------------------------------------------
      # Celery task failures, counted from log lines in Loki.
      # ----------------------------------------------------------------
      - uid: life_echo_celery_task_failed
        title: Celery task failures detected
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: loki
            model:
              # Counts log lines containing the failure marker across
              # every stream that has a compose_service label.
              expr: >-
                sum(count_over_time({compose_service=~".+"}
                |= "event=celery_task_failed" [5m]))
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params: [0]
                  operator:
                    type: and
                  reducer:
                    type: last
              refId: C
        # No matching log lines is the healthy state for this rule, so
        # an empty query result must resolve to OK rather than NoData.
        noDataState: OK
        execErrState: Error
        # NOTE(review): the count window is 5m and the pending period
        # is also 5m, so a single isolated failure may leave the window
        # before the rule fires - confirm this is intended.
        for: 5m
        annotations:
          summary: Celery task failure logs in last 5 minutes
        labels:
          severity: warning