148 lines
4.3 KiB
YAML
148 lines
4.3 KiB
YAML
|
|
# Grafana alert-rule provisioning for the "Life Echo" folder.
#
# Every rule below uses the same three-step pipeline:
#   A - datasource query (Prometheus or Loki)
#   B - reduce A to a single value using the last sample
#   C - threshold on B; C is the rule's alert condition
apiVersion: 1

groups:
  - orgId: 1
    name: life-echo-alerts
    folder: Life Echo
    interval: 1m
    rules:
      # p95 HTTP server request duration. The query multiplies the
      # seconds-based histogram quantile by 1000, so the threshold of 2000
      # is in milliseconds (2 s).
      - uid: life_echo_api_p95_high
        title: API latency p95 > 2s
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: 'histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000'
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [2000] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: API p95 latency above 2s for 5 minutes
        labels:
          severity: warning

      # Share of LLM calls whose outcome is "error" over the last 5 minutes.
      # - `or vector(0)` keeps the numerator defined when the query returns no
      #   error-outcome series, so that case evaluates to a 0 ratio instead of
      #   an empty result (which would put the rule into the NoData state).
      # - `clamp_min(..., 1e-9)` guards the division when the total call rate
      #   is zero.
      # If llm_call_total has no series at all the result is still empty and
      # noDataState applies.
      - uid: life_echo_llm_error_rate
        title: LLM error rate > 5%
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: '(sum(rate(llm_call_total{outcome="error"}[5m])) or vector(0)) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)'
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0.05] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: LLM call error rate above 5%
        labels:
          severity: warning

      # Scrape health of the otel-collector job: `up` below 1 means the last
      # scrape failed. A missing series is treated as alerting as well
      # (noDataState: Alerting).
      - uid: life_echo_otel_collector_down
        title: OTel Collector scrape down
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 120, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: 'up{job="otel-collector"}'
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: lt, params: [1] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: Alerting
        execErrState: Error
        for: 2m
        annotations:
          summary: Prometheus cannot scrape otel-collector
        labels:
          severity: critical

      # Counts log lines containing "event=celery_task_failed" across all
      # compose services during the last 5 minutes (Loki).
      - uid: life_echo_celery_task_failed
        title: Celery task failures detected
        condition: C
        data:
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: loki
            model:
              expr: 'sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m]))'
              refId: A
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        # An empty result is the expected outcome when no failure lines were
        # logged, so "no data" is mapped to OK rather than the NoData state.
        noDataState: OK
        execErrState: Error
        # The query already looks back 5m. A pending period of the same length
        # let an isolated failure fall out of that window before the rule
        # could fire, so fire on the first evaluation that sees a failure.
        for: 0s
        annotations:
          summary: Celery task failure logs in last 5 minutes
        labels:
          severity: warning
|