# life-echo/api/deploy/observability/grafana/provisioning/alerting/rules.yml

# Version of the provisioning file format used by this document.
apiVersion: 1

# Alert rule groups defined by this file. Each entry carries the group's
# organisation, name, folder, interval and the list of rules it contains.
groups:
  - orgId: 1
    name: life-echo-alerts
    folder: Life Echo
    # Interval configured for this group: one minute.
    interval: 1m
    # Every rule below follows the same three-step shape: a datasource query
    # (refId A), a reduce expression (refId B) and a threshold expression
    # (refId C) that is named as the rule's `condition`.
    rules:
      # Warning: p95 of HTTP server request duration above 2s.
      # The query multiplies the quantile by 1000 and the threshold is 2000,
      # which matches the "2s" in the title and summary.
      - uid: life_echo_api_p95_high
        title: 'API latency p95 > 2s'
        labels:
          severity: warning
        annotations:
          summary: API p95 latency above 2s for 5 minutes
        for: 5m
        noDataState: NoData
        execErrState: Error
        condition: C
        data:
          # A: 0.95 quantile over the request-duration histogram buckets,
          # aggregated by `le` only, using a 5m rate window.
          - refId: A
            datasourceUid: Prometheus
            relativeTimeRange:
              from: 300
              to: 0
            model:
              refId: A
              expr: >-
                histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le)) * 1000
          # B: reduce A to its last value.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              reducer: last
              expression: A
          # C: rule condition, true when B is greater than 2000.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 2000
                  operator:
                    type: and
                  reducer:
                    type: last

      # Warning: share of LLM calls with outcome="error" above 5% (0.05).
      - uid: life_echo_llm_error_rate
        title: LLM error rate > 5%
        condition: C
        data:
          # A: error rate = errored calls / all calls over a 5m rate window.
          #   - `or vector(0)` on the numerator: when no series with
          #     outcome="error" exists (e.g. no error has been recorded yet),
          #     the numerator would otherwise be empty and the whole division
          #     would return no data instead of a 0% error rate.
          #   - `clamp_min(..., 1e-9)` on the denominator avoids dividing by
          #     zero when the call rate is 0.
          # If `llm_call_total` has no series at all the query still returns
          # nothing, so `noDataState` below continues to cover that case.
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: Prometheus
            model:
              expr: >-
                (sum(rate(llm_call_total{outcome="error"}[5m])) or vector(0)) / clamp_min(sum(rate(llm_call_total[5m])), 1e-9)
              refId: A
          # B: reduce A to its last value.
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          # C: rule condition, true when B is greater than 0.05.
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0.05] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: LLM call error rate above 5%
        labels:
          severity: warning

      # Critical: the `up` series for job "otel-collector" is below 1.
      # A missing series is treated as alerting as well (noDataState).
      - uid: life_echo_otel_collector_down
        title: OTel Collector scrape down
        labels:
          severity: critical
        annotations:
          summary: Prometheus cannot scrape otel-collector
        for: 2m
        noDataState: Alerting
        execErrState: Error
        condition: C
        data:
          # A: `up` for the otel-collector job over the last 120 seconds.
          - refId: A
            datasourceUid: Prometheus
            relativeTimeRange:
              from: 120
              to: 0
            model:
              refId: A
              expr: 'up{job="otel-collector"}'
          # B: reduce A to its last value.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              reducer: last
              expression: A
          # C: rule condition, true when B is less than 1.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
                  operator:
                    type: and
                  reducer:
                    type: last

      # Warning: at least one log line containing "event=celery_task_failed"
      # was seen in the last 5 minutes, across all streams that carry a
      # non-empty `compose_service` label.
      - uid: life_echo_celery_task_failed
        title: Celery task failures detected
        condition: C
        data:
          # A: number of matching log lines in a 5m window, summed over all
          # matching streams.
          - refId: A
            relativeTimeRange: { from: 300, to: 0 }
            datasourceUid: loki
            model:
              expr: >-
                sum(count_over_time({compose_service=~".+"} |= "event=celery_task_failed" [5m]))
              refId: A
          # B: reduce A to its last value.
          - refId: B
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          # C: rule condition, true when B is greater than 0.
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator: { type: gt, params: [0] }
                  operator: { type: and }
                  reducer: { type: last }
              refId: C
        # When no log line matches, the query returns no series at all. That
        # is the healthy state for this rule, so "no data" maps to OK rather
        # than raising a no-data alert whenever there are no failures.
        noDataState: OK
        execErrState: Error
        # Fire on the first evaluation that sees a failure. The query already
        # looks back 5 minutes; with an additional 5m pending period and a 1m
        # group interval, an isolated failure would leave the 5m window
        # before the pending period elapsed and the rule would never fire.
        for: 0s
        annotations:
          summary: Celery task failure logs in last 5 minutes
        labels:
          severity: warning