Mario 259b765971 [rhythm] Add partition lag alerts for Kafka consumers (#4830)
* Add partition lag alerts for Kafka consumers

* Dont filter by group

* fmt and compile

* Add configurable group filter for partition lag alerts
2025-03-11 12:38:08 +01:00

188 lines
10 KiB
YAML

"groups":
- "name": "tempo_alerts"
"rules":
- "alert": "TempoCompactorUnhealthy"
"annotations":
"message": "There are {{ printf \"%f\" $value }} unhealthy compactor(s)."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy"
"expr": |
max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="compactor", namespace=~".*"}) > 0
"for": "15m"
"labels":
"severity": "critical"
- "alert": "TempoDistributorUnhealthy"
"annotations":
"message": "There are {{ printf \"%f\" $value }} unhealthy distributor(s)."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy"
"expr": |
max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="distributor", namespace=~".*"}) > 0
"for": "15m"
"labels":
"severity": "warning"
- "alert": "TempoIngesterUnhealthy"
"annotations":
"message": "There are {{ printf \"%f\" $value }} unhealthy ingester(s)."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterUnhealthy"
"expr": |
max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="ingester", namespace=~".*"}) > 0
"for": "15m"
"labels":
"severity": "critical"
- "alert": "TempoMetricsGeneratorUnhealthy"
"annotations":
"message": "There are {{ printf \"%f\" $value }} unhealthy metric-generator(s)."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoMetricsGeneratorUnhealthy"
"expr": |
max by (cluster, namespace) (tempo_ring_members{state="Unhealthy", name="metrics-generator", namespace=~".*"}) > 0
"for": "15m"
"labels":
"severity": "critical"
- "alert": "TempoCompactionsFailing"
"annotations":
"message": "Greater than 2 compactions have failed in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing"
"expr": |
sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[5m])) > 0
"for": "1h"
"labels":
"severity": "critical"
- "alert": "TempoIngesterFlushesUnhealthy"
"annotations":
"message": "Greater than 2 flush retries have occurred in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing"
"expr": |
sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
"for": "5m"
"labels":
"severity": "warning"
- "alert": "TempoIngesterFlushesFailing"
"annotations":
"message": "Greater than 2 flush retries have failed in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterFlushesFailing"
"expr": |
sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempo_ingester_flush_failed_retries_total{}[5m])) > 0
"for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoPollsFailing"
"annotations":
"message": "Greater than 2 polls have failed in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing"
"expr": |
sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0
"labels":
"severity": "critical"
- "alert": "TempoTenantIndexFailures"
"annotations":
"message": "Greater than 2 tenant index failures in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures"
"expr": |
sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[1h])) > 2 and
sum by (cluster, namespace) (increase(tempodb_blocklist_tenant_index_errors_total{}[5m])) > 0
"labels":
"severity": "critical"
- "alert": "TempoNoTenantIndexBuilders"
"annotations":
"message": "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoNoTenantIndexBuilders"
"expr": |
sum by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_builder{}) == 0 and
max by (cluster, namespace) (tempodb_blocklist_length{}) > 0
"for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoTenantIndexTooOld"
"annotations":
"message": "Tenant index age is 600 seconds old for tenant {{ $labels.tenant }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexTooOld"
"expr": |
max by (cluster, namespace, tenant) (tempodb_blocklist_tenant_index_age_seconds{}) > 600
"for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoBlockListRisingQuickly"
"annotations":
"message": "Tempo block list length is up 40 percent over the last 7 days. Consider scaling compactors."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoBlockListRisingQuickly"
"expr": |
avg(tempodb_blocklist_length{namespace=~".*", container="compactor"}) / avg(tempodb_blocklist_length{namespace=~".*", container="compactor"} offset 7d) > 1.4
"for": "15m"
"labels":
"severity": "critical"
- "alert": "TempoBadOverrides"
"annotations":
"message": "{{ $labels.job }} failed to reload overrides."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoBadOverrides"
"expr": |
sum(tempo_runtime_config_last_reload_successful{namespace=~".*"} == 0) by (cluster, namespace, job)
"for": "15m"
"labels":
"severity": "warning"
- "alert": "TempoUserConfigurableOverridesReloadFailing"
"annotations":
"message": "Greater than 5 user-configurable overides reloads failed in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures"
"expr": |
sum by (cluster, namespace) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[1h])) > 5 and
sum by (cluster, namespace) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0
"labels":
"severity": "critical"
- "alert": "TempoProvisioningTooManyWrites"
"annotations":
"message": "Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} are receiving more data/second than desired, add more ingesters."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoProvisioningTooManyWrites"
"expr": |
avg by (cluster, namespace) (rate(tempo_ingester_bytes_received_total{job=~".+/ingester"}[5m])) / 1024 / 1024 > 30
"for": "15m"
"labels":
"severity": "warning"
- "alert": "TempoCompactorsTooManyOutstandingBlocks"
"annotations":
"message": "There are too many outstanding compaction blocks in {{ $labels.cluster }}/{{ $labels.namespace }} for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks"
"expr": |
sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 100
"for": "6h"
"labels":
"severity": "warning"
- "alert": "TempoCompactorsTooManyOutstandingBlocks"
"annotations":
"message": "There are too many outstanding compaction blocks in {{ $labels.cluster }}/{{ $labels.namespace }} for tenant {{ $labels.tenant }}, increase compactor's CPU or add more compactors."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorsTooManyOutstandingBlocks"
"expr": |
sum by (cluster, namespace, tenant) (tempodb_compaction_outstanding_blocks{container="compactor", namespace=~".*"}) / ignoring(tenant) group_left count(tempo_build_info{container="compactor", namespace=~".*"}) by (cluster, namespace) > 250
"for": "24h"
"labels":
"severity": "critical"
- "alert": "TempoIngesterReplayErrors"
"annotations":
"message": "Tempo ingester has encountered errors while replaying a block on startup in {{ $labels.cluster }}/{{ $labels.namespace }} for tenant {{ $labels.tenant }}"
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors"
"expr": |
sum by (cluster, namespace, tenant) (increase(tempo_ingester_replay_errors_total{namespace=~".*"}[5m])) > 0
"for": "5m"
"labels":
"severity": "critical"
- "alert": "TempoPartitionLagWarning"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300
"for": "5m"
"labels":
"severity": "warning"
- "alert": "TempoPartitionLagCritical"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900
"for": "5m"
"labels":
"severity": "critical"