[rhythm] Add partition lag alerts for Kafka consumers (#4830)

* Add partition lag alerts for Kafka consumers

* Don't filter by group

* fmt and compile

* Add configurable group filter for partition lag alerts
This commit is contained in:
Mario
2025-03-11 12:38:08 +01:00
committed by GitHub
parent f40dd6fb82
commit 259b765971
5 changed files with 76 additions and 3 deletions

View File

@ -8,7 +8,7 @@
"subdir": "ksonnet-util" "subdir": "ksonnet-util"
} }
}, },
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7", "version": "e6f3db92b7d61f348aed812087f2865b207d7148",
"sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0=" "sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0="
}, },
{ {
@ -18,7 +18,7 @@
"subdir": "memcached" "subdir": "memcached"
} }
}, },
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7", "version": "e6f3db92b7d61f348aed812087f2865b207d7148",
"sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8=" "sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8="
}, },
{ {
@ -28,7 +28,7 @@
"subdir": "1.29" "subdir": "1.29"
} }
}, },
"version": "f8b0d65c573f3b36040258fa69e90e13e7129083", "version": "84aed0f9591ba86a3035e7ebe3557cb012039890",
"sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI=" "sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI="
}, },
{ {

View File

@ -167,3 +167,21 @@
"for": "5m" "for": "5m"
"labels": "labels":
"severity": "critical" "severity": "critical"
- "alert": "TempoPartitionLagWarning"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300
"for": "5m"
"labels":
"severity": "warning"
- "alert": "TempoPartitionLagCritical"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900
"for": "5m"
"labels":
"severity": "critical"

View File

@ -265,6 +265,34 @@
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors', runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors',
}, },
}, },
{
// Warning-level alert on Kafka consumer-group partition lag.
// Takes the max of tempo_ingest_group_partition_lag_seconds per
// (group_by_cluster labels, group, partition) and fires once that value has
// stayed above $._config.alerts.partition_lag_warning_seconds for 5 minutes.
alert: 'TempoPartitionLagWarning',
// Format arguments, in order: aggregation labels, namespace matcher,
// consumer-group regex matcher, lag threshold (seconds).
expr: |||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_warning_seconds],
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
// The message is formatted with the same threshold used in expr; the name of
// the cluster label it references is taken from $._config.per_cluster_label.
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_warning_seconds, $._config.per_cluster_label],
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
},
},
{
// Critical-level alert on Kafka consumer-group partition lag.
// Uses the same query shape as TempoPartitionLagWarning, but compares against
// $._config.alerts.partition_lag_critical_seconds and carries severity 'critical'.
alert: 'TempoPartitionLagCritical',
// Format arguments, in order: aggregation labels, namespace matcher,
// consumer-group regex matcher, lag threshold (seconds).
expr: |||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_critical_seconds],
// The threshold must be exceeded continuously for 5 minutes before firing.
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
// The message is formatted with the same threshold used in expr; the name of
// the cluster label it references is taken from $._config.per_cluster_label.
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_critical_seconds, $._config.per_cluster_label],
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
},
},
], ],
}, },
], ],

View File

@ -25,6 +25,11 @@
p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof', p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
outstanding_blocks_warning: 100, outstanding_blocks_warning: 100,
outstanding_blocks_critical: 250, outstanding_blocks_critical: 250,
// Partition lag thresholds in seconds
partition_lag_warning_seconds: 300, // 5 minutes
partition_lag_critical_seconds: 900, // 15 minutes
// Regex matched against the `group` label (group=~"...") to select which consumer groups are monitored for partition lag
partition_lag_group_filter: 'metrics-generator|block-builder',
}, },
per_cluster_label: 'cluster', per_cluster_label: 'cluster',

View File

@ -233,3 +233,25 @@ meta.json. Repair the meta.json and then restart the ingester to successfully r
it is not able to be repaired then the block files can be simply deleted as the ingester has already started it is not able to be repaired then the block files can be simply deleted as the ingester has already started
without it. As long as the replication factor is 2 or higher, then there will be no data loss as the without it. As long as the replication factor is 2 or higher, then there will be no data loss as the
same data was also written to another ingester. same data was also written to another ingester.
## TempoPartitionLag
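The `TempoPartitionLagWarning` and `TempoPartitionLagCritical` alerts fire when a Kafka partition in a monitored consumer group has been lagging behind the latest offset for at least 5 minutes. With the default configuration, the warning fires at more than 300 seconds of lag and the critical at more than 900 seconds.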
### Troubleshooting
1. Check the general health of the affected component (block-builder or metrics-generator):
- Review logs for errors or warnings related to Kafka consumption
- Check if the component is experiencing high CPU or memory usage
- Look for any unusual patterns in processing time or error rates
2. Check the health of the Kafka cluster:
- Verify broker health and connectivity
- Check if there are any network issues between Tempo and Kafka
- Examine Kafka metrics for unusual patterns (high produce rate, throttling, etc.)
3. Possible resolutions:
- Scale up the consumer group by adding more instances
- Increase resources (CPU/memory) for the consumer instances
- Check for and fix any bottlenecks in the processing pipeline
- If the lag is temporary due to a spike in traffic, monitor to see if it recovers