[rhythm] Add partition lag alerts for Kafka consumers (#4830)

* Add partition lag alerts for Kafka consumers

* Don't filter by group

* fmt and compile

* Add configurable group filter for partition lag alerts
This commit is contained in:
Mario
2025-03-11 12:38:08 +01:00
committed by GitHub
parent f40dd6fb82
commit 259b765971
5 changed files with 76 additions and 3 deletions

View File

@ -8,7 +8,7 @@
"subdir": "ksonnet-util"
}
},
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
"version": "e6f3db92b7d61f348aed812087f2865b207d7148",
"sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0="
},
{
@ -18,7 +18,7 @@
"subdir": "memcached"
}
},
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
"version": "e6f3db92b7d61f348aed812087f2865b207d7148",
"sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8="
},
{
@ -28,7 +28,7 @@
"subdir": "1.29"
}
},
"version": "f8b0d65c573f3b36040258fa69e90e13e7129083",
"version": "84aed0f9591ba86a3035e7ebe3557cb012039890",
"sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI="
},
{

View File

@ -167,3 +167,21 @@
"for": "5m"
"labels":
"severity": "critical"
# Warning-level partition lag alert: fires once the maximum value of
# tempo_ingest_group_partition_lag_seconds per (cluster, namespace, group,
# partition) has stayed above 300 for 5 minutes. Only the consumer groups
# matching "metrics-generator|block-builder" are evaluated.
# NOTE(review): this rule looks like rendered output of the mixin's Jsonnet
# alert of the same name - confirm before editing it by hand.
- "alert": "TempoPartitionLagWarning"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300
"for": "5m"
"labels":
"severity": "warning"
# Critical-level partition lag alert: same expression as
# TempoPartitionLagWarning but with a threshold of 900 instead of 300, so it
# fires once the maximum tempo_ingest_group_partition_lag_seconds per
# (cluster, namespace, group, partition) has stayed above 900 for 5 minutes.
# NOTE(review): this rule looks like rendered output of the mixin's Jsonnet
# alert of the same name - confirm before editing it by hand.
- "alert": "TempoPartitionLagCritical"
"annotations":
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
"expr": |
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900
"for": "5m"
"labels":
"severity": "critical"

View File

@ -265,6 +265,34 @@
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors',
},
},
// Warning-level partition lag alert. Takes the maximum of
// tempo_ingest_group_partition_lag_seconds per aggregation label set
// ($._config.group_by_cluster plus group and partition) and fires when it has
// stayed above $._config.alerts.partition_lag_warning_seconds for 5 minutes.
{
alert: 'TempoPartitionLagWarning',
// Format arguments, in order: aggregation labels, namespace matcher,
// consumer-group matcher (regex, used with =~), lag threshold (integer, %d).
expr: |||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_warning_seconds],
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
// The threshold and the cluster label name are substituted when the mixin
// is rendered; the {{ $labels.* }} placeholders are left in the output string.
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_warning_seconds, $._config.per_cluster_label],
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
},
},
// Critical-level partition lag alert. Takes the maximum of
// tempo_ingest_group_partition_lag_seconds per aggregation label set
// ($._config.group_by_cluster plus group and partition) and fires when it has
// stayed above $._config.alerts.partition_lag_critical_seconds for 5 minutes.
{
alert: 'TempoPartitionLagCritical',
// Format arguments, in order: aggregation labels, namespace matcher,
// consumer-group matcher (regex, used with =~), lag threshold (integer, %d).
expr: |||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_critical_seconds],
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
// The threshold and the cluster label name are substituted when the mixin
// is rendered; the {{ $labels.* }} placeholders are left in the output string.
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_critical_seconds, $._config.per_cluster_label],
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
},
},
],
},
],

View File

@ -25,6 +25,11 @@
p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
outstanding_blocks_warning: 100,
outstanding_blocks_critical: 250,
// Partition lag thresholds in seconds
partition_lag_warning_seconds: 300, // 5 minutes
partition_lag_critical_seconds: 900, // 15 minutes
// Regex matched against the `group` label (via =~) to select which consumer groups are monitored for partition lag
partition_lag_group_filter: 'metrics-generator|block-builder',
},
per_cluster_label: 'cluster',

View File

@ -233,3 +233,25 @@ meta.json. Repair the meta.json and then restart the ingester to successfully r
it is not able to be repaired then the block files can be simply deleted as the ingester has already started
without it. As long as the replication factor is 2 or higher, then there will be no data loss as the
same data was also written to another ingester.
## TempoPartitionLag
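This alert fires when a Kafka partition consumed by a Tempo consumer group (by default `block-builder` or `metrics-generator`) has lagged behind the latest offset for at least 5 minutes. With the default thresholds, `TempoPartitionLagWarning` fires when the lag exceeds 300 seconds (5 minutes) and `TempoPartitionLagCritical` fires when it exceeds 900 seconds (15 minutes).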
### Troubleshooting
1. Check the general health of the affected component (block-builder or metrics-generator):
- Review logs for errors or warnings related to Kafka consumption
- Check if the component is experiencing high CPU or memory usage
- Look for any unusual patterns in processing time or error rates
2. Check the health of the Kafka cluster:
- Verify broker health and connectivity
- Check if there are any network issues between Tempo and Kafka
- Examine Kafka metrics for unusual patterns (high produce rate, throttling, etc.)
3. Possible resolutions:
- Scale up the consumer group by adding more instances
- Increase resources (CPU/memory) for the consumer instances
- Check for and fix any bottlenecks in the processing pipeline
- If the lag is temporary due to a spike in traffic, monitor `tempo_ingest_group_partition_lag_seconds` for the affected group and partition to confirm that it recovers