mirror of
https://github.com/grafana/tempo.git
synced 2025-03-14 03:06:42 +00:00
[rhythm] Add partition lag alerts for Kafka consumers (#4830)
* Add partition lag alerts for Kafka consumers * Don't filter by group * fmt and compile * Add configurable group filter for partition lag alerts
This commit is contained in:
@ -8,7 +8,7 @@
|
||||
"subdir": "ksonnet-util"
|
||||
}
|
||||
},
|
||||
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
|
||||
"version": "e6f3db92b7d61f348aed812087f2865b207d7148",
|
||||
"sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0="
|
||||
},
|
||||
{
|
||||
@ -18,7 +18,7 @@
|
||||
"subdir": "memcached"
|
||||
}
|
||||
},
|
||||
"version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
|
||||
"version": "e6f3db92b7d61f348aed812087f2865b207d7148",
|
||||
"sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8="
|
||||
},
|
||||
{
|
||||
@ -28,7 +28,7 @@
|
||||
"subdir": "1.29"
|
||||
}
|
||||
},
|
||||
"version": "f8b0d65c573f3b36040258fa69e90e13e7129083",
|
||||
"version": "84aed0f9591ba86a3035e7ebe3557cb012039890",
|
||||
"sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI="
|
||||
},
|
||||
{
|
||||
|
@ -167,3 +167,21 @@
|
||||
"for": "5m"
|
||||
"labels":
|
||||
"severity": "critical"
|
||||
- "alert": "TempoPartitionLagWarning"
|
||||
"annotations":
|
||||
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
|
||||
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
|
||||
"expr": |
|
||||
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300
|
||||
"for": "5m"
|
||||
"labels":
|
||||
"severity": "warning"
|
||||
- "alert": "TempoPartitionLagCritical"
|
||||
"annotations":
|
||||
"message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
|
||||
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
|
||||
"expr": |
|
||||
max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900
|
||||
"for": "5m"
|
||||
"labels":
|
||||
"severity": "critical"
|
||||
|
@ -265,6 +265,34 @@
|
||||
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'TempoPartitionLagWarning',
|
||||
expr: |||
|
||||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
|
||||
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_warning_seconds],
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_warning_seconds, $._config.per_cluster_label],
|
||||
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'TempoPartitionLagCritical',
|
||||
expr: |||
|
||||
max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
|
||||
||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_critical_seconds],
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_critical_seconds, $._config.per_cluster_label],
|
||||
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
|
@ -25,6 +25,11 @@
|
||||
p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
|
||||
outstanding_blocks_warning: 100,
|
||||
outstanding_blocks_critical: 250,
|
||||
// Partition lag thresholds in seconds
|
||||
partition_lag_warning_seconds: 300, // 5 minutes
|
||||
partition_lag_critical_seconds: 900, // 15 minutes
|
||||
// Filter for consumer groups to monitor for partition lag
|
||||
partition_lag_group_filter: 'metrics-generator|block-builder',
|
||||
},
|
||||
|
||||
per_cluster_label: 'cluster',
|
||||
|
@ -233,3 +233,25 @@ meta.json. Repair the meta.json and then restart the ingester to successfully r
|
||||
it is not able to be repaired then the block files can be simply deleted as the ingester has already started
|
||||
without it. As long as the replication factor is 2 or higher, then there will be no data loss as the
|
||||
same data was also written to another ingester.
|
||||
|
||||
## TempoPartitionLag
|
||||
|
||||
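The `TempoPartitionLagWarning` and `TempoPartitionLagCritical` alerts fire when a Kafka partition in a consumer group has been lagging behind the latest offset by more than the configured threshold for 5 minutes. The default thresholds are 300 seconds (warning) and 900 seconds (critical), set by `partition_lag_warning_seconds` and `partition_lag_critical_seconds` in the mixin config.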
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
1. Check the general health of the affected component (block-builder or metrics-generator; the alert's `group` label names the lagging consumer group):
|
||||
- Review logs for errors or warnings related to Kafka consumption
|
||||
- Check if the component is experiencing high CPU or memory usage
|
||||
- Look for any unusual patterns in processing time or error rates
|
||||
|
||||
2. Check the health of the Kafka cluster:
|
||||
- Verify broker health and connectivity
|
||||
- Check if there are any network issues between Tempo and Kafka
|
||||
- Examine Kafka metrics for unusual patterns (high produce rate, throttling, etc.)
|
||||
|
||||
3. Possible resolutions:
|
||||
- Scale up the consumer group by adding more instances
|
||||
- Increase resources (CPU/memory) for the consumer instances
|
||||
- Check for and fix any bottlenecks in the processing pipeline
|
||||
- If the lag is temporary due to a spike in traffic, monitor to see if it recovers
|
||||
|
Reference in New Issue
Block a user