From 259b765971113caeb0375fd04bbd5c8d56aa1fdc Mon Sep 17 00:00:00 2001 From: Mario Date: Tue, 11 Mar 2025 12:38:08 +0100 Subject: [PATCH] [rhythm] Add partition lag alerts for Kafka consumers (#4830) * Add partition lag alerts for Kafka consumers * Dont filter by group * fmt and compile * Add configurable group filter for partition lag alerts --- .../util/jsonnetfile.lock.json | 6 ++-- operations/tempo-mixin-compiled/alerts.yaml | 18 ++++++++++++ operations/tempo-mixin/alerts.libsonnet | 28 +++++++++++++++++++ operations/tempo-mixin/config.libsonnet | 5 ++++ operations/tempo-mixin/runbook.md | 22 +++++++++++++++ 5 files changed, 76 insertions(+), 3 deletions(-) diff --git a/operations/jsonnet-compiled/util/jsonnetfile.lock.json b/operations/jsonnet-compiled/util/jsonnetfile.lock.json index 8040e1261..5e40e1942 100644 --- a/operations/jsonnet-compiled/util/jsonnetfile.lock.json +++ b/operations/jsonnet-compiled/util/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "ksonnet-util" } }, - "version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7", + "version": "e6f3db92b7d61f348aed812087f2865b207d7148", "sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0=" }, { @@ -18,7 +18,7 @@ "subdir": "memcached" } }, - "version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7", + "version": "e6f3db92b7d61f348aed812087f2865b207d7148", "sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8=" }, { @@ -28,7 +28,7 @@ "subdir": "1.29" } }, - "version": "f8b0d65c573f3b36040258fa69e90e13e7129083", + "version": "84aed0f9591ba86a3035e7ebe3557cb012039890", "sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI=" }, { diff --git a/operations/tempo-mixin-compiled/alerts.yaml b/operations/tempo-mixin-compiled/alerts.yaml index 6ab3e984a..a0d41eb73 100644 --- a/operations/tempo-mixin-compiled/alerts.yaml +++ b/operations/tempo-mixin-compiled/alerts.yaml @@ -167,3 +167,21 @@ "for": "5m" "labels": "severity": "critical" + - "alert": "TempoPartitionLagWarning" + "annotations": + "message": "Tempo partition {{ 
$labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}." + "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag" + "expr": | + max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300 + "for": "5m" + "labels": + "severity": "warning" + - "alert": "TempoPartitionLagCritical" + "annotations": + "message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}." + "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag" + "expr": | + max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900 + "for": "5m" + "labels": + "severity": "critical" diff --git a/operations/tempo-mixin/alerts.libsonnet b/operations/tempo-mixin/alerts.libsonnet index 59e35d8b1..2919c40b5 100644 --- a/operations/tempo-mixin/alerts.libsonnet +++ b/operations/tempo-mixin/alerts.libsonnet @@ -265,6 +265,34 @@ runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors', }, }, + { + alert: 'TempoPartitionLagWarning', + expr: ||| + max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d + ||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_warning_seconds], + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' 
% [$._config.alerts.partition_lag_warning_seconds, $._config.per_cluster_label], + runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag', + }, + }, + { + alert: 'TempoPartitionLagCritical', + expr: ||| + max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d + ||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_critical_seconds], + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_critical_seconds, $._config.per_cluster_label], + runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag', + }, + }, ], }, ], diff --git a/operations/tempo-mixin/config.libsonnet b/operations/tempo-mixin/config.libsonnet index e13e164f7..f8db65d81 100644 --- a/operations/tempo-mixin/config.libsonnet +++ b/operations/tempo-mixin/config.libsonnet @@ -25,6 +25,11 @@ p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof', outstanding_blocks_warning: 100, outstanding_blocks_critical: 250, + // Partition lag thresholds in seconds + partition_lag_warning_seconds: 300, // 5 minutes + partition_lag_critical_seconds: 900, // 15 minutes + // Filter for consumer groups to monitor for partition lag + partition_lag_group_filter: 'metrics-generator|block-builder', }, per_cluster_label: 'cluster', diff --git a/operations/tempo-mixin/runbook.md b/operations/tempo-mixin/runbook.md index 5d66f60c7..667517947 100644 --- a/operations/tempo-mixin/runbook.md +++ b/operations/tempo-mixin/runbook.md @@ -233,3 +233,25 @@ meta.json. 
Repair the meta.json and then restart the ingester to successfully r it is not able to be repaired then the block files can be simply deleted as the ingester has already started without it. As long as the replication factor is 2 or higher, then there will be no data loss as the same data was also written to another ingester. + +## TempoPartitionLag + +This alert fires when a Kafka partition in a consumer group is lagging behind the latest offset by a significant amount of time. By default, `TempoPartitionLagWarning` fires when `tempo_ingest_group_partition_lag_seconds` stays above 300 seconds for 5 minutes, and `TempoPartitionLagCritical` fires when it stays above 900 seconds for 5 minutes; the thresholds and the monitored consumer groups are set by `partition_lag_warning_seconds`, `partition_lag_critical_seconds` and `partition_lag_group_filter` in `config.libsonnet`. + +### Troubleshooting + +1. Check the general health of the affected component (block-builder or metrics-generator): + - Review logs for errors or warnings related to Kafka consumption + - Check if the component is experiencing high CPU or memory usage + - Look for any unusual patterns in processing time or error rates + +2. Check the health of the Kafka cluster: + - Verify broker health and connectivity + - Check if there are any network issues between Tempo and Kafka + - Examine Kafka metrics for unusual patterns (high produce rate, throttling, etc.) + +3. Possible resolutions: + - Scale up the consumer group by adding more instances + - Increase resources (CPU/memory) for the consumer instances + - Check for and fix any bottlenecks in the processing pipeline + - If the lag is temporary due to a spike in traffic, monitor to see if it recovers