[rhythm] Add partition lag alerts for Kafka consumers (#4830)

* Add partition lag alerts for Kafka consumers * Don't filter by group * fmt and compile * Add configurable group filter for partition lag alerts
2025-03-14 03:06:42 +00:00 · 2025-03-11 12:38:08 +01:00
parent f40dd6fb82
commit 259b765971
5 changed files with 76 additions and 3 deletions
--- a/operations/jsonnet-compiled/util/jsonnetfile.lock.json
+++ b/operations/jsonnet-compiled/util/jsonnetfile.lock.json
@ -8,7 +8,7 @@
          "subdir": "ksonnet-util"
        }
      },
-      "version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
+      "version": "e6f3db92b7d61f348aed812087f2865b207d7148",
      "sum": "0y3AFX9LQSpfWTxWKSwoLgbt0Wc9nnCwhMH2szKzHv0="
    },
    {
@ -18,7 +18,7 @@
          "subdir": "memcached"
        }
      },
-      "version": "39bc80b6c67e08f6fec0d1edfdfdf908cecf66a7",
+      "version": "e6f3db92b7d61f348aed812087f2865b207d7148",
      "sum": "Cc715Y3rgTuimgDFIw+FaKzXSJGRYwt1pFTMbdrNBD8="
    },
    {
@ -28,7 +28,7 @@
          "subdir": "1.29"
        }
      },
-      "version": "f8b0d65c573f3b36040258fa69e90e13e7129083",
+      "version": "84aed0f9591ba86a3035e7ebe3557cb012039890",
      "sum": "i2w3hGbgQmaB73t5LJHSioPOVdYv8ZBvivHiDwZJVyI="
    },
    {
--- a/operations/tempo-mixin-compiled/alerts.yaml
+++ b/operations/tempo-mixin-compiled/alerts.yaml
@ -167,3 +167,21 @@
    "for": "5m"
    "labels":
      "severity": "critical"
+  - "alert": "TempoPartitionLagWarning"
+    "annotations":
+      "message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 300 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
+      "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
+    "expr": |
+      max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 300
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "TempoPartitionLagCritical"
+    "annotations":
+      "message": "Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than 900 seconds in {{ $labels.cluster }}/{{ $labels.namespace }}."
+      "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag"
+    "expr": |
+      max by (cluster, namespace, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~".*", group=~"metrics-generator|block-builder"}) > 900
+    "for": "5m"
+    "labels":
+      "severity": "critical"
--- a/operations/tempo-mixin/alerts.libsonnet
+++ b/operations/tempo-mixin/alerts.libsonnet
@ -265,6 +265,34 @@
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoIngesterReplayErrors',
            },
          },
+          {  // Warning tier: highest lag per group_by_cluster labels, group and partition exceeds partition_lag_warning_seconds.
+            alert: 'TempoPartitionLagWarning',
+            expr: |||
+              max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
+            ||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_warning_seconds],  // args in order: group-by labels, namespace regex, consumer-group regex, threshold in seconds
+            'for': '5m',  // condition must hold for 5m before the alert fires; key is quoted because `for` is a Jsonnet keyword
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_warning_seconds, $._config.per_cluster_label],  // same threshold as expr, so the text cannot drift from the rule
+              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
+            },
+          },
+          {  // Critical tier: same rule shape as TempoPartitionLagWarning, but compared against partition_lag_critical_seconds.
+            alert: 'TempoPartitionLagCritical',
+            expr: |||
+              max by (%s, group, partition) (tempo_ingest_group_partition_lag_seconds{namespace=~"%s", group=~"%s"}) > %d
+            ||| % [$._config.group_by_cluster, $._config.namespace, $._config.alerts.partition_lag_group_filter, $._config.alerts.partition_lag_critical_seconds],  // args in order: group-by labels, namespace regex, consumer-group regex, threshold in seconds
+            'for': '5m',  // condition must hold for 5m before the alert fires; key is quoted because `for` is a Jsonnet keyword
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Tempo partition {{ $labels.partition }} in consumer group {{ $labels.group }} is lagging by more than %d seconds in {{ $labels.%s }}/{{ $labels.namespace }}.' % [$._config.alerts.partition_lag_critical_seconds, $._config.per_cluster_label],  // same threshold as expr, so the text cannot drift from the rule
+              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPartitionLag',
+            },
+          },
        ],
      },
    ],
--- a/operations/tempo-mixin/config.libsonnet
+++ b/operations/tempo-mixin/config.libsonnet
@ -25,6 +25,11 @@
      p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
      outstanding_blocks_warning: 100,
      outstanding_blocks_critical: 250,
+      // Partition lag thresholds in seconds
+      partition_lag_warning_seconds: 300,  // 5 minutes
+      partition_lag_critical_seconds: 900,  // 15 minutes
+      // Regex selecting which consumer groups to monitor for partition lag (used in the alerts' group=~"..." matcher)
+      partition_lag_group_filter: 'metrics-generator|block-builder',
    },

    per_cluster_label: 'cluster',
--- a/operations/tempo-mixin/runbook.md
+++ b/operations/tempo-mixin/runbook.md
@ -233,3 +233,25 @@ meta.json.  Repair the meta.json and then restart the ingester to successfully r
 it is not able to be repaired then the block files can be simply deleted as the ingester has already started
 without it.  As long as the replication factor is 2 or higher, then there will be no data loss as the
 same data was also written to another ingester.
+
+## TempoPartitionLag
+
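+These alerts (`TempoPartitionLagWarning` and `TempoPartitionLagCritical`) fire when a Kafka partition in a monitored consumer group has stayed behind the latest offset by more than the configured threshold for at least 5 minutes. By default the threshold is 300 seconds for the warning alert and 900 seconds for the critical alert.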
+
+### Troubleshooting
+
+1. Check the general health of the affected component (block-builder or metrics-generator; the alert's `group` label identifies which one):
+   - Review logs for errors or warnings related to Kafka consumption
+   - Check if the component is experiencing high CPU or memory usage
+   - Look for any unusual patterns in processing time or error rates
+
+2. Check the health of the Kafka cluster:
+   - Verify broker health and connectivity
+   - Check if there are any network issues between Tempo and Kafka
+   - Examine Kafka metrics for unusual patterns (high produce rate, throttling, etc.)
+
+3. Possible resolutions:
+   - Scale up the consumer group by adding more instances
+   - Increase resources (CPU/memory) for the consumer instances
+   - Check for and fix any bottlenecks in the processing pipeline
+   - If the lag is temporary due to a spike in traffic, monitor to see if it recovers