1
0
mirror of https://github.com/grafana/tempo.git synced 2025-03-14 03:06:42 +00:00
tempo/operations/tempo-mixin/alerts.libsonnet
Joe Elliott bb7a4ea74c Rename master to main ()
* master => main

Signed-off-by: Joe Elliott <number101010@gmail.com>

* master => main

Signed-off-by: Joe Elliott <number101010@gmail.com>

* master => main

Signed-off-by: Joe Elliott <number101010@gmail.com>
2021-04-20 08:03:37 -04:00

117 lines
4.8 KiB
Jsonnet

// Prometheus alerting rules for Tempo.
//
// This object appends one rule group, `tempo_alerts`, to `prometheusAlerts.groups`
// (both fields use `+` so they extend whatever the object this is mixed into
// already defines). Three rules read their thresholds from `$._config.alerts.*`,
// which is not defined in this file and must be provided by the enclosing mixin.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'tempo_alerts',
        rules: [
          // Percentage of requests answered with a 5xx status code, computed from
          // 1m rates per (namespace, job, route). Fires when it stays above 10
          // for 15 minutes.
          // NOTE(review): the ring-health and failure-count rules below group by
          // (cluster, namespace); this rule does not include `cluster` — confirm
          // whether that is intended for multi-cluster deployments.
          {
            alert: 'TempoRequestErrors',
            expr: |||
              100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
              sum(rate(tempo_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors',
            },
          },
          // 99th percentile request latency above 3 for 15 minutes (the message
          // reports the value in seconds). Routes matching
          // `metrics|/frontend.Frontend/Process` are excluded.
          // The series name looks like a recording rule defined elsewhere in the
          // mixin — TODO confirm it is loaded alongside these alerts.
          {
            alert: 'TempoRequestLatency',
            expr: |||
              namespace_job_route:tempo_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process"} > 3
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
              |||,
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestLatency',
            },
          },
          // At least one member of the "compactor" ring has been in the
          // "Unhealthy" state for 15 minutes, per (cluster, namespace).
          {
            alert: 'TempoCompactorUnhealthy',
            expr: |||
              max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="compactor"}) > 0
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // "%.0f" rather than "%f": the value is a member count, and "%f"
              // would render it with six decimals (e.g. "1.000000").
              message: 'There are {{ printf "%.0f" $value }} unhealthy compactor(s).',
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactorUnhealthy',
            },
          },
          // Same check as above for the "distributor" ring; raised as a warning
          // rather than critical.
          {
            alert: 'TempoDistributorUnhealthy',
            expr: |||
              max by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="distributor"}) > 0
            |||,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              // "%.0f" rather than "%f": see TempoCompactorUnhealthy.
              message: 'There are {{ printf "%.0f" $value }} unhealthy distributor(s).',
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoDistributorUnhealthy',
            },
          },
          // The three rules below share one shape: the 1h increase of an error
          // counter must exceed a configured threshold AND the 5m increase must
          // be non-zero, so the condition only holds while failures are still
          // occurring. None of them sets a 'for' duration.
          {
            alert: 'TempoCompactionsFailing',
            expr: |||
              sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[1h])) > %s and
              sum by (cluster, namespace) (increase(tempodb_compaction_errors_total{}[5m])) > 0
            ||| % $._config.alerts.compactions_per_hour_failed,
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Greater than %s compactions have failed in the past hour.' % $._config.alerts.compactions_per_hour_failed,
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoCompactionsFailing',
            },
          },
          {
            alert: 'TempoFlushesFailing',
            expr: |||
              sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[1h])) > %s and
              sum by (cluster, namespace) (increase(tempo_ingester_failed_flushes_total{}[5m])) > 0
            ||| % $._config.alerts.flushes_per_hour_failed,
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Greater than %s flushes have failed in the past hour.' % $._config.alerts.flushes_per_hour_failed,
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoFlushesFailing',
            },
          },
          {
            alert: 'TempoPollsFailing',
            expr: |||
              sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[1h])) > %s and
              sum by (cluster, namespace) (increase(tempodb_blocklist_poll_errors_total{}[5m])) > 0
            ||| % $._config.alerts.polls_per_hour_failed,
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Greater than %s polls have failed in the past hour.' % $._config.alerts.polls_per_hour_failed,
              runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoPollsFailing',
            },
          },
        ],
      },
    ],
  },
}