Skip to content

Commit 9ef7f77

Browse files
committed
Shorter summaries and one more alert
1 parent a843f6b commit 9ef7f77

6 files changed

Lines changed: 66 additions & 38 deletions

File tree

charts/controlplane-operations/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
apiVersion: v2
22
name: controlplane-operations
3-
version: 1.1.8
3+
version: 1.1.9
44
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
55
maintainers:
66
- name: Vladimir Videlov (d051408)

charts/controlplane-operations/alerts/controlplane-backup.yaml

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,14 @@ groups:
5151
summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} failed full snapshot.
5252
{{- end }}
5353

54-
{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupSnapshotTooOld | default false) }}
55-
- alert: EtcdKCPBackupSnapshotTooOld
54+
{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupFullSnapshotTooOld | default false) }}
55+
- alert: EtcdKCPBackupFullSnapshotTooOld
5656
expr: >
5757
(
58-
(time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 24 * 3600
58+
bottomk(
59+
1,
60+
(time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 24 * 3600
61+
)
5962
)
6063
AND
6164
on()
@@ -64,11 +67,36 @@ groups:
6467
) > 2
6568
labels:
6669
{{ include "controlplane-operations.additionalRuleLabels" . }}
67-
severity: {{ dig "EtcdKCPBackupSnapshotTooOld" "severity" "info" .Values.prometheusRules }}
68-
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupSnapshotTooOld.md
69-
service: {{ dig "EtcdKCPBackupSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
70-
support_group: {{ dig "EtcdKCPBackupSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
70+
severity: {{ dig "EtcdKCPBackupFullSnapshotTooOld" "severity" "info" .Values.prometheusRules }}
71+
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupFullSnapshotTooOld.md
72+
service: {{ dig "EtcdKCPBackupFullSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
73+
support_group: {{ dig "EtcdKCPBackupFullSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
74+
annotations:
75+
description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has a full snapshot that is too old. Check pod logs and events for more details.
76+
summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} has a full snapshot that is too old.
77+
{{- end }}
78+
79+
{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupIncrSnapshotTooOld | default false) }}
80+
- alert: EtcdKCPBackupIncrSnapshotTooOld
81+
expr: >
82+
(
83+
bottomk(
84+
1,
85+
(time() - etcdbr_snapshot_latest_timestamp{kind="Incr"}) > 600
86+
)
87+
)
88+
AND
89+
on()
90+
count(
91+
(time() - etcdbr_snapshot_latest_timestamp{kind="Incr"}) > 600
92+
) > 2
93+
labels:
94+
{{ include "controlplane-operations.additionalRuleLabels" . }}
95+
severity: {{ dig "EtcdKCPBackupIncrSnapshotTooOld" "severity" "info" .Values.prometheusRules }}
96+
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupIncrSnapshotTooOld.md
97+
service: {{ dig "EtcdKCPBackupIncrSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
98+
support_group: {{ dig "EtcdKCPBackupIncrSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
7199
annotations:
72-
description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. Check pod logs and events for more details.
73-
summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} has an outdated full snapshot.
100+
description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an incremental snapshot that is too old. Check pod logs and events for more details.
101+
summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} has an incremental snapshot that is too old.
74102
{{- end }}

charts/controlplane-operations/alerts/controlplane-bond.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ groups:
1212
service: {{ dig "NodeBondDegradedMain" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
1313
support_group: {{ dig "NodeBondDegradedMain" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
1414
annotations:
15-
description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node.
16-
summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage!
15+
description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node. Node network connectivity is not HA. Switch failover or upgrade will cause an outage!
16+
summary: Bond `{{`{{ $labels.master }}`}}` is degraded.
1717
{{- end }}
1818

1919
{{- if not (.Values.prometheusRules.disabled.NodeVirtualInterfaceDown | default false) }}
@@ -27,6 +27,6 @@ groups:
2727
service: {{ dig "NodeVirtualInterfaceDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
2828
support_group: {{ dig "NodeVirtualInterfaceDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
2929
annotations:
30-
description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node.
31-
summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded.
30+
description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node. Node network connectivity is degraded.
31+
summary: Interface `{{`{{ $labels.device }}`}}` is down.
3232
{{- end }}

charts/controlplane-operations/alerts/controlplane-gardener.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ groups:
4545
support_group: "{{`{{ $labels.support_group }}`}}"
4646
annotations:
4747
description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details.
48-
summary: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully.
48+
summary: Shoot {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully.
4949
{{- end }}
5050

5151
{{- if not (.Values.prometheusRules.disabled.ShootConditionNotTrue | default false) }}
@@ -74,7 +74,7 @@ groups:
7474
support_group: "{{`{{ $labels.support_group }}`}}"
7575
annotations:
7676
description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details.
77-
summary: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True.
77+
summary: Shoot {{`{{ $labels.name }}`}} seeded from {{`{{ $labels.seed }}`}} has a condition that is not True.
7878
{{- end }}
7979

8080
{{- if not (.Values.prometheusRules.disabled.SeedConditionNotTrue | default false) }}
@@ -149,7 +149,7 @@ groups:
149149
support_group: "{{`{{ $labels.support_group }}`}}"
150150
annotations:
151151
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
152-
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
152+
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
153153
{{- end }}
154154

155155
{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }}
@@ -177,7 +177,7 @@ groups:
177177
support_group: "{{`{{ $labels.support_group }}`}}"
178178
annotations:
179179
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}}/{{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
180-
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}}/{{`{{ $labels.node }}`}} has no BGP neighbors.
180+
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{trimPrefix "shoot--cp--" "`{{ $labels.cluster }}`"}} has no BGP neighbors.
181181
{{- end }}
182182

183183
{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }}
@@ -253,8 +253,8 @@ groups:
253253
service: gardener
254254
support_group: "{{`{{ $labels.support_group }}`}}"
255255
annotations:
256-
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready. Check the Machine's conditions and events for more details.
257-
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready.
256+
description: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready. Check the Machine's conditions and events for more details.
257+
summary: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
258258
{{- end }}
259259

260260
{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInTerminating | default false) }}
@@ -282,8 +282,8 @@ groups:
282282
service: gardener
283283
support_group: "{{`{{ $labels.support_group }}`}}"
284284
annotations:
285-
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state. Check the Machine's conditions and events for more details.
286-
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state.
285+
description: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state. Check the Machine's conditions and events for more details.
286+
summary: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} is stuck in Terminating state.
287287
{{- end }}
288288

289289
{{- if not (.Values.prometheusRules.disabled.MCMMachineFailed | default false) }}
@@ -311,8 +311,8 @@ groups:
311311
service: gardener
312312
support_group: "{{`{{ $labels.support_group }}`}}"
313313
annotations:
314-
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state. Check the Machine's conditions and events for more details.
315-
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state.
314+
description: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state. Check the Machine's conditions and events for more details.
315+
summary: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} is in Failed state.
316316
{{- end }}
317317

318318
{{- if not (.Values.prometheusRules.disabled.MCMMachineCrashLoopBackOff | default false) }}
@@ -340,8 +340,8 @@ groups:
340340
service: gardener
341341
support_group: "{{`{{ $labels.support_group }}`}}"
342342
annotations:
343-
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state. Check the Machine's conditions and events for more details.
344-
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state.
343+
description: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state. Check the Machine's conditions and events for more details.
344+
summary: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} is in CrashLoopBackOff state.
345345
{{- end }}
346346

347347
{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInPending | default false) }}
@@ -369,6 +369,6 @@ groups:
369369
service: gardener
370370
support_group: "{{`{{ $labels.support_group }}`}}"
371371
annotations:
372-
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state. Check the Machine's conditions and events for more details.
373-
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state.
372+
description: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state. Check the Machine's conditions and events for more details.
373+
summary: Machine {{`{{ $labels.name }}`}} from Shoot {{`{{ $labels.shoot_name }}`}} is stuck in Pending state.
374374
{{- end }}

charts/controlplane-operations/alerts/controlplane-remote.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ groups:
1414
service: {{ dig "ArgoraUpdateInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
1515
support_group: {{ dig "ArgoraUpdateInError" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
1616
annotations:
17-
description: "Argora Update CR status is in Error state for more than 10 minutes."
18-
summary: "Update CR in Error state."
17+
description: Argora Update CR status is in Error state for more than {{ dig "ArgoraUpdateInError" "for" "10m" .Values.prometheusRules }}.
18+
summary: Update CR in Error state.
1919
{{- end }}
2020

2121
{{- if not (.Values.prometheusRules.disabled.ArgoraClusterImportInError | default false) }}
@@ -31,8 +31,8 @@ groups:
3131
service: {{ dig "ArgoraClusterImportInError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
3232
support_group: {{ dig "ArgoraClusterImportInError" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
3333
annotations:
34-
description: "Argora ClusterImport CR status is in Error state for more than 10 minutes."
35-
summary: "ClusterImport CR in Error state."
34+
description: Argora ClusterImport CR status is in Error state for more than {{ dig "ArgoraClusterImportInError" "for" "10m" .Values.prometheusRules }}.
35+
summary: ClusterImport CR in Error state.
3636
{{- end }}
3737

3838
{{- if not (.Values.prometheusRules.disabled.ArgoraPodNotReadyError | default false) }}
@@ -47,8 +47,8 @@ groups:
4747
service: {{ dig "ArgoraPodNotReadyError" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
4848
support_group: {{ dig "ArgoraPodNotReadyError" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
4949
annotations:
50-
description: "Argora Pod is not ready for more than 5 minutes."
51-
summary: "Pod not ready."
50+
description: Argora Pod is not ready for more than {{ dig "ArgoraPodNotReadyError" "for" "5m" .Values.prometheusRules }}.
51+
summary: Pod not ready.
5252
{{- end }}
5353

5454
{{- if not (.Values.prometheusRules.disabled.ServerStuckInDiscovery | default false) }}
@@ -64,6 +64,6 @@ groups:
6464
service: {{ dig "ServerStuckInDiscovery" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
6565
support_group: {{ dig "ServerStuckInDiscovery" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
6666
annotations:
67-
description: "Server is stuck in Discovery for more than 15 minutes."
68-
summary: "Server stuck in Discovery."
67+
description: Server is stuck in Discovery for more than {{ dig "ServerStuckInDiscovery" "for" "15m" .Values.prometheusRules }}.
68+
summary: Server stuck in Discovery.
6969
{{- end }}

charts/controlplane-operations/plugindefinition.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ kind: PluginDefinition
33
metadata:
44
name: controlplane-operations
55
spec:
6-
version: 1.1.8
6+
version: 1.1.9
77
displayName: Controlplane operations bundle
88
description: Operations bundle for Controlplane clusters
99
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
1010
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
1111
helmChart:
1212
name: controlplane-operations
1313
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
14-
version: 1.1.8
14+
version: 1.1.9
1515
options:
1616
- name: prometheusRules.create
1717
description: Create Prometheus rules

0 commit comments

Comments
 (0)