From 32d1851ce9e0ee6acb82c1999cd4025a7f0f95b1 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Thu, 28 May 2026 07:31:41 +0200 Subject: [PATCH 1/2] feat: add playbook labels to all alerts Every alert now has a `playbook:` label pointing to the corresponding page on operations.global.cloud.sap. This enables one-click navigation from alert notifications to triage procedures. Ref: PlusOne/CPE-KVM#236 --- charts/openstack-hypervisor-operator/alerts/eviction.yaml | 3 +++ charts/openstack-hypervisor-operator/alerts/operator.yaml | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/charts/openstack-hypervisor-operator/alerts/eviction.yaml b/charts/openstack-hypervisor-operator/alerts/eviction.yaml index 127bdf1e..c8f086ba 100644 --- a/charts/openstack-hypervisor-operator/alerts/eviction.yaml +++ b/charts/openstack-hypervisor-operator/alerts/eviction.yaml @@ -11,6 +11,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/evictionfailed annotations: summary: "Eviction {{ $labels.name }} has failed" description: "The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }} has reached a terminal failure state. Manual intervention is required — check if the hypervisor exists in OpenStack." @@ -24,6 +25,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/evictionmigrationfailing annotations: summary: "Eviction {{ $labels.name }} has failing instance migrations for over 1 hour" description: "The eviction {{ $labels.name }} has had MigratingInstance=Failed for more than 1 hour while still running. Instances may be in ERROR state, blocking eviction progress." 
@@ -37,6 +39,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/evictionoutstandingram annotations: summary: "Eviction {{ $labels.name }} has outstanding RAM for over 6 hours" description: "The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM for more than 6 hours. Check for stuck live-migrations or instances that cannot be moved." diff --git a/charts/openstack-hypervisor-operator/alerts/operator.yaml b/charts/openstack-hypervisor-operator/alerts/operator.yaml index c9322326..edf2478e 100644 --- a/charts/openstack-hypervisor-operator/alerts/operator.yaml +++ b/charts/openstack-hypervisor-operator/alerts/operator.yaml @@ -11,6 +11,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisoronboardingstuck annotations: summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour" description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync." @@ -22,6 +23,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisorevictionstuck annotations: summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours" description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs." @@ -35,6 +37,7 @@ groups: labels: severity: info type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisorevictedtoolong annotations: summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days" description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning." 
@@ -50,6 +53,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisortraitsyncfailed annotations: summary: "Hypervisor {{ $labels.name }} trait sync has been failing" description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity." @@ -65,6 +69,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisoraggregatesyncfailed annotations: summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing" description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity." @@ -78,6 +83,7 @@ groups: labels: severity: warning type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisorreconcileerrors annotations: summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors" description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes." @@ -89,6 +95,7 @@ groups: labels: severity: critical type: hypervisor_operator + playbook: docs/compute/kvm/playbooks/hypervisoroperatordown annotations: summary: "Hypervisor operator is down" description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes." From 99359bb2504661b6a18597be7654b1ea51594dc8 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Thu, 28 May 2026 07:42:27 +0200 Subject: [PATCH 2/2] feat: add ServiceMonitor for operator metrics scraping Without a ServiceMonitor, Prometheus cannot scrape the operator pod. This means HypervisorOperatorDown and HypervisorOperatorReconcileErrors alerts are blind (no up metric, no reconcile_errors_total). 
Enabled by default with 60s scrape interval; rendered only when the monitoring.coreos.com/v1 API is available in the cluster. --- .../templates/servicemonitor.yaml | 27 +++++++++++++++++++ .../openstack-hypervisor-operator/values.yaml | 4 +++ 2 files changed, 31 insertions(+) create mode 100644 charts/openstack-hypervisor-operator/templates/servicemonitor.yaml diff --git a/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml b/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml new file mode 100644 index 00000000..7d019606 --- /dev/null +++ b/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors +# SPDX-License-Identifier: Apache-2.0 + +{{- /* Render only when enabled AND the Prometheus Operator API is served by the cluster; the ServiceMonitor is on by default and would otherwise make installs fail where the monitoring.coreos.com/v1 CRDs are missing. */ -}}{{- if and .Values.serviceMonitor.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "openstack-hypervisor-operator.fullname" . }} + labels: + {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }} + {{- with .Values.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + control-plane: controller-manager + {{- include "openstack-hypervisor-operator.selectorLabels" . | nindent 6 }} + endpoints: + - port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + {{- with .Values.serviceMonitor.interval }} + interval: {{ . | quote }} + {{- end }} +{{- end }} diff --git a/charts/openstack-hypervisor-operator/values.yaml b/charts/openstack-hypervisor-operator/values.yaml index d77520e2..a64814aa 100644 --- a/charts/openstack-hypervisor-operator/values.yaml +++ b/charts/openstack-hypervisor-operator/values.yaml @@ -45,6 +45,10 @@ metricsService: protocol: TCP targetPort: 8443 type: ClusterIP +serviceMonitor: + enabled: true + labels: {} + interval: 60s secret: servicePassword: "" serviceAccount: