From 6771b30e17bd409c263519a76a58b006692b8090 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 11:27:28 +0200
Subject: [PATCH 1/6] Scaffold cortex placement api shim
---
.github/workflows/push-charts.yaml | 19 -
.github/workflows/push-images.yaml | 45 ++
.github/workflows/update-appversion.yml | 21 +
.gitignore | 1 +
AGENTS.md | 3 +-
Dockerfile | 8 +-
Tiltfile | 24 +-
cmd/{ => manager}/main.go | 0
cmd/shim/main.go | 9 +
helm/bundles/cortex-placement-shim/Chart.yaml | 20 +
.../alerts/placement-shim.alerts.yaml | 734 ++++++++++++++++++
.../templates/alerts.yaml | 17 +
.../templates/clusterrole.yaml | 23 +
.../templates/clusterrolebinding.yaml | 14 +
.../bundles/cortex-placement-shim/values.yaml | 23 +
helm/library/cortex-shim/Chart.lock | 6 +
helm/library/cortex-shim/Chart.yaml | 8 +
.../cortex-shim/templates/_helpers.tpl | 50 ++
.../cortex-shim/templates/clusterrole.yaml | 100 +++
.../templates/clusterrolebinding.yaml | 34 +
.../cortex-shim/templates/deployment.yaml | 112 +++
.../cortex-shim/templates/service.yaml | 33 +
.../cortex-shim/templates/serviceaccount.yaml | 15 +
.../cortex-shim/templates/servicemonitor.yaml | 16 +
helm/library/cortex-shim/values.yaml | 68 ++
.../cortex/templates/manager/manager.yaml | 2 +-
internal/shim/placement/.gitkeep | 0
27 files changed, 1379 insertions(+), 26 deletions(-)
rename cmd/{ => manager}/main.go (100%)
create mode 100644 cmd/shim/main.go
create mode 100644 helm/bundles/cortex-placement-shim/Chart.yaml
create mode 100644 helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
create mode 100644 helm/bundles/cortex-placement-shim/templates/alerts.yaml
create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrole.yaml
create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml
create mode 100644 helm/bundles/cortex-placement-shim/values.yaml
create mode 100644 helm/library/cortex-shim/Chart.lock
create mode 100644 helm/library/cortex-shim/Chart.yaml
create mode 100644 helm/library/cortex-shim/templates/_helpers.tpl
create mode 100644 helm/library/cortex-shim/templates/clusterrole.yaml
create mode 100644 helm/library/cortex-shim/templates/clusterrolebinding.yaml
create mode 100644 helm/library/cortex-shim/templates/deployment.yaml
create mode 100644 helm/library/cortex-shim/templates/service.yaml
create mode 100644 helm/library/cortex-shim/templates/serviceaccount.yaml
create mode 100644 helm/library/cortex-shim/templates/servicemonitor.yaml
create mode 100644 helm/library/cortex-shim/values.yaml
create mode 100644 internal/shim/placement/.gitkeep
diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml
index 2e3577275..a4559d15a 100644
--- a/.github/workflows/push-charts.yaml
+++ b/.github/workflows/push-charts.yaml
@@ -27,25 +27,6 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- - name: Get all changed helm/library/cortex Chart.yaml files
- id: changed-chart-yaml-files-core
- uses: tj-actions/changed-files@v47
- with:
- files: |
- helm/library/cortex/Chart.yaml
- - name: Push cortex core charts to registry
- if: steps.changed-chart-yaml-files-core.outputs.all_changed_files != ''
- shell: bash
- env:
- ALL_CHANGED_FILES: ${{ steps.changed-chart-yaml-files-core.outputs.all_changed_files }}
- run: |
- for CHART_FILE in ${ALL_CHANGED_FILES}; do
- CHART_DIR=$(dirname $CHART_FILE)
- helm package $CHART_DIR --dependency-update --destination $CHART_DIR
- CHART_PACKAGE=$(ls $CHART_DIR/*.tgz)
- helm push $CHART_PACKAGE oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/
- done
-
- name: Get all changed library Chart.yaml files
id: changed-chart-yaml-files-library
uses: tj-actions/changed-files@v47
diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml
index 997595976..3085b503b 100644
--- a/.github/workflows/push-images.yaml
+++ b/.github/workflows/push-images.yaml
@@ -72,6 +72,50 @@ jobs:
subject-digest: ${{ steps.push_cortex_postgres.outputs.digest }}
push-to-registry: true
+ # Only build and push the cortex-shim image if there are changes related
+ # to the cortex shims (e.g., in cmd/shim or internal/shim).
+ - name: Get all changed shim/ files
+ id: changed_shim_files
+ uses: tj-actions/changed-files@v47
+ with:
+ files: |
+ cmd/shim/**
+ internal/shim/**
+ - name: Docker Meta (Cortex Shim)
+ if: steps.changed_shim_files.outputs.all_changed_files != ''
+ id: meta_cortex_shim
+ uses: docker/metadata-action@v6
+ with:
+ images: ${{ env.REGISTRY }}/${{ github.repository }}-shim
+ tags: |
+ type=semver,pattern={{version}}
+ type=semver,pattern={{major}}.{{minor}}
+ type=sha
+ type=raw,value=latest
+ env:
+ DOCKER_METADATA_SHORT_SHA_LENGTH: 8
+ - name: Build and Push Cortex Shim
+ if: steps.changed_shim_files.outputs.all_changed_files != ''
+ id: push_cortex_shim
+ uses: docker/build-push-action@v7
+ with:
+ context: .
+ platforms: linux/amd64,linux/arm64
+ push: true
+ tags: ${{ steps.meta_cortex_shim.outputs.tags }}
+ labels: ${{ steps.meta_cortex_shim.outputs.labels }}
+ build-args: |
+ GIT_TAG=${{ github.ref_name }}
+ GIT_COMMIT=${{ github.sha }}
+ GOMAIN=cmd/shim/main.go
+ - name: Generate Artifact Attestation for Cortex Shim
+ if: steps.changed_shim_files.outputs.all_changed_files != ''
+ uses: actions/attest-build-provenance@v4
+ with:
+ subject-name: ${{ env.REGISTRY }}/${{ github.repository }}-shim
+ subject-digest: ${{ steps.push_cortex_shim.outputs.digest }}
+ push-to-registry: true
+
# Build & push new cortex image
- name: Docker Meta (Cortex)
id: meta_cortex
@@ -98,6 +142,7 @@ jobs:
build-args: |
GIT_TAG=${{ github.ref_name }}
GIT_COMMIT=${{ github.sha }}
+ GOMAIN=cmd/manager/main.go
- name: Generate Artifact Attestation for Cortex
uses: actions/attest-build-provenance@v4
with:
diff --git a/.github/workflows/update-appversion.yml b/.github/workflows/update-appversion.yml
index cc5ccdc9f..20087fa80 100644
--- a/.github/workflows/update-appversion.yml
+++ b/.github/workflows/update-appversion.yml
@@ -44,6 +44,27 @@ jobs:
git commit -m "Bump cortex-postgres chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit"
git push origin HEAD:main
+ # Only bump if there are changes in shim-related directories
+ - name: Get all changed shim files
+ id: changed_shim_files
+ uses: tj-actions/changed-files@v47
+ with:
+ files: |
+ internal/shim/**
+ cmd/shim/**
+ - name: Update appVersion in cortex-shim Chart.yaml
+ if: steps.changed_shim_files.outputs.all_changed_files != ''
+ run: |
+ sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-shim/Chart.yaml
+ - name: Commit and push changes for cortex-shim
+ if: steps.changed_shim_files.outputs.all_changed_files != ''
+ run: |
+ git config user.name "github-actions[bot]"
+ git config user.email "github-actions[bot]@users.noreply.github.com"
+ git add helm/library/cortex-shim/Chart.yaml
+ git commit -m "Bump cortex-shim chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit"
+ git push origin HEAD:main
+
- name: Update appVersion in helm/library/cortex/Chart.yaml
run: |
sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex/Chart.yaml
diff --git a/.gitignore b/.gitignore
index 04bac2d09..7e21248bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ cortex.secrets.yaml
!.editorconfig
!.gitignore
!.github
+!.gitkeep
!.golangci.yaml
!.license-scan-overrides.jsonl
!.license-scan-rules.json
diff --git a/AGENTS.md b/AGENTS.md
index 6f2e12a17..59747bd8c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -50,7 +50,8 @@ Helm charts:
## Repository Structure
Code:
-- `cmd/main.go` is the entry point for the manager, which starts the controllers and webhooks
+- `cmd/manager/main.go` is the entry point for the manager, which starts the controllers and webhooks
+- `cmd/shim/main.go` is the entry point for cortex shims exposing cortex capabilities over REST endpoints
- `api/v1alpha1` is where the CRD specs of cortex lives
- `api/external` contains messages sent to cortex via http from external openstack services
- `internal/scheduling` contains the logic for scheduling in different cloud domains
diff --git a/Dockerfile b/Dockerfile
index 6f7e79bea..2580e9637 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,8 @@ ARG TARGETARCH
ARG GO_MOD_PATH=.
ARG GOCACHE=/root/.cache/go-build
ENV GOCACHE=${GOCACHE}
+ARG GOMAIN=cmd/manager/main.go
+ENV GOMAIN=${GOMAIN}
# Note: avoid using COPY to /lib which will lead to docker build errors.
WORKDIR /workspace/${GO_MOD_PATH}
@@ -29,13 +31,13 @@ ENV GOOS=${TARGETOS:-linux}
ENV GOARCH=${TARGETARCH}
RUN --mount=type=cache,target=/go/pkg/mod/ \
--mount=type=cache,target=${GOCACHE} \
- go build -a -o /manager cmd/main.go
+ go build -a -o /main ${GOMAIN}
# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
FROM gcr.io/distroless/static:nonroot
WORKDIR /
-COPY --from=builder /manager .
+COPY --from=builder /main .
USER 65532:65532
-ENTRYPOINT ["/manager"]
+ENTRYPOINT ["/main"]
diff --git a/Tiltfile b/Tiltfile
index 6871d18b3..bc87f4d30 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -7,7 +7,10 @@
analytics_settings(False)
# Use the ACTIVE_DEPLOYMENTS env var to select which Cortex bundles to deploy.
-ACTIVE_DEPLOYMENTS_ENV = os.getenv('ACTIVE_DEPLOYMENTS', 'nova,manila,cinder,ironcore,pods')
+ACTIVE_DEPLOYMENTS_ENV = os.getenv(
+ 'ACTIVE_DEPLOYMENTS',
+ 'nova,manila,cinder,ironcore,pods,placement',
+)
if ACTIVE_DEPLOYMENTS_ENV == "":
ACTIVE_DEPLOYMENTS = [] # Catch "".split(",") = [""]
else:
@@ -78,13 +81,22 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen
url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml'
local('curl -L ' + url + ' | kubectl apply -f -')
-########### Cortex Operator & CRDs
+########### Cortex Manager & CRDs
docker_build('ghcr.io/cobaltcore-dev/cortex', '.',
dockerfile='Dockerfile',
+ build_args={'GOMAIN': 'cmd/manager/main.go'},
only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'],
)
local('sh helm/sync.sh helm/library/cortex')
+########### Cortex Shim
+docker_build('ghcr.io/cobaltcore-dev/cortex-shim', '.',
+ dockerfile='Dockerfile',
+ build_args={'GOMAIN': 'cmd/shim/main.go'},
+ only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'],
+)
+local('sh helm/sync.sh helm/library/cortex-shim')
+
########### Cortex Bundles
docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres')
@@ -98,6 +110,7 @@ bundle_charts = [
('helm/bundles/cortex-cinder', 'cortex-cinder'),
('helm/bundles/cortex-ironcore', 'cortex-ironcore'),
('helm/bundles/cortex-pods', 'cortex-pods'),
+ ('helm/bundles/cortex-placement-shim', 'cortex-placement-shim'),
]
dep_charts = {
'cortex-crds': [
@@ -123,6 +136,9 @@ dep_charts = {
('helm/library/cortex-postgres', 'cortex-postgres'),
('helm/library/cortex', 'cortex'),
],
+ 'cortex-placement-shim': [
+ ('helm/library/cortex-shim', 'cortex-shim'),
+ ],
}
for (bundle_chart_path, bundle_chart_name) in bundle_charts:
@@ -255,6 +271,10 @@ if 'pods' in ACTIVE_DEPLOYMENTS:
k8s_yaml('samples/pods/pod.yaml')
k8s_resource('test-pod', labels=['Cortex-Pods'])
+if 'placement' in ACTIVE_DEPLOYMENTS:
+ print("Activating Cortex Placement Shim bundle")
+ k8s_yaml(helm('./helm/bundles/cortex-placement-shim', name='cortex-placement-shim', values=tilt_values, set=env_set_overrides))
+
########### Dev Dependencies
local('sh helm/sync.sh helm/dev/cortex-prometheus-operator')
k8s_yaml(helm('./helm/dev/cortex-prometheus-operator', name='cortex-prometheus-operator')) # Operator
diff --git a/cmd/main.go b/cmd/manager/main.go
similarity index 100%
rename from cmd/main.go
rename to cmd/manager/main.go
diff --git a/cmd/shim/main.go b/cmd/shim/main.go
new file mode 100644
index 000000000..6b0634229
--- /dev/null
+++ b/cmd/shim/main.go
@@ -0,0 +1,9 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package main
+
+func main() {
+ // TODO: this needs scaffolding, for now it just does nothing.
+ select {}
+}
diff --git a/helm/bundles/cortex-placement-shim/Chart.yaml b/helm/bundles/cortex-placement-shim/Chart.yaml
new file mode 100644
index 000000000..7f53ed347
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/Chart.yaml
@@ -0,0 +1,20 @@
+# Copyright SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v2
+name: cortex-placement-shim
+description: A Helm chart deploying the Cortex placement shim.
+type: application
+version: 0.0.1
+appVersion: 0.1.0
+dependencies:
+ # from: file://../../library/cortex-shim
+ - name: cortex-shim
+ repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
+ version: 0.0.1
+ # Owner info adds a configmap to the kubernetes cluster with information on
+ # the service owner. This makes it easier to find out who to contact in case
+ # of issues. See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info
+ - name: owner-info
+ repository: oci://ghcr.io/sapcc/helm-charts
+ version: 1.0.0
diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
new file mode 100644
index 000000000..41bf29794
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
@@ -0,0 +1,734 @@
+groups:
+- name: cortex-placement-shim-alerts
+ rules:
+ - alert: CortexNovaSchedulingDown
+ expr: |
+ up{pod=~"cortex-nova-scheduling-.*"} != 1 or
+ absent(up{pod=~"cortex-nova-scheduling-.*"})
+ for: 5m
+ labels:
+ context: liveness
+ dashboard: cortex/cortex
+ service: cortex
+ severity: critical
+ support_group: workload-management
+ playbook: docs/support/playbook/cortex/down
+ annotations:
+ summary: "Cortex Scheduling for Nova is down"
+ description: >
+ The Cortex scheduling service is down. Scheduling requests from Nova will
+ not be served. This is non-critical for vmware virtual machines, but
+ blocks kvm virtual machines from being scheduled. Thus, it is
+ recommended to immediately investigate and resolve the issue.
+
+ - alert: CortexNovaKnowledgeDown
+ expr: |
+ up{pod=~"cortex-nova-knowledge-.*"} != 1 or
+ absent(up{pod=~"cortex-nova-knowledge-.*"})
+ for: 5m
+ labels:
+ context: liveness
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ playbook: docs/support/playbook/cortex/down
+ annotations:
+ summary: "Cortex Knowledge for Nova is down"
+ description: >
+ The Cortex Knowledge service is down. This is no immediate problem,
+ since cortex is still able to process requests,
+ but the quality of the responses may be affected.
+
+ - alert: CortexNovaDeschedulerPipelineErroring
+ expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0
+ for: 5m
+ labels:
+ context: descheduler
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Descheduler pipeline is erroring."
+ description: >
+ The Cortex descheduler pipeline is encountering errors during its execution.
+ This may indicate issues with the descheduling logic or the underlying infrastructure.
+ It is recommended to investigate the descheduler logs and the state of the VMs being processed.
+
+ - alert: CortexNovaHttpRequest400sTooHigh
+ expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Nova Scheduler HTTP request 400 errors too high"
+ description: >
+ Nova Scheduler is responding to placement requests with HTTP 4xx
+ errors. This is expected when the scheduling request cannot be served
+ by Cortex. However, it could also indicate that the request format has
+ changed and Cortex is unable to parse it.
+
+ - alert: CortexNovaSchedulingHttpRequest500sTooHigh
+ expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1
+ for: 5m
+ labels:
+ context: api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Nova Scheduler HTTP request 500 errors too high"
+ description: >
+ Nova Scheduler is responding to placement requests with HTTP 5xx errors.
+ This is not expected and indicates that Cortex is having some internal problem.
+ Nova will continue to place new VMs, but the placement will be less desirable.
+ Thus, no immediate action is needed.
+
+ - alert: CortexNovaHighMemoryUsage
+ expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024
+ for: 5m
+ labels:
+ context: memory
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "`{{$labels.component}}` uses too much memory"
+ description: >
+ `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it
+ should use much less, so there may be a memory leak or other changes
+ that are causing the memory usage to increase significantly.
+
+ - alert: CortexNovaHighCPUUsage
+ expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5
+ for: 5m
+ labels:
+ context: cpu
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "`{{$labels.component}}` uses too much CPU"
+ description: >
+ `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually
+ it should use much less, so there may be a CPU leak or other changes
+ that are causing the CPU usage to increase significantly.
+
+ - alert: CortexNovaTooManyDBConnectionAttempts
+ expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: db
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "`{{$labels.component}}` is trying to connect to the database too often"
+ description: >
+ `{{$labels.component}}` is trying to connect to the database too often. This may happen
+ when the database is down or the connection parameters are misconfigured.
+
+ - alert: CortexNovaSyncNotSuccessful
+ expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0
+ for: 5m
+ labels:
+ context: syncstatus
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "`{{$labels.component}}` Sync not successful"
+ description: >
+ `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may
+ happen when the datasource (OpenStack, Prometheus, etc.) is down or
+ the sync module is misconfigured. No immediate action is needed, since
+ the sync module will retry the sync operation and the currently synced
+ data will be kept. However, when this problem persists for a longer
+ time the service will have a less recent view of the datacenter.
+
+ - alert: CortexNovaSyncObjectsDroppedToZero
+ expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0
+ for: 60m
+ labels:
+ context: syncobjects
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`"
+ description: >
+ `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen
+ when the datasource (OpenStack, Prometheus, etc.) is down or the sync
+ module is misconfigured. No immediate action is needed, since the sync
+ module will retry the sync operation and the currently synced data will
+ be kept. However, when this problem persists for a longer time the
+ service will have a less recent view of the datacenter.
+
+ - alert: CortexNovaDatasourceUnready
+ expr: cortex_datasource_state{domain="nova",state!="ready"} != 0
+ for: 60m
+ labels:
+ context: datasources
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state"
+ description: >
+ This may indicate issues with the datasource
+ connectivity or configuration. It is recommended to investigate the
+ datasource status and logs for more details.
+
+ - alert: CortexNovaKnowledgeUnready
+ expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0
+ for: 60m
+ labels:
+ context: knowledge
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
+ description: >
+ This may indicate issues with the knowledge
+ configuration. It is recommended to investigate the
+ knowledge status and logs for more details.
+
+ - alert: CortexNovaDecisionsWithErrors
+ expr: cortex_decision_state{domain="nova",state="error"} > 0
+ for: 5m
+ labels:
+ context: decisions
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Some decisions are in error state for operator `{{$labels.operator}}`"
+ description: >
+ The cortex scheduling pipeline generated decisions that are in error state.
+ This may indicate issues with the decision logic or the underlying infrastructure.
+ It is recommended to investigate the decision logs and the state of the
+ VMs being processed.
+
+ - alert: CortexNovaTooManyDecisionsWaiting
+ expr: cortex_decision_state{domain="nova",state="waiting"} > 10
+ for: 5m
+ labels:
+ context: decisions
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`"
+ description: >
+ The cortex scheduling pipeline has a high number of decisions for which
+ no target host has been assigned yet.
+
+ This may indicate a backlog in processing or issues with the decision logic.
+ It is recommended to investigate the decision logs and the state of the
+ VMs being processed.
+
+ - alert: CortexNovaKPIUnready
+ expr: |
+ cortex_kpi_state{domain="nova",state!="ready"} != 0
+ for: 60m
+ labels:
+ context: kpis
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state"
+ description: >
+ This may indicate issues with the KPI
+ configuration. It is recommended to investigate the
+ KPI status and logs for more details.
+
+ - alert: CortexNovaPipelineUnready
+ expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0
+ for: 5m
+ labels:
+ context: pipelines
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state"
+ description: >
+ This may indicate issues with the pipeline
+ configuration. It is recommended to investigate the
+ pipeline status and logs for more details.
+
+ # Committed Resource Info API Alerts
+ - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh
+ expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource info API HTTP 500 errors too high"
+ description: >
+ The committed resource info API (Limes LIQUID integration) is responding
+ with HTTP 5xx errors. This indicates internal problems building service info,
+ such as invalid flavor group data. Limes will not be able to discover available
+ resources until the issue is resolved.
+
+ # Committed Resource Change API Alerts
+ - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
+ expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource change API HTTP 400 errors too high"
+ description: >
+ The committed resource change API (Limes LIQUID integration) is responding
+ with HTTP 4xx errors. This may happen when Limes sends a request with
+ an outdated info version (409), the API is temporarily unavailable,
+ or the request format is invalid. Limes will typically retry these
+ requests, so no immediate action is needed unless the errors persist.
+
+ - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
+ expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource change API HTTP 500 errors too high"
+ description: >
+ The committed resource change API (Limes LIQUID integration) is responding
+ with HTTP 5xx errors. This is not expected and indicates that Cortex
+ is having an internal problem processing commitment changes. Limes will
+ continue to retry, but new commitments may not be fulfilled until the
+ issue is resolved.
+
+ - alert: CortexNovaCommittedResourceLatencyTooHigh
+ expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource change API latency too high"
+ description: >
+ The committed resource change API (Limes LIQUID integration) is experiencing
+ high latency (p95 > 30s). This may indicate that the scheduling pipeline
+ is under heavy load or that reservation scheduling is taking longer than
+ expected. Limes requests may time out, causing commitment changes to fail.
+
+ - alert: CortexNovaCommittedResourceRejectionRateTooHigh
+ expr: |
+ sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m]))
+ / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource rejection rate too high"
+ description: >
+ More than 50% of commitment change requests are being rejected.
+ This may indicate insufficient capacity in the datacenter to fulfill
+ new commitments, or issues with the commitment scheduling logic.
+ Rejected commitments are rolled back, so Limes will see them as failed
+ and may retry or report the failure to users.
+
+ - alert: CortexNovaCommittedResourceTimeoutsTooHigh
+ expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource change API timeouts too high"
+ description: >
+ The committed resource change API (Limes LIQUID integration) timed out
+ while waiting for reservations to become ready. This indicates that the
+ scheduling pipeline is overloaded or reservations are taking too long
+ to be scheduled. Affected commitment changes are rolled back and Limes
+ will see them as failed. Consider investigating the scheduler performance
+ or increasing the timeout configuration.
+
+ # Committed Resource Usage API Alerts
+ - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
+ expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource usage API HTTP 400 errors too high"
+ description: >
+ The committed resource usage API (Limes LIQUID integration) is responding
+ with HTTP 4xx errors. This may indicate invalid project IDs or malformed
+ requests from Limes. Limes will typically retry these requests.
+
+ - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
+ expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource usage API HTTP 500 errors too high"
+ description: >
+ The committed resource usage API (Limes LIQUID integration) is responding
+ with HTTP 5xx errors. This indicates internal problems fetching reservations
+ or Nova server data. Limes may receive stale or incomplete usage data.
+
+ - alert: CortexNovaCommittedResourceUsageLatencyTooHigh
+ expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource usage API latency too high"
+ description: >
+ The committed resource usage API (Limes LIQUID integration) is experiencing
+ high latency (p95 > 5s). This may indicate slow Nova API responses or
+ database queries. Limes scrapes may time out, affecting quota reporting.
+
+ # Committed Resource Capacity API Alerts
+ - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
+ expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource capacity API HTTP 400 errors too high"
+ description: >
+ The committed resource capacity API (Limes LIQUID integration) is responding
+ with HTTP 4xx errors. This may indicate malformed requests from Limes.
+
+ - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
+ expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource capacity API HTTP 500 errors too high"
+ description: >
+ The committed resource capacity API (Limes LIQUID integration) is responding
+ with HTTP 5xx errors. This indicates internal problems calculating cluster
+ capacity. Limes may receive stale or incomplete capacity data.
+
+ - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
+ expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
+ for: 5m
+ labels:
+ context: committed-resource-api
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource capacity API latency too high"
+ description: >
+ The committed resource capacity API (Limes LIQUID integration) is experiencing
+ high latency (p95 > 5s). This may indicate slow database queries or knowledge
+ CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
+
+ # Committed Resource Syncer Alerts
+ - alert: CortexNovaCommittedResourceSyncerErrorsHigh
+ expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
+ for: 5m
+ labels:
+ context: committed-resource-syncer
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource syncer experiencing errors"
+ description: >
+ The committed resource syncer has encountered multiple errors in the last hour.
+ This may indicate connectivity issues with Limes. Check the syncer logs for error details.
+
+ - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
+ expr: |
+ (
+ sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
+ / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
+ ) > 0.05
+ and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
+ for: 15m
+ labels:
+ context: committed-resource-syncer
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource syncer unit mismatch rate >5%"
+ description: >
+ More than 5% of commitments are being skipped due to unit mismatches between
+ Limes and Cortex flavor groups. This happens when Limes has not yet been
+ updated to use the new unit format after a flavor group change. The affected
+ commitments will keep their existing reservations until Limes notices the update.
+        Check the logs if this error persists for an extended period of time.
+
+ - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
+ expr: |
+ (
+ sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
+ / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
+ ) > 0
+ and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
+ for: 15m
+ labels:
+ context: committed-resource-syncer
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource syncer unknown flavor group rate >0%"
+ description: >
+        Some commitments reference flavor groups that no longer exist in
+        Cortex Knowledge. This may indicate that flavor group configuration is
+ out of sync between Limes and Cortex, or that Knowledge extraction is failing.
+ Check the flavor group Knowledge CRD and history to see what was changed.
+
+ - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
+ expr: |
+ (
+ (
+ rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
+ rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
+ rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
+ ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
+ ) > 0.01
+ and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
+ for: 15m
+ labels:
+ context: committed-resource-syncer
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource syncer local change rate >1%"
+ description: >
+ More than 1% of synced commitments are requiring reservation changes
+ (creates, deletes, or repairs). This is higher than expected for steady-state
+ operation and may indicate data inconsistencies, external modifications to
+ reservations, or issues with the CRDs. Check Cortex logs for details.
+
+ - alert: CortexNovaCommittedResourceSyncerRepairRateHigh
+ expr: |
+ (
+ rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
+ / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
+ ) > 0
+ and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
+ for: 15m
+ labels:
+ context: committed-resource-syncer
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Committed Resource syncer repair rate >0%"
+ description: >
+ Some commitments have reservations that needed repair
+ (wrong metadata like project ID or flavor group). This may indicate data
+ corruption, bugs in reservation creation, or external modifications.
+ Reservations are automatically repaired, but the root cause should be
+ investigated if this alert persists.
+
+ - alert: CortexNovaDoesntFindValidKVMHosts
+ expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
+ for: 5m
+ labels:
+ context: scheduling
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Nova scheduling cannot find valid KVM hosts"
+ description: >
+        Cortex is seeing faulty VMs in `{{$labels.az}}` where Nova scheduling
+ failed to find a valid `{{$labels.hvtype}}` host. This may indicate
+ capacity issues, misconfigured filters, or resource constraints in the
+ datacenter. Investigate the affected VMs and hypervisor availability.
+
+ - alert: CortexNovaNewDatasourcesNotReconciling
+ expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0
+ for: 60m
+ labels:
+ context: datasources
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "New datasource `{{$labels.datasource}}` has not reconciled"
+ description: >
+ A new datasource `{{$labels.datasource}}` has been added but has not
+ completed its first reconciliation yet. This may indicate issues with
+ the datasource controller's workqueue overprioritizing other datasources.
+
+ - alert: CortexNovaExistingDatasourcesLackingBehind
+ expr: |
+ sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600
+ and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1
+ for: 10m
+ labels:
+ context: datasources
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+      summary: "Existing datasource `{{$labels.datasource}}` is lagging behind"
+ description: >
+ An existing datasource `{{$labels.datasource}}` has been queued for
+ reconciliation for more than 10 minutes. This may indicate issues with
+ the datasource controller's workqueue or that this or another datasource
+ is taking an unusually long time to reconcile.
+
+ - alert: CortexNovaReconcileErrorsHigh
+ expr: |
+ (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m])))
+ / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1
+ for: 15m
+ labels:
+ context: controller-errors
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Controller reconcile error rate >10%"
+ description: >
+ More than 10% of controller reconciles are resulting in errors. This may
+ indicate issues with the controller logic, connectivity problems, or
+ external factors causing failures. Check the controller logs for error
+ details and investigate the affected resources.
+
+ - alert: CortexNovaReconcileDurationHigher10Min
+ expr: |
+ (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m])))
+ / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600
+ for: 15m
+ labels:
+ context: controller-duration
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+      summary: "Controller reconciliation is taking longer than {{ $value | humanizeDuration }}"
+ description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}"
+
+ - alert: CortexNovaWorkqueueNotDrained
+ expr: |
+ sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0
+ for: 60m
+ labels:
+ context: controller-workqueue
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Controller {{ $labels.name }}'s backlog is not being drained."
+ description: >
+ The workqueue for controller {{ $labels.name }} has a backlog that is
+ not being drained. This may indicate that the controller is overwhelmed
+ with work or is stuck on certain resources. Check the controller logs
+ and the state of the resources it manages for more details.
+
+ - alert: CortexNovaWebhookLatencyHigh
+ expr: |
+ histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2
+ for: 15m
+ labels:
+ context: controller-webhook
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Controller webhook {{ $labels.webhook }} latency is high"
+ description: >
+ The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms).
+ This may indicate performance issues with the webhook server or the logic it executes.
+ Check the webhook server logs and monitor its resource usage for more insights.
+
+ - alert: CortexNovaWebhookErrorsHigh
+ expr: |
+ (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m])))
+ / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1
+ for: 15m
+ labels:
+ context: controller-webhook
+ dashboard: cortex/cortex
+ service: cortex
+ severity: warning
+ support_group: workload-management
+ annotations:
+ summary: "Controller webhook {{ $labels.webhook }} is experiencing errors"
+ description: >
+ The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes.
+ This may indicate issues with the webhook logic, connectivity problems, or
+ external factors causing failures. Check the webhook server logs for error
+ details and investigate the affected resources.
\ No newline at end of file
diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
new file mode 100644
index 000000000..7db3b96e6
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml
@@ -0,0 +1,17 @@
+# Copyright SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.alerts.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: cortex-placement-shim-alerts
+ labels:
+ type: alerting-rules
+ prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }}
+spec:
+  {{- /* Inline every bundled alerts file into the PrometheusRule spec. */}}
+  {{- $files := .Files.Glob "alerts/*.alerts.yaml" }}
+  {{- range $path, $file := $files }}
+  {{- $file | toString | nindent 2 }}{{- end }}
+{{- end }}
diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml
new file mode 100644
index 000000000..489878c89
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml
@@ -0,0 +1,23 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: cortex-placement-shim-role-hypervisor
+rules:
+- apiGroups:
+ - kvm.cloud.sap
+ resources:
+ - hypervisors
+ verbs:
+ - get
+ - list
+ - patch
+ - update
+ - watch
+- apiGroups:
+ - kvm.cloud.sap
+ resources:
+ - hypervisors/status
+ verbs:
+ - get
\ No newline at end of file
diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml
new file mode 100644
index 000000000..0388373f9
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml
@@ -0,0 +1,14 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: cortex-placement-shim-rolebinding-hypervisor
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: cortex-placement-shim-role-hypervisor
+subjects:
+- kind: ServiceAccount
+ name: cortex-placement-shim
+ namespace: {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml
new file mode 100644
index 000000000..40aa9cb11
--- /dev/null
+++ b/helm/bundles/cortex-placement-shim/values.yaml
@@ -0,0 +1,23 @@
+# Copyright SAP SE
+# SPDX-License-Identifier: Apache-2.0
+
+owner-info:
+ enabled: true
+ helm-chart-url: "https://github.com/cobaltcore-dev/cortex/helm/bundles/cortex-placement-shim"
+ maintainers:
+ - "arno.uhlig@sap.com"
+ - "julius.clausnitzer@sap.com"
+ - "malte.viering@sap.com"
+ - "marcel.gute@sap.com"
+ - "markus.wieland@sap.com"
+ - "p.matthes@sap.com"
+ support-group: "workload-management"
+ service: "cortex-placement-shim"
+
+alerts:
+ enabled: true
+ prometheus: openstack
+
+cortex-shim:
+ namePrefix: cortex-placement
+ conf: {} # TODO
diff --git a/helm/library/cortex-shim/Chart.lock b/helm/library/cortex-shim/Chart.lock
new file mode 100644
index 000000000..db4c5823b
--- /dev/null
+++ b/helm/library/cortex-shim/Chart.lock
@@ -0,0 +1,6 @@
+dependencies:
+- name: owner-info
+ repository: oci://ghcr.io/sapcc/helm-charts
+ version: 1.0.0
+digest: sha256:7643f231cc4ebda347fd12ec62fe4445c280e2b71d27eec555f3025290f5038f
+generated: "2025-08-26T10:55:05.888651+02:00"
diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml
new file mode 100644
index 000000000..5282dc655
--- /dev/null
+++ b/helm/library/cortex-shim/Chart.yaml
@@ -0,0 +1,8 @@
+apiVersion: v2
+name: cortex-shim
+description: A Helm chart to distribute cortex shims.
+type: application
+version: 0.0.1
+appVersion: "sha-3e56acea"
+icon: "https://example.com/icon.png"
+dependencies: []
diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl
new file mode 100644
index 000000000..782e14eef
--- /dev/null
+++ b/helm/library/cortex-shim/templates/_helpers.tpl
@@ -0,0 +1,50 @@
+{{- define "chart.name" -}}
+{{- if .Chart }}
+ {{- if .Chart.Name }}
+ {{- .Chart.Name | trunc 63 | trimSuffix "-" }}
+ {{- else if .Values.nameOverride }}
+ {{ .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+ {{- else }}
+ scheduling
+ {{- end }}
+{{- else }}
+ scheduling
+{{- end }}
+{{- end }}
+
+
+{{- define "chart.labels" -}}
+{{- if .Chart.AppVersion -}}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- if .Chart.Version }}
+helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | quote }}
+{{- end }}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+
+{{- define "chart.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+
+{{- /* Returns "true" if any webhook entry in the given list has type "mutating". */}}
+{{- define "chart.hasMutatingWebhooks" -}}
+{{- $hasMutating := false }}
+{{- range . }}
+  {{- if eq .type "mutating" }}{{- $hasMutating = true }}{{- end }}
+{{- end }}
+{{- $hasMutating }}{{- end }}
+
+
+{{- /* Returns "true" if any webhook entry in the given list has type "validating". */}}
+{{- define "chart.hasValidatingWebhooks" -}}
+{{- $hasValidating := false }}
+{{- range . }}
+  {{- if eq .type "validating" }}{{- $hasValidating = true }}{{- end }}
+{{- end }}
+{{- $hasValidating }}{{- end }}
diff --git a/helm/library/cortex-shim/templates/clusterrole.yaml b/helm/library/cortex-shim/templates/clusterrole.yaml
new file mode 100644
index 000000000..74f8e7ad4
--- /dev/null
+++ b/helm/library/cortex-shim/templates/clusterrole.yaml
@@ -0,0 +1,100 @@
+# Roles that grant the shims access to cortex crds.
+{{- if .Values.rbac.enable }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-shim-role
+rules:
+- apiGroups:
+ - cortex.cloud
+ resources:
+ - knowledges
+ - datasources
+ - reservations
+ - decisions
+ - deschedulings
+ - pipelines
+ - kpis
+ - histories
+ verbs:
+ - create
+ - delete
+ - get
+ - list
+ - patch
+ - update
+ - watch
+- apiGroups:
+ - cortex.cloud
+ resources:
+ - knowledges/finalizers
+ - datasources/finalizers
+ - reservations/finalizers
+ - decisions/finalizers
+ - deschedulings/finalizers
+ - pipelines/finalizers
+ - kpis/finalizers
+ - histories/finalizers
+ verbs:
+ - update
+- apiGroups:
+ - cortex.cloud
+ resources:
+ - knowledges/status
+ - datasources/status
+ - reservations/status
+ - decisions/status
+ - deschedulings/status
+ - pipelines/status
+ - kpis/status
+ - histories/status
+ verbs:
+ - get
+ - patch
+ - update
+- apiGroups:
+ - events.k8s.io
+ resources:
+ - events
+ verbs:
+ - create
+ - patch
+{{- end -}}
+{{- if and .Values.rbac.enable .Values.metrics.enable }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-metrics-reader
+rules:
+- nonResourceURLs:
+ - "/metrics"
+ verbs:
+ - get
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-metrics-auth-role
+rules:
+- apiGroups:
+ - authentication.k8s.io
+ resources:
+ - tokenreviews
+ verbs:
+ - create
+- apiGroups:
+ - authorization.k8s.io
+ resources:
+ - subjectaccessreviews
+ verbs:
+ - create
+{{- end -}}
+
diff --git a/helm/library/cortex-shim/templates/clusterrolebinding.yaml b/helm/library/cortex-shim/templates/clusterrolebinding.yaml
new file mode 100644
index 000000000..ca82a0119
--- /dev/null
+++ b/helm/library/cortex-shim/templates/clusterrolebinding.yaml
@@ -0,0 +1,34 @@
+{{- if .Values.rbac.enable }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-shim-rolebinding
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: {{ .Values.namePrefix }}-shim-role
+subjects:
+- kind: ServiceAccount
+ name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }}
+ namespace: {{ .Release.Namespace }}
+{{- end -}}
+{{- if and .Values.rbac.enable .Values.metrics.enable }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-metrics-auth-rolebinding
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: {{ .Values.namePrefix }}-metrics-auth-role
+subjects:
+- kind: ServiceAccount
+ name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }}
+ namespace: {{ .Release.Namespace }}
+{{- end -}}
+
diff --git a/helm/library/cortex-shim/templates/deployment.yaml b/helm/library/cortex-shim/templates/deployment.yaml
new file mode 100644
index 000000000..b38eb3c02
--- /dev/null
+++ b/helm/library/cortex-shim/templates/deployment.yaml
@@ -0,0 +1,112 @@
+# This file is safe from kubebuilder edit --plugins=helm/v1-alpha
+# If you want to re-generate, add the --force flag.
+
+{{- if .Values.deployment.enable }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ .Values.namePrefix }}-shim
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+spec:
+ replicas: {{ .Values.deployment.replicas }}
+ selector:
+ matchLabels:
+ {{- include "chart.selectorLabels" . | nindent 6 }}
+ template:
+ metadata:
+ annotations:
+ kubectl.kubernetes.io/default-container: shim
+ labels:
+ {{- include "chart.labels" . | nindent 8 }}
+ {{- if and .Values.deployment.pod .Values.deployment.pod.labels }}
+ {{- range $key, $value := .Values.deployment.pod.labels }}
+ {{ $key }}: {{ $value }}
+ {{- end }}
+ {{- end }}
+ spec:
+ containers:
+ - name: shim
+ args:
+ {{- range .Values.deployment.container.args }}
+ - {{ . }}
+ {{- end }}
+ ports:
+ - name: api
+ containerPort: 8080
+ protocol: TCP
+ - name: metrics
+ containerPort: 2112
+ protocol: TCP
+ command:
+ - /main
+ image: {{ .Values.deployment.container.image.repository }}:{{ .Values.deployment.container.image.tag | default .Chart.AppVersion }}
+ {{- if .Values.deployment.container.image.pullPolicy }}
+ imagePullPolicy: {{ .Values.deployment.container.image.pullPolicy }}
+ {{- end }}
+ {{- if .Values.deployment.container.env }}
+ env:
+ {{- range $key, $value := .Values.deployment.container.env }}
+ - name: {{ $key }}
+ value: {{ $value }}
+ {{- end }}
+ {{- end }}
+ livenessProbe:
+ {{- toYaml .Values.deployment.container.livenessProbe | nindent 12 }}
+ readinessProbe:
+ {{- toYaml .Values.deployment.container.readinessProbe | nindent 12 }}
+ resources:
+ {{- toYaml .Values.deployment.container.resources | nindent 12 }}
+ securityContext:
+ {{- toYaml .Values.deployment.container.securityContext | nindent 12 }}
+ volumeMounts:
+ - name: shim-config-volume
+ mountPath: /etc/config
+ - name: shim-secrets-volume
+ mountPath: /etc/secrets
+ readOnly: true
+ securityContext:
+ {{- toYaml .Values.deployment.securityContext | nindent 8 }}
+ serviceAccountName: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }}
+ terminationGracePeriodSeconds: {{ .Values.deployment.terminationGracePeriodSeconds }}
+ volumes:
+ # Custom values to configure the shim.
+ - name: shim-config-volume
+ configMap:
+ name: {{ .Values.namePrefix }}-shim-config
+ - name: shim-secrets-volume
+ secret:
+ secretName: {{ .Values.namePrefix }}-shim-secrets
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ .Values.namePrefix }}-shim-config
+data:
+ conf.json: |-
+    {{- /* Global conf overrides chart-local conf; deepCopy avoids mutating .Values across renders. */}}
+    {{- $mergedConf := dict }}
+    {{- if .Values.global.conf }}
+    {{- $mergedConf = deepCopy .Values.global.conf }}
+    {{- end }}
+    {{- if .Values.conf }}
+    {{- $mergedConf = mergeOverwrite (deepCopy .Values.conf) $mergedConf }}{{- end }}
+    {{ toJson $mergedConf }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+ name: {{ .Values.namePrefix }}-shim-secrets
+type: Opaque
+data:
+ secrets.json: |-
+    {{- /* Global secrets override chart-local secrets; deepCopy avoids mutating .Values across renders. */}}
+    {{- $mergedSecrets := dict }}
+    {{- if .Values.global.secrets }}
+    {{- $mergedSecrets = deepCopy .Values.global.secrets }}
+    {{- end }}
+    {{- if .Values.secrets }}
+    {{- $mergedSecrets = mergeOverwrite (deepCopy .Values.secrets) $mergedSecrets }}{{- end }}
+    {{ toJson $mergedSecrets | b64enc }}
+{{- end }}
\ No newline at end of file
diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml
new file mode 100644
index 000000000..549ceed95
--- /dev/null
+++ b/helm/library/cortex-shim/templates/service.yaml
@@ -0,0 +1,33 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ .Values.namePrefix }}-shim-service
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+spec:
+ ports:
+ - port: 8080
+ targetPort: api
+ protocol: TCP
+ name: api
+ selector:
+ app.kubernetes.io/name: {{ include "chart.name" . }}
+{{- if .Values.metrics.enable }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ .Values.namePrefix }}-shim-metrics-service
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+spec:
+ ports:
+ - port: 2112
+ targetPort: metrics
+ protocol: TCP
+ name: metrics
+ selector:
+ app.kubernetes.io/name: {{ include "chart.name" . }}
+{{- end }}
diff --git a/helm/library/cortex-shim/templates/serviceaccount.yaml b/helm/library/cortex-shim/templates/serviceaccount.yaml
new file mode 100644
index 000000000..ea0789dd0
--- /dev/null
+++ b/helm/library/cortex-shim/templates/serviceaccount.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.rbac.enable }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ {{- if and .Values.deployment.serviceAccount .Values.deployment.serviceAccount.annotations }}
+ annotations:
+ {{- range $key, $value := .Values.deployment.serviceAccount.annotations }}
+ {{ $key }}: {{ $value }}
+ {{- end }}
+ {{- end }}
+ name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }}
+ namespace: {{ .Release.Namespace }}
+{{- end -}}
diff --git a/helm/library/cortex-shim/templates/servicemonitor.yaml b/helm/library/cortex-shim/templates/servicemonitor.yaml
new file mode 100644
index 000000000..803e66dd5
--- /dev/null
+++ b/helm/library/cortex-shim/templates/servicemonitor.yaml
@@ -0,0 +1,16 @@
+# To integrate with Prometheus.
+{{- if .Values.prometheus.enable }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ labels:
+ {{- include "chart.labels" . | nindent 4 }}
+ name: {{ .Values.namePrefix }}-shim-metrics-monitor
+ namespace: {{ .Release.Namespace }}
+spec:
+ endpoints:
+ - port: metrics
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: {{ include "chart.name" . }}
+{{- end }}
diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml
new file mode 100644
index 000000000..6434e823a
--- /dev/null
+++ b/helm/library/cortex-shim/values.yaml
@@ -0,0 +1,68 @@
+deployment:
+ enable: true
+ replicas: 1
+ container:
+ image:
+ repository: ghcr.io/cobaltcore-dev/cortex-shim
+ args:
+ - "--metrics-bind-address=:2112"
+ - "--health-probe-bind-address=:8081"
+ - "--metrics-secure=false"
+ resources:
+ limits:
+ cpu: 500m
+ memory: 2048Mi
+ requests:
+ cpu: 10m
+ memory: 64Mi
+ livenessProbe:
+ initialDelaySeconds: 15
+ periodSeconds: 20
+ httpGet:
+ path: /healthz
+ port: 8081
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ httpGet:
+ path: /readyz
+ port: 8081
+ securityContext:
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - "ALL"
+ securityContext:
+ runAsNonRoot: true
+ seccompProfile:
+ type: RuntimeDefault
+ terminationGracePeriodSeconds: 10
+ serviceAccountName: shim
+
+# [METRICS]: Set to true to generate manifests for exporting metrics.
+# To disable metrics export set false, and ensure that the
+# ControllerManager argument "--metrics-bind-address=:8443" is removed.
+metrics:
+ enable: true
+
+# [RBAC]: To enable RBAC (Permissions) configurations
+rbac:
+ enable: true
+
+# [PROMETHEUS]: To enable a ServiceMonitor to export metrics to Prometheus set true
+prometheus:
+ enable: true
+
+global:
+ conf: {}
+
+# Use this to unambiguate multiple cortex deployments in the same cluster.
+namePrefix: cortex
+conf:
+ # The scheduling domain this operator is responsible for.
+ schedulingDomain: cortex
+ # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID)
+ leaderElectionID: cortex-unknown
+ enabledControllers:
+ # The explanation controller is available for all decision resources.
+ - explanation-controller
diff --git a/helm/library/cortex/templates/manager/manager.yaml b/helm/library/cortex/templates/manager/manager.yaml
index 73672164f..0c9f362aa 100644
--- a/helm/library/cortex/templates/manager/manager.yaml
+++ b/helm/library/cortex/templates/manager/manager.yaml
@@ -51,7 +51,7 @@ spec:
protocol: TCP
{{- end }}
command:
- - /manager
+ - /main
image: {{ .Values.controllerManager.container.image.repository }}:{{ .Values.controllerManager.container.image.tag | default .Chart.AppVersion }}
{{- if .Values.controllerManager.container.image.pullPolicy }}
imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }}
diff --git a/internal/shim/placement/.gitkeep b/internal/shim/placement/.gitkeep
new file mode 100644
index 000000000..e69de29bb
From b55ca3c98c694f8fc6d291a0e5b72ef3e99e987f Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 11:56:31 +0200
Subject: [PATCH 2/6] Add monitoring labels and scaffold manager (w/o leader
election)
---
cmd/shim/main.go | 248 +++++++++++++++++-
.../bundles/cortex-placement-shim/values.yaml | 6 +-
helm/library/cortex-shim/values.yaml | 9 +-
3 files changed, 252 insertions(+), 11 deletions(-)
diff --git a/cmd/shim/main.go b/cmd/shim/main.go
index 6b0634229..d59490c3c 100644
--- a/cmd/shim/main.go
+++ b/cmd/shim/main.go
@@ -3,7 +3,251 @@
package main
+import (
+ "context"
+ "crypto/tls"
+ "errors"
+ "flag"
+ "net/http"
+ "os"
+ "path/filepath"
+
+ "github.com/cobaltcore-dev/cortex/api/v1alpha1"
+ "github.com/cobaltcore-dev/cortex/pkg/conf"
+ "github.com/cobaltcore-dev/cortex/pkg/monitoring"
+ hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+ "github.com/sapcc/go-bits/httpext"
+ "k8s.io/apimachinery/pkg/runtime"
+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+ clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/certwatcher"
+ "sigs.k8s.io/controller-runtime/pkg/healthz"
+ "sigs.k8s.io/controller-runtime/pkg/log/zap"
+ "sigs.k8s.io/controller-runtime/pkg/metrics"
+ "sigs.k8s.io/controller-runtime/pkg/metrics/filters"
+ metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+ "sigs.k8s.io/controller-runtime/pkg/webhook"
+)
+
+var (
+ // Scheme defines the scheme for the API types used by the shim.
+ scheme = runtime.NewScheme()
+ // setupLog is the logger used for setup operations in the shim.
+ setupLog = ctrl.Log.WithName("setup")
+)
+
+func init() {
+ // Bind the Kubernetes client-go scheme and the custom API types to the
+ // scheme used by the shim.
+ utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+ utilruntime.Must(v1alpha1.AddToScheme(scheme)) // Cortex crds
+ utilruntime.Must(hv1.AddToScheme(scheme)) // Hypervisor crd
+}
+
func main() {
- // TODO: this needs scaffolding, for now it just does nothing.
- select {}
+ ctx := context.Background()
+ restConfig := ctrl.GetConfigOrDie()
+
+ var metricsAddr string
+ var metricsCertPath, metricsCertName, metricsCertKey string
+ var webhookCertPath, webhookCertName, webhookCertKey string
+ // The shim does not require leader election, but this flag is provided to
+ // stay consistent with the kubebuilder scaffold.
+ var enableLeaderElection bool
+ var probeAddr string
+ var secureMetrics bool
+ var enableHTTP2 bool
+ var tlsOpts []func(*tls.Config)
+ flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
+ flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
+ flag.BoolVar(&enableLeaderElection, "leader-elect", false,
+ "Enable leader election for controller manager. "+
+ "Enabling this will ensure there is only one active controller manager.")
+ flag.BoolVar(&secureMetrics, "metrics-secure", true,
+ "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
+ flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.")
+ flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.")
+ flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.")
+ flag.StringVar(&metricsCertPath, "metrics-cert-path", "",
+ "The directory that contains the metrics server certificate.")
+ flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.")
+ flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.")
+ flag.BoolVar(&enableHTTP2, "enable-http2", false,
+ "If set, HTTP/2 will be enabled for the metrics and webhook servers")
+ opts := zap.Options{
+ Development: true,
+ }
+ opts.BindFlags(flag.CommandLine)
+ flag.Parse()
+
+ // Check that we're really running this shim without leader election enabled.
+ if enableLeaderElection {
+ err := errors.New("leader election should not be enabled for the shim")
+ setupLog.Error(err, "invalid configuration")
+ os.Exit(1)
+ }
+
+ ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
+
+ // if the enable-http2 flag is false (the default), http/2 should be disabled
+ // due to its vulnerabilities. More specifically, disabling http/2 will
+ // prevent from being vulnerable to the HTTP/2 Stream Cancellation and
+ // Rapid Reset CVEs. For more information see:
+ // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3
+ // - https://github.com/advisories/GHSA-4374-p667-p6c8
+ disableHTTP2 := func(c *tls.Config) {
+ setupLog.Info("disabling http/2")
+ c.NextProtos = []string{"http/1.1"}
+ }
+
+ if !enableHTTP2 {
+ tlsOpts = append(tlsOpts, disableHTTP2)
+ }
+
+ // Create watchers for metrics and webhooks certificates
+ var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher
+
+ // Initial webhook TLS options
+ webhookTLSOpts := tlsOpts
+
+ if webhookCertPath != "" {
+ setupLog.Info("Initializing webhook certificate watcher using provided certificates",
+ "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey)
+
+ var err error
+ webhookCertWatcher, err = certwatcher.New(
+ filepath.Join(webhookCertPath, webhookCertName),
+ filepath.Join(webhookCertPath, webhookCertKey),
+ )
+ if err != nil {
+ setupLog.Error(err, "Failed to initialize webhook certificate watcher")
+ os.Exit(1)
+ }
+
+ webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) {
+ config.GetCertificate = webhookCertWatcher.GetCertificate
+ })
+ }
+
+ webhookServer := webhook.NewServer(webhook.Options{
+ TLSOpts: webhookTLSOpts,
+ })
+
+ // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
+ // More info:
+ // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/server
+ // - https://book.kubebuilder.io/reference/metrics.html
+ metricsServerOptions := metricsserver.Options{
+ BindAddress: metricsAddr,
+ SecureServing: secureMetrics,
+ TLSOpts: tlsOpts,
+ }
+
+ if secureMetrics {
+ // FilterProvider is used to protect the metrics endpoint with authn/authz.
+ // These configurations ensure that only authorized users and service accounts
+ // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. More info:
+ // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/filters#WithAuthenticationAndAuthorization
+ metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization
+ }
+
+ // If the certificate is not specified, controller-runtime will automatically
+ // generate self-signed certificates for the metrics server. While convenient for development and testing,
+ // this setup is not recommended for production.
+ //
+ // If you enable certManager, uncomment the following lines:
+ // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates
+ // managed by cert-manager for the metrics server.
+ // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification.
+ if metricsCertPath != "" {
+ setupLog.Info("Initializing metrics certificate watcher using provided certificates",
+ "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey)
+
+ var err error
+ metricsCertWatcher, err = certwatcher.New(
+ filepath.Join(metricsCertPath, metricsCertName),
+ filepath.Join(metricsCertPath, metricsCertKey),
+ )
+ if err != nil {
+ setupLog.Error(err, "Failed to initialize metrics certificate watcher", "error", err)
+ os.Exit(1)
+ }
+
+ metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) {
+ config.GetCertificate = metricsCertWatcher.GetCertificate
+ })
+ }
+
+ mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
+ Scheme: scheme,
+ Metrics: metricsServerOptions,
+ WebhookServer: webhookServer,
+ HealthProbeBindAddress: probeAddr,
+ // Kept for consistency with kubebuilder scaffold, but the shim should
+ // always run with leader election disabled.
+ LeaderElection: enableLeaderElection,
+ })
+ if err != nil {
+ setupLog.Error(err, "unable to start manager")
+ os.Exit(1)
+ }
+
+ // TODO: Initialize multicluster client here.
+
+ // Our custom monitoring registry can add prometheus labels to all metrics.
+ // This is useful to distinguish metrics from different deployments.
+ metricsConfig := conf.GetConfigOrDie[monitoring.Config]()
+ metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig)
+
+ // API endpoint.
+ mux := http.NewServeMux()
+
+ // +kubebuilder:scaffold:builder
+
+ if metricsCertWatcher != nil {
+ setupLog.Info("Adding metrics certificate watcher to manager")
+ if err := mgr.Add(metricsCertWatcher); err != nil {
+ setupLog.Error(err, "unable to add metrics certificate watcher to manager")
+ os.Exit(1)
+ }
+ }
+
+ if webhookCertWatcher != nil {
+ setupLog.Info("Adding webhook certificate watcher to manager")
+ if err := mgr.Add(webhookCertWatcher); err != nil {
+ setupLog.Error(err, "unable to add webhook certificate watcher to manager")
+ os.Exit(1)
+ }
+ }
+
+ if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
+ setupLog.Error(err, "unable to set up health check")
+ os.Exit(1)
+ }
+ if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+ setupLog.Error(err, "unable to set up ready check")
+ os.Exit(1)
+ }
+
+ errchan := make(chan error)
+ go func() {
+ errchan <- func() error {
+ setupLog.Info("starting api server", "address", ":8080")
+ return httpext.ListenAndServeContext(ctx, ":8080", mux)
+ }()
+ }()
+ go func() {
+ if err := <-errchan; err != nil {
+ setupLog.Error(err, "problem running api server")
+ os.Exit(1)
+ }
+ }()
+
+ setupLog.Info("starting manager")
+ if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+ setupLog.Error(err, "problem running manager")
+ os.Exit(1)
+ }
}
diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml
index 40aa9cb11..6dd793653 100644
--- a/helm/bundles/cortex-placement-shim/values.yaml
+++ b/helm/bundles/cortex-placement-shim/values.yaml
@@ -20,4 +20,8 @@ alerts:
cortex-shim:
namePrefix: cortex-placement
- conf: {} # TODO
+ conf:
+ monitoring:
+ labels:
+ github_org: cobaltcore-dev
+ github_repo: cortex
diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml
index 6434e823a..1c45c2542 100644
--- a/helm/library/cortex-shim/values.yaml
+++ b/helm/library/cortex-shim/values.yaml
@@ -58,11 +58,4 @@ global:
# Use this to unambiguate multiple cortex deployments in the same cluster.
namePrefix: cortex
-conf:
- # The scheduling domain this operator is responsible for.
- schedulingDomain: cortex
- # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID)
- leaderElectionID: cortex-unknown
- enabledControllers:
- # The explanation controller is available for all decision resources.
- - explanation-controller
+conf: {} # No shared config is needed by all shims yet.
From 5bd2e0491899ea8f3e44ada30ba2b3878ba0b93d Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 12:01:19 +0200
Subject: [PATCH 3/6] Remove alerts
---
.../alerts/placement-shim.alerts.yaml | 735 +-----------------
1 file changed, 2 insertions(+), 733 deletions(-)
diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
index 41bf29794..03aea7763 100644
--- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
+++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml
@@ -1,734 +1,3 @@
groups:
-- name: cortex-nova-alerts
- rules:
- - alert: CortexNovaSchedulingDown
- expr: |
- up{pod=~"cortex-nova-scheduling-.*"} != 1 or
- absent(up{pod=~"cortex-nova-scheduling-.*"})
- for: 5m
- labels:
- context: liveness
- dashboard: cortex/cortex
- service: cortex
- severity: critical
- support_group: workload-management
- playbook: docs/support/playbook/cortex/down
- annotations:
- summary: "Cortex Scheduling for Nova is down"
- description: >
- The Cortex scheduling service is down. Scheduling requests from Nova will
- not be served. This is non-critical for vmware virtual machines, but
- blocks kvm virtual machines from being scheduled. Thus, it is
- recommended to immediately investigate and resolve the issue.
-
- - alert: CortexNovaKnowledgeDown
- expr: |
- up{pod=~"cortex-nova-knowledge-.*"} != 1 or
- absent(up{pod=~"cortex-nova-knowledge-.*"})
- for: 5m
- labels:
- context: liveness
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- playbook: docs/support/playbook/cortex/down
- annotations:
- summary: "Cortex Knowledge for Nova is down"
- description: >
- The Cortex Knowledge service is down. This is no immediate problem,
- since cortex is still able to process requests,
- but the quality of the responses may be affected.
-
- - alert: CortexNovaDeschedulerPipelineErroring
- expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0
- for: 5m
- labels:
- context: descheduler
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Descheduler pipeline is erroring."
- description: >
- The Cortex descheduler pipeline is encountering errors during its execution.
- This may indicate issues with the descheduling logic or the underlying infrastructure.
- It is recommended to investigate the descheduler logs and the state of the VMs being processed.
-
- - alert: CortexNovaHttpRequest400sTooHigh
- expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1
- for: 5m
- labels:
- context: api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Nova Scheduler HTTP request 400 errors too high"
- description: >
- Nova Scheduler is responding to placement requests with HTTP 4xx
- errors. This is expected when the scheduling request cannot be served
- by Cortex. However, it could also indicate that the request format has
- changed and Cortex is unable to parse it.
-
- - alert: CortexNovaSchedulingHttpRequest500sTooHigh
- expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1
- for: 5m
- labels:
- context: api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Nova Scheduler HTTP request 500 errors too high"
- description: >
- Nova Scheduler is responding to placement requests with HTTP 5xx errors.
- This is not expected and indicates that Cortex is having some internal problem.
- Nova will continue to place new VMs, but the placement will be less desirable.
- Thus, no immediate action is needed.
-
- - alert: CortexNovaHighMemoryUsage
- expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024
- for: 5m
- labels:
- context: memory
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "`{{$labels.component}}` uses too much memory"
- description: >
- `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it
- should use much less, so there may be a memory leak or other changes
- that are causing the memory usage to increase significantly.
-
- - alert: CortexNovaHighCPUUsage
- expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5
- for: 5m
- labels:
- context: cpu
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "`{{$labels.component}}` uses too much CPU"
- description: >
- `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually
- it should use much less, so there may be a CPU leak or other changes
- that are causing the CPU usage to increase significantly.
-
- - alert: CortexNovaTooManyDBConnectionAttempts
- expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1
- for: 5m
- labels:
- context: db
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "`{{$labels.component}}` is trying to connect to the database too often"
- description: >
- `{{$labels.component}}` is trying to connect to the database too often. This may happen
- when the database is down or the connection parameters are misconfigured.
-
- - alert: CortexNovaSyncNotSuccessful
- expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0
- for: 5m
- labels:
- context: syncstatus
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "`{{$labels.component}}` Sync not successful"
- description: >
- `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may
- happen when the datasource (OpenStack, Prometheus, etc.) is down or
- the sync module is misconfigured. No immediate action is needed, since
- the sync module will retry the sync operation and the currently synced
- data will be kept. However, when this problem persists for a longer
- time the service will have a less recent view of the datacenter.
-
- - alert: CortexNovaSyncObjectsDroppedToZero
- expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0
- for: 60m
- labels:
- context: syncobjects
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`"
- description: >
- `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen
- when the datasource (OpenStack, Prometheus, etc.) is down or the sync
- module is misconfigured. No immediate action is needed, since the sync
- module will retry the sync operation and the currently synced data will
- be kept. However, when this problem persists for a longer time the
- service will have a less recent view of the datacenter.
-
- - alert: CortexNovaDatasourceUnready
- expr: cortex_datasource_state{domain="nova",state!="ready"} != 0
- for: 60m
- labels:
- context: datasources
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state"
- description: >
- This may indicate issues with the datasource
- connectivity or configuration. It is recommended to investigate the
- datasource status and logs for more details.
-
- - alert: CortexNovaKnowledgeUnready
- expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0
- for: 60m
- labels:
- context: knowledge
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
- description: >
- This may indicate issues with the knowledge
- configuration. It is recommended to investigate the
- knowledge status and logs for more details.
-
- - alert: CortexNovaDecisionsWithErrors
- expr: cortex_decision_state{domain="nova",state="error"} > 0
- for: 5m
- labels:
- context: decisions
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Some decisions are in error state for operator `{{$labels.operator}}`"
- description: >
- The cortex scheduling pipeline generated decisions that are in error state.
- This may indicate issues with the decision logic or the underlying infrastructure.
- It is recommended to investigate the decision logs and the state of the
- VMs being processed.
-
- - alert: CortexNovaTooManyDecisionsWaiting
- expr: cortex_decision_state{domain="nova",state="waiting"} > 10
- for: 5m
- labels:
- context: decisions
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`"
- description: >
- The cortex scheduling pipeline has a high number of decisions for which
- no target host has been assigned yet.
-
- This may indicate a backlog in processing or issues with the decision logic.
- It is recommended to investigate the decision logs and the state of the
- VMs being processed.
-
- - alert: CortexNovaKPIUnready
- expr: |
- cortex_kpi_state{domain="nova",state!="ready"} != 0
- for: 60m
- labels:
- context: kpis
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state"
- description: >
- This may indicate issues with the KPI
- configuration. It is recommended to investigate the
- KPI status and logs for more details.
-
- - alert: CortexNovaPipelineUnready
- expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0
- for: 5m
- labels:
- context: pipelines
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state"
- description: >
- This may indicate issues with the pipeline
- configuration. It is recommended to investigate the
- pipeline status and logs for more details.
-
- # Committed Resource Info API Alerts
- - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh
- expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource info API HTTP 500 errors too high"
- description: >
- The committed resource info API (Limes LIQUID integration) is responding
- with HTTP 5xx errors. This indicates internal problems building service info,
- such as invalid flavor group data. Limes will not be able to discover available
- resources until the issue is resolved.
-
- # Committed Resource Change API Alerts
- - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
- expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource change API HTTP 400 errors too high"
- description: >
- The committed resource change API (Limes LIQUID integration) is responding
- with HTTP 4xx errors. This may happen when Limes sends a request with
- an outdated info version (409), the API is temporarily unavailable,
- or the request format is invalid. Limes will typically retry these
- requests, so no immediate action is needed unless the errors persist.
-
- - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
- expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource change API HTTP 500 errors too high"
- description: >
- The committed resource change API (Limes LIQUID integration) is responding
- with HTTP 5xx errors. This is not expected and indicates that Cortex
- is having an internal problem processing commitment changes. Limes will
- continue to retry, but new commitments may not be fulfilled until the
- issue is resolved.
-
- - alert: CortexNovaCommittedResourceLatencyTooHigh
- expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource change API latency too high"
- description: >
- The committed resource change API (Limes LIQUID integration) is experiencing
- high latency (p95 > 30s). This may indicate that the scheduling pipeline
- is under heavy load or that reservation scheduling is taking longer than
- expected. Limes requests may time out, causing commitment changes to fail.
-
- - alert: CortexNovaCommittedResourceRejectionRateTooHigh
- expr: |
- sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m]))
- / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource rejection rate too high"
- description: >
- More than 50% of commitment change requests are being rejected.
- This may indicate insufficient capacity in the datacenter to fulfill
- new commitments, or issues with the commitment scheduling logic.
- Rejected commitments are rolled back, so Limes will see them as failed
- and may retry or report the failure to users.
-
- - alert: CortexNovaCommittedResourceTimeoutsTooHigh
- expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource change API timeouts too high"
- description: >
- The committed resource change API (Limes LIQUID integration) timed out
- while waiting for reservations to become ready. This indicates that the
- scheduling pipeline is overloaded or reservations are taking too long
- to be scheduled. Affected commitment changes are rolled back and Limes
- will see them as failed. Consider investigating the scheduler performance
- or increasing the timeout configuration.
-
- # Committed Resource Usage API Alerts
- - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
- expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource usage API HTTP 400 errors too high"
- description: >
- The committed resource usage API (Limes LIQUID integration) is responding
- with HTTP 4xx errors. This may indicate invalid project IDs or malformed
- requests from Limes. Limes will typically retry these requests.
-
- - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
- expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource usage API HTTP 500 errors too high"
- description: >
- The committed resource usage API (Limes LIQUID integration) is responding
- with HTTP 5xx errors. This indicates internal problems fetching reservations
- or Nova server data. Limes may receive stale or incomplete usage data.
-
- - alert: CortexNovaCommittedResourceUsageLatencyTooHigh
- expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource usage API latency too high"
- description: >
- The committed resource usage API (Limes LIQUID integration) is experiencing
- high latency (p95 > 5s). This may indicate slow Nova API responses or
- database queries. Limes scrapes may time out, affecting quota reporting.
-
- # Committed Resource Capacity API Alerts
- - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
- expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource capacity API HTTP 400 errors too high"
- description: >
- The committed resource capacity API (Limes LIQUID integration) is responding
- with HTTP 4xx errors. This may indicate malformed requests from Limes.
-
- - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
- expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource capacity API HTTP 500 errors too high"
- description: >
- The committed resource capacity API (Limes LIQUID integration) is responding
- with HTTP 5xx errors. This indicates internal problems calculating cluster
- capacity. Limes may receive stale or incomplete capacity data.
-
- - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
- expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
- for: 5m
- labels:
- context: committed-resource-api
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource capacity API latency too high"
- description: >
- The committed resource capacity API (Limes LIQUID integration) is experiencing
- high latency (p95 > 5s). This may indicate slow database queries or knowledge
- CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
-
- # Committed Resource Syncer Alerts
- - alert: CortexNovaCommittedResourceSyncerErrorsHigh
- expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
- for: 5m
- labels:
- context: committed-resource-syncer
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer experiencing errors"
- description: >
- The committed resource syncer has encountered multiple errors in the last hour.
- This may indicate connectivity issues with Limes. Check the syncer logs for error details.
-
- - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
- expr: |
- (
- sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
- / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
- ) > 0.05
- and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer unit mismatch rate >5%"
- description: >
- More than 5% of commitments are being skipped due to unit mismatches between
- Limes and Cortex flavor groups. This happens when Limes has not yet been
- updated to use the new unit format after a flavor group change. The affected
- commitments will keep their existing reservations until Limes notices the update.
- Check the logs if this error persists for longer time.
-
- - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
- expr: |
- (
- sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
- / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
- ) > 0
- and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer unknown flavor group rate >0%"
- description: >
- Some commitments reference flavor groups that don't exist in
- Cortex Knowledge (anymore). This may indicate that flavor group configuration is
- out of sync between Limes and Cortex, or that Knowledge extraction is failing.
- Check the flavor group Knowledge CRD and history to see what was changed.
-
- - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
- expr: |
- (
- (
- rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
- rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
- rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
- ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
- ) > 0.01
- and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer local change rate >1%"
- description: >
- More than 1% of synced commitments are requiring reservation changes
- (creates, deletes, or repairs). This is higher than expected for steady-state
- operation and may indicate data inconsistencies, external modifications to
- reservations, or issues with the CRDs. Check Cortex logs for details.
-
- - alert: CortexNovaCommittedResourceSyncerRepairRateHigh
- expr: |
- (
- rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
- / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
- ) > 0
- and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
- for: 15m
- labels:
- context: committed-resource-syncer
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Committed Resource syncer repair rate >0%"
- description: >
- Some commitments have reservations that needed repair
- (wrong metadata like project ID or flavor group). This may indicate data
- corruption, bugs in reservation creation, or external modifications.
- Reservations are automatically repaired, but the root cause should be
- investigated if this alert persists.
-
- - alert: CortexNovaDoesntFindValidKVMHosts
- expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
- for: 5m
- labels:
- context: scheduling
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Nova scheduling cannot find valid KVM hosts"
- description: >
- Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling
- failed to find a valid `{{$labels.hvtype}}` host. This may indicate
- capacity issues, misconfigured filters, or resource constraints in the
- datacenter. Investigate the affected VMs and hypervisor availability.
-
- - alert: CortexNovaNewDatasourcesNotReconciling
- expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0
- for: 60m
- labels:
- context: datasources
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "New datasource `{{$labels.datasource}}` has not reconciled"
- description: >
- A new datasource `{{$labels.datasource}}` has been added but has not
- completed its first reconciliation yet. This may indicate issues with
- the datasource controller's workqueue overprioritizing other datasources.
-
- - alert: CortexNovaExistingDatasourcesLackingBehind
- expr: |
- sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600
- and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1
- for: 10m
- labels:
- context: datasources
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Existing datasource `{{$labels.datasource}}` is lacking behind"
- description: >
- An existing datasource `{{$labels.datasource}}` has been queued for
- reconciliation for more than 10 minutes. This may indicate issues with
- the datasource controller's workqueue or that this or another datasource
- is taking an unusually long time to reconcile.
-
- - alert: CortexNovaReconcileErrorsHigh
- expr: |
- (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m])))
- / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1
- for: 15m
- labels:
- context: controller-errors
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Controller reconcile error rate >10%"
- description: >
- More than 10% of controller reconciles are resulting in errors. This may
- indicate issues with the controller logic, connectivity problems, or
- external factors causing failures. Check the controller logs for error
- details and investigate the affected resources.
-
- - alert: CortexNovaReconcileDurationHigher10Min
- expr: |
- (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m])))
- / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600
- for: 15m
- labels:
- context: controller-duration
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})"
- description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}"
-
- - alert: CortexNovaWorkqueueNotDrained
- expr: |
- sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0
- for: 60m
- labels:
- context: controller-workqueue
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Controller {{ $labels.name }}'s backlog is not being drained."
- description: >
- The workqueue for controller {{ $labels.name }} has a backlog that is
- not being drained. This may indicate that the controller is overwhelmed
- with work or is stuck on certain resources. Check the controller logs
- and the state of the resources it manages for more details.
-
- - alert: CortexNovaWebhookLatencyHigh
- expr: |
- histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2
- for: 15m
- labels:
- context: controller-webhook
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Controller webhook {{ $labels.webhook }} latency is high"
- description: >
- The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms).
- This may indicate performance issues with the webhook server or the logic it executes.
- Check the webhook server logs and monitor its resource usage for more insights.
-
- - alert: CortexNovaWebhookErrorsHigh
- expr: |
- (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m])))
- / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1
- for: 15m
- labels:
- context: controller-webhook
- dashboard: cortex/cortex
- service: cortex
- severity: warning
- support_group: workload-management
- annotations:
- summary: "Controller webhook {{ $labels.webhook }} is experiencing errors"
- description: >
- The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes.
- This may indicate issues with the webhook logic, connectivity problems, or
- external factors causing failures. Check the webhook server logs for error
- details and investigate the affected resources.
\ No newline at end of file
+- name: cortex-placement-shim-alerts
+ rules: []
\ No newline at end of file
From efcfef53f4e567024a0c98cd37c9cb29edc6204e Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 13:20:52 +0200
Subject: [PATCH 4/6] PR feedback
---
.github/workflows/push-images.yaml | 7 ++++++-
cmd/shim/main.go | 9 ++++-----
helm/library/cortex-shim/templates/_helpers.tpl | 12 ++++++++----
helm/library/cortex-shim/templates/service.yaml | 4 ++--
helm/library/cortex-shim/values.yaml | 4 ++--
5 files changed, 22 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml
index 3085b503b..f3be685ce 100644
--- a/.github/workflows/push-images.yaml
+++ b/.github/workflows/push-images.yaml
@@ -81,6 +81,11 @@ jobs:
files: |
cmd/shim/**
internal/shim/**
+ api/**
+ pkg/**
+ go.mod
+ go.sum
+ Dockerfile
- name: Docker Meta (Cortex Shim)
if: steps.changed_shim_files.outputs.all_changed_files != ''
id: meta_cortex_shim
@@ -99,7 +104,7 @@ jobs:
id: push_cortex_shim
uses: docker/build-push-action@v7
with:
- context: cmd/shim
+ context: .
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta_cortex_shim.outputs.tags }}
diff --git a/cmd/shim/main.go b/cmd/shim/main.go
index d59490c3c..970c8c934 100644
--- a/cmd/shim/main.go
+++ b/cmd/shim/main.go
@@ -4,7 +4,6 @@
package main
import (
- "context"
"crypto/tls"
"errors"
"flag"
@@ -46,7 +45,7 @@ func init() {
}
func main() {
- ctx := context.Background()
+ ctx := ctrl.SetupSignalHandler()
restConfig := ctrl.GetConfigOrDie()
var metricsAddr string
@@ -110,7 +109,7 @@ func main() {
var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher
// Initial webhook TLS options
- webhookTLSOpts := tlsOpts
+ webhookTLSOpts := append([]func(*tls.Config){}, tlsOpts...)
if webhookCertPath != "" {
setupLog.Info("Initializing webhook certificate watcher using provided certificates",
@@ -142,7 +141,7 @@ func main() {
metricsServerOptions := metricsserver.Options{
BindAddress: metricsAddr,
SecureServing: secureMetrics,
- TLSOpts: tlsOpts,
+ TLSOpts: append([]func(*tls.Config){}, tlsOpts...),
}
if secureMetrics {
@@ -246,7 +245,7 @@ func main() {
}()
setupLog.Info("starting manager")
- if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+ if err := mgr.Start(ctx); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}
diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl
index 782e14eef..cca33d701 100644
--- a/helm/library/cortex-shim/templates/_helpers.tpl
+++ b/helm/library/cortex-shim/templates/_helpers.tpl
@@ -36,15 +36,19 @@ app.kubernetes.io/instance: {{ .Release.Name }}
{{- $hasMutating := false }}
{{- range . }}
{{- if eq .type "mutating" }}
- $hasMutating = true }}{{- end }}
+ {{- $hasMutating = true -}}
+ {{- end }}
+{{- end }}
+{{ $hasMutating }}
{{- end }}
-{{ $hasMutating }}}}{{- end }}
{{- define "chart.hasValidatingWebhooks" -}}
{{- $hasValidating := false }}
{{- range . }}
{{- if eq .type "validating" }}
- $hasValidating = true }}{{- end }}
+ {{- $hasValidating = true -}}
+ {{- end }}
+{{- end }}
+{{ $hasValidating }}
{{- end }}
-{{ $hasValidating }}}}{{- end }}
diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml
index 549ceed95..faf3082a3 100644
--- a/helm/library/cortex-shim/templates/service.yaml
+++ b/helm/library/cortex-shim/templates/service.yaml
@@ -12,7 +12,7 @@ spec:
protocol: TCP
name: api
selector:
- app.kubernetes.io/name: {{ include "chart.name" . }}
+ {{- include "chart.selectorLabels" . | nindent 4 }}
{{- if .Values.metrics.enable }}
---
apiVersion: v1
@@ -29,5 +29,5 @@ spec:
protocol: TCP
name: metrics
selector:
- app.kubernetes.io/name: {{ include "chart.name" . }}
+ {{- include "chart.selectorLabels" . | nindent 4 }}
{{- end }}
diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml
index 1c45c2542..1d1bc844c 100644
--- a/helm/library/cortex-shim/values.yaml
+++ b/helm/library/cortex-shim/values.yaml
@@ -40,8 +40,8 @@ deployment:
serviceAccountName: shim
# [METRICS]: Set to true to generate manifests for exporting metrics.
-# To disable metrics export set false, and ensure that the
-# ControllerManager argument "--metrics-bind-address=:8443" is removed.
+# To disable metrics export, set this to false and remove the container args
+# "--metrics-bind-address=:2112" and "--metrics-secure=false".
metrics:
enable: true
From 653cf8591e6ec3ea401a1e9c476fb996e5dafc0a Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 13:36:10 +0200
Subject: [PATCH 5/6] PR feedback
---
cmd/shim/main.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cmd/shim/main.go b/cmd/shim/main.go
index 970c8c934..9feea8d5f 100644
--- a/cmd/shim/main.go
+++ b/cmd/shim/main.go
@@ -170,7 +170,7 @@ func main() {
filepath.Join(metricsCertPath, metricsCertKey),
)
if err != nil {
- setupLog.Error(err, "to initialize metrics certificate watcher", "error", err)
+ setupLog.Error(err, "Failed to initialize metrics certificate watcher")
os.Exit(1)
}
From 88b2cb49ac0e481320725ed0c923547bf8c36eb9 Mon Sep 17 00:00:00 2001
From: Philipp Matthes
Date: Thu, 9 Apr 2026 13:42:55 +0200
Subject: [PATCH 6/6] 3 replicas by default
---
helm/library/cortex-shim/values.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml
index 1d1bc844c..63574fbe4 100644
--- a/helm/library/cortex-shim/values.yaml
+++ b/helm/library/cortex-shim/values.yaml
@@ -1,6 +1,6 @@
deployment:
enable: true
- replicas: 1
+ replicas: 3
container:
image:
repository: ghcr.io/cobaltcore-dev/cortex-shim