From 6771b30e17bd409c263519a76a58b006692b8090 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:27:28 +0200 Subject: [PATCH 1/6] Scaffold cortex placement api shim --- .github/workflows/push-charts.yaml | 19 - .github/workflows/push-images.yaml | 45 ++ .github/workflows/update-appversion.yml | 21 + .gitignore | 1 + AGENTS.md | 3 +- Dockerfile | 8 +- Tiltfile | 24 +- cmd/{ => manager}/main.go | 0 cmd/shim/main.go | 9 + helm/bundles/cortex-placement-shim/Chart.yaml | 20 + .../alerts/placement-shim.alerts.yaml | 734 ++++++++++++++++++ .../templates/alerts.yaml | 17 + .../templates/clusterrole.yaml | 23 + .../templates/clusterrolebinding.yaml | 14 + .../bundles/cortex-placement-shim/values.yaml | 23 + helm/library/cortex-shim/Chart.lock | 6 + helm/library/cortex-shim/Chart.yaml | 8 + .../cortex-shim/templates/_helpers.tpl | 50 ++ .../cortex-shim/templates/clusterrole.yaml | 100 +++ .../templates/clusterrolebinding.yaml | 34 + .../cortex-shim/templates/deployment.yaml | 112 +++ .../cortex-shim/templates/service.yaml | 33 + .../cortex-shim/templates/serviceaccount.yaml | 15 + .../cortex-shim/templates/servicemonitor.yaml | 16 + helm/library/cortex-shim/values.yaml | 68 ++ .../cortex/templates/manager/manager.yaml | 2 +- internal/shim/placement/.gitkeep | 0 27 files changed, 1379 insertions(+), 26 deletions(-) rename cmd/{ => manager}/main.go (100%) create mode 100644 cmd/shim/main.go create mode 100644 helm/bundles/cortex-placement-shim/Chart.yaml create mode 100644 helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrole.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml create mode 100644 helm/bundles/cortex-placement-shim/values.yaml create mode 100644 helm/library/cortex-shim/Chart.lock create mode 100644 helm/library/cortex-shim/Chart.yaml 
create mode 100644 helm/library/cortex-shim/templates/_helpers.tpl create mode 100644 helm/library/cortex-shim/templates/clusterrole.yaml create mode 100644 helm/library/cortex-shim/templates/clusterrolebinding.yaml create mode 100644 helm/library/cortex-shim/templates/deployment.yaml create mode 100644 helm/library/cortex-shim/templates/service.yaml create mode 100644 helm/library/cortex-shim/templates/serviceaccount.yaml create mode 100644 helm/library/cortex-shim/templates/servicemonitor.yaml create mode 100644 helm/library/cortex-shim/values.yaml create mode 100644 internal/shim/placement/.gitkeep diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml index 2e3577275..a4559d15a 100644 --- a/.github/workflows/push-charts.yaml +++ b/.github/workflows/push-charts.yaml @@ -27,25 +27,6 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Get all changed helm/library/cortex Chart.yaml files - id: changed-chart-yaml-files-core - uses: tj-actions/changed-files@v47 - with: - files: | - helm/library/cortex/Chart.yaml - - name: Push cortex core charts to registry - if: steps.changed-chart-yaml-files-core.outputs.all_changed_files != '' - shell: bash - env: - ALL_CHANGED_FILES: ${{ steps.changed-chart-yaml-files-core.outputs.all_changed_files }} - run: | - for CHART_FILE in ${ALL_CHANGED_FILES}; do - CHART_DIR=$(dirname $CHART_FILE) - helm package $CHART_DIR --dependency-update --destination $CHART_DIR - CHART_PACKAGE=$(ls $CHART_DIR/*.tgz) - helm push $CHART_PACKAGE oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/ - done - - name: Get all changed library Chart.yaml files id: changed-chart-yaml-files-library uses: tj-actions/changed-files@v47 diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 997595976..3085b503b 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -72,6 +72,50 @@ jobs: subject-digest: ${{ 
steps.push_cortex_postgres.outputs.digest }} push-to-registry: true + # Only build and push the cortex-shim image if there are changes related + # to the cortex shims (e.g., in cmd/shim or internal/shim). + - name: Get all changed shim/ files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + cmd/shim/** + internal/shim/** + - name: Docker Meta (Cortex Shim) + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: meta_cortex_shim + uses: docker/metadata-action@v6 + with: + images: ${{ env.REGISTRY }}/${{ github.repository }}-shim + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha + type=raw,value=latest + env: + DOCKER_METADATA_SHORT_SHA_LENGTH: 8 + - name: Build and Push Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: push_cortex_shim + uses: docker/build-push-action@v7 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta_cortex_shim.outputs.tags }} + labels: ${{ steps.meta_cortex_shim.outputs.labels }} + build-args: | + GIT_TAG=${{ github.ref_name }} + GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/shim/main.go + - name: Generate Artifact Attestation for Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + uses: actions/attest-build-provenance@v4 + with: + subject-name: ${{ env.REGISTRY }}/${{ github.repository }}-shim + subject-digest: ${{ steps.push_cortex_shim.outputs.digest }} + push-to-registry: true + # Build & push new cortex image - name: Docker Meta (Cortex) id: meta_cortex @@ -98,6 +142,7 @@ jobs: build-args: | GIT_TAG=${{ github.ref_name }} GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/manager/main.go - name: Generate Artifact Attestation for Cortex uses: actions/attest-build-provenance@v4 with: diff --git a/.github/workflows/update-appversion.yml b/.github/workflows/update-appversion.yml index cc5ccdc9f..20087fa80 100644 --- a/.github/workflows/update-appversion.yml +++ 
b/.github/workflows/update-appversion.yml @@ -44,6 +44,27 @@ jobs: git commit -m "Bump cortex-postgres chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" git push origin HEAD:main + # Only bumped if there are changes in shim-related directories + - name: Get all changed shim files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + internal/shim/** + cmd/shim/** + - name: Update appVersion in cortex-shim Chart.yaml + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-shim/Chart.yaml + - name: Commit and push changes for cortex-shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add helm/library/cortex-shim/Chart.yaml + git commit -m "Bump cortex-shim chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" + git push origin HEAD:main + - name: Update appVersion in helm/library/cortex/Chart.yaml run: | sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex/Chart.yaml diff --git a/.gitignore b/.gitignore index 04bac2d09..7e21248bc 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ cortex.secrets.yaml !.editorconfig !.gitignore !.github +!.gitkeep !.golangci.yaml !.license-scan-overrides.jsonl !.license-scan-rules.json diff --git a/AGENTS.md b/AGENTS.md index 6f2e12a17..59747bd8c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,7 +50,8 @@ Helm charts: ## Repository Structure Code: -- `cmd/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/manager/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/shim/main.go` is the entry point for cortex shims exposing cortex capabilities 
over REST endpoints - `api/v1alpha1` is where the CRD specs of cortex lives - `api/external` contains messages sent to cortex via http from external openstack services - `internal/scheduling` contains the logic for scheduling in different cloud domains diff --git a/Dockerfile b/Dockerfile index 6f7e79bea..2580e9637 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ ARG TARGETARCH ARG GO_MOD_PATH=. ARG GOCACHE=/root/.cache/go-build ENV GOCACHE=${GOCACHE} +ARG GOMAIN=cmd/manager/main.go +ENV GOMAIN=${GOMAIN} # Note: avoid using COPY to /lib which will lead to docker build errors. WORKDIR /workspace/${GO_MOD_PATH} @@ -29,13 +31,13 @@ ENV GOOS=${TARGETOS:-linux} ENV GOARCH=${TARGETARCH} RUN --mount=type=cache,target=/go/pkg/mod/ \ --mount=type=cache,target=${GOCACHE} \ - go build -a -o /manager cmd/main.go + go build -a -o /main ${GOMAIN} # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details FROM gcr.io/distroless/static:nonroot WORKDIR / -COPY --from=builder /manager . +COPY --from=builder /main . USER 65532:65532 -ENTRYPOINT ["/manager"] +ENTRYPOINT ["/main"] diff --git a/Tiltfile b/Tiltfile index 6871d18b3..bc87f4d30 100644 --- a/Tiltfile +++ b/Tiltfile @@ -7,7 +7,10 @@ analytics_settings(False) # Use the ACTIVE_DEPLOYMENTS env var to select which Cortex bundles to deploy. 
-ACTIVE_DEPLOYMENTS_ENV = os.getenv('ACTIVE_DEPLOYMENTS', 'nova,manila,cinder,ironcore,pods') +ACTIVE_DEPLOYMENTS_ENV = os.getenv( + 'ACTIVE_DEPLOYMENTS', + 'nova,manila,cinder,ironcore,pods,placement', +) if ACTIVE_DEPLOYMENTS_ENV == "": ACTIVE_DEPLOYMENTS = [] # Catch "".split(",") = [""] else: @@ -78,13 +81,22 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml' local('curl -L ' + url + ' | kubectl apply -f -') -########### Cortex Operator & CRDs +########### Cortex Manager & CRDs docker_build('ghcr.io/cobaltcore-dev/cortex', '.', dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/manager/main.go'}, only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], ) local('sh helm/sync.sh helm/library/cortex') +########### Cortex Shim +docker_build('ghcr.io/cobaltcore-dev/cortex-shim', '.', + dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/shim/main.go'}, + only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], +) +local('sh helm/sync.sh helm/library/cortex-shim') + ########### Cortex Bundles docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres') @@ -98,6 +110,7 @@ bundle_charts = [ ('helm/bundles/cortex-cinder', 'cortex-cinder'), ('helm/bundles/cortex-ironcore', 'cortex-ironcore'), ('helm/bundles/cortex-pods', 'cortex-pods'), + ('helm/bundles/cortex-placement-shim', 'cortex-placement-shim'), ] dep_charts = { 'cortex-crds': [ @@ -123,6 +136,9 @@ dep_charts = { ('helm/library/cortex-postgres', 'cortex-postgres'), ('helm/library/cortex', 'cortex'), ], + 'cortex-placement-shim': [ + ('helm/library/cortex-shim', 'cortex-shim'), + ], } for (bundle_chart_path, bundle_chart_name) in bundle_charts: @@ -255,6 +271,10 @@ if 'pods' in ACTIVE_DEPLOYMENTS: k8s_yaml('samples/pods/pod.yaml') k8s_resource('test-pod', 
labels=['Cortex-Pods']) +if 'placement' in ACTIVE_DEPLOYMENTS: + print("Activating Cortex Placement Shim bundle") + k8s_yaml(helm('./helm/bundles/cortex-placement-shim', name='cortex-placement-shim', values=tilt_values, set=env_set_overrides)) + ########### Dev Dependencies local('sh helm/sync.sh helm/dev/cortex-prometheus-operator') k8s_yaml(helm('./helm/dev/cortex-prometheus-operator', name='cortex-prometheus-operator')) # Operator diff --git a/cmd/main.go b/cmd/manager/main.go similarity index 100% rename from cmd/main.go rename to cmd/manager/main.go diff --git a/cmd/shim/main.go b/cmd/shim/main.go new file mode 100644 index 000000000..6b0634229 --- /dev/null +++ b/cmd/shim/main.go @@ -0,0 +1,9 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package main + +func main() { + // TODO: this needs scaffolding, for now it just does nothing. + select {} +} diff --git a/helm/bundles/cortex-placement-shim/Chart.yaml b/helm/bundles/cortex-placement-shim/Chart.yaml new file mode 100644 index 000000000..7f53ed347 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: cortex-placement-shim +description: A Helm chart deploying the Cortex placement shim. +type: application +version: 0.0.1 +appVersion: 0.1.0 +dependencies: + # from: file://../../library/cortex-shim + - name: cortex-shim + repository: oci://ghcr.io/cobaltcore-dev/cortex/charts + version: 0.0.1 + # Owner info adds a configmap to the kubernetes cluster with information on + # the service owner. This makes it easier to find out who to contact in case + # of issues. 
See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info + - name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml new file mode 100644 index 000000000..41bf29794 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -0,0 +1,734 @@ +groups: +- name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: critical + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + for: 5m + labels: + context: descheduler + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. 
+ + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much memory" + description: > + `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much CPU" + description: > + `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is trying to connect to the database too often" + description: > + `{{$labels.component}}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` Sync not successful" + description: > + `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" + description: > + `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. 
+ + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
+ + # Committed Resource Info API Alerts + - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API HTTP 500 errors too high" + description: > + The committed resource info API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems building service info, + such as invalid flavor group data. Limes will not be able to discover available + resources until the issue is resolved. + + # Committed Resource Change API Alerts + - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 400 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may happen when Limes sends a request with + an outdated info version (409), the API is temporarily unavailable, + or the request format is invalid. Limes will typically retry these + requests, so no immediate action is needed unless the errors persist. 
+ + - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 500 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This is not expected and indicates that Cortex + is having an internal problem processing commitment changes. Limes will + continue to retry, but new commitments may not be fulfilled until the + issue is resolved. + + - alert: CortexNovaCommittedResourceLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API latency too high" + description: > + The committed resource change API (Limes LIQUID integration) is experiencing + high latency (p95 > 30s). This may indicate that the scheduling pipeline + is under heavy load or that reservation scheduling is taking longer than + expected. Limes requests may time out, causing commitment changes to fail. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high" + description: > + More than 50% of commitment change requests are being rejected. + This may indicate insufficient capacity in the datacenter to fulfill + new commitments, or issues with the commitment scheduling logic. + Rejected commitments are rolled back, so Limes will see them as failed + and may retry or report the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API timeouts too high" + description: > + The committed resource change API (Limes LIQUID integration) timed out + while waiting for reservations to become ready. This indicates that the + scheduling pipeline is overloaded or reservations are taking too long + to be scheduled. Affected commitment changes are rolled back and Limes + will see them as failed. Consider investigating the scheduler performance + or increasing the timeout configuration. 
+ + # Committed Resource Usage API Alerts + - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 400 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate invalid project IDs or malformed + requests from Limes. Limes will typically retry these requests. + + - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 500 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems fetching reservations + or Nova server data. Limes may receive stale or incomplete usage data. + + - alert: CortexNovaCommittedResourceUsageLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API latency too high" + description: > + The committed resource usage API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). 
This may indicate slow Nova API responses or + database queries. Limes scrapes may time out, affecting quota reporting. + + # Committed Resource Capacity API Alerts + - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 400 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate malformed requests from Limes. + + - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 500 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems calculating cluster + capacity. Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API latency too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). This may indicate slow database queries or knowledge + CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
+ + - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh + expr: | + ( + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) + ) > 0.05 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unit mismatch rate >5%" + description: > + More than 5% of commitments are being skipped due to unit mismatches between + Limes and Cortex flavor groups. This happens when Limes has not yet been + updated to use the new unit format after a flavor group change. The affected + commitments will keep their existing reservations until Limes notices the update. + Check the logs if this error persists for longer time. + + - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh + expr: | + ( + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) + ) > 0 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unknown flavor group rate >0%" + description: > + Some commitments reference flavor groups that don't exist in + Cortex Knowledge (anymore). This may indicate that flavor group configuration is + out of sync between Limes and Cortex, or that Knowledge extraction is failing. 
+ Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0.01 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. 
+ + - alert: CortexNovaCommittedResourceSyncerRepairRateHigh + expr: | + ( + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer repair rate >0%" + description: > + Some commitments have reservations that needed repair + (wrong metadata like project ID or flavor group). This may indicate data + corruption, bugs in reservation creation, or external modifications. + Reservations are automatically repaired, but the root cause should be + investigated if this alert persists. + + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling + failed to find a valid `{{$labels.hvtype}}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. 
+ + - alert: CortexNovaNewDatasourcesNotReconciling + expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 + for: 60m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "New datasource `{{$labels.datasource}}` has not reconciled" + description: > + A new datasource `{{$labels.datasource}}` has been added but has not + completed its first reconciliation yet. This may indicate issues with + the datasource controller's workqueue overprioritizing other datasources. + + - alert: CortexNovaExistingDatasourcesLackingBehind + expr: | + sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 + and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 + for: 10m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" + description: > + An existing datasource `{{$labels.datasource}}` has been queued for + reconciliation for more than 10 minutes. This may indicate issues with + the datasource controller's workqueue or that this or another datasource + is taking an unusually long time to reconcile. + + - alert: CortexNovaReconcileErrorsHigh + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-errors + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller reconcile error rate >10%" + description: > + More than 10% of controller reconciles are resulting in errors. 
This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources. + + - alert: CortexNovaReconcileDurationHigher10Min + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 + for: 15m + labels: + context: controller-duration + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" + description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller {{ $labels.name }}'s backlog is not being drained." + description: > + The workqueue for controller {{ $labels.name }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details. 
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} latency is high" + description: > + The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" + description: > + The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
\ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml new file mode 100644 index 000000000..7db3b96e6 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml @@ -0,0 +1,17 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.alerts.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cortex-placement-shim-alerts + labels: + type: alerting-rules + prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} +spec: + {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} + {{- range $path, $file := $files }} + {{ $file | toString | nindent 2 }} + {{- end }} +{{- end }} diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml new file mode 100644 index 000000000..489878c89 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: cortex-placement-shim-role-hypervisor +rules: +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors/status + verbs: + - get \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..0388373f9 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: cortex-placement-shim-rolebinding-hypervisor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cortex-placement-shim-role-hypervisor +subjects: +- kind: ServiceAccount + name: cortex-placement-shim + namespace: {{ .Release.Namespace }} \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml new file mode 100644 index 000000000..40aa9cb11 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -0,0 +1,23 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +owner-info: + enabled: true + helm-chart-url: "https://github.com/cobaltcore-dev/cortex/helm/bundles/cortex-placement-shim" + maintainers: + - "arno.uhlig@sap.com" + - "julius.clausnitzer@sap.com" + - "malte.viering@sap.com" + - "marcel.gute@sap.com" + - "markus.wieland@sap.com" + - "p.matthes@sap.com" + support-group: "workload-management" + service: "cortex-placement-shim" + +alerts: + enabled: true + prometheus: openstack + +cortex-shim: + namePrefix: cortex-placement + conf: {} # TODO diff --git a/helm/library/cortex-shim/Chart.lock b/helm/library/cortex-shim/Chart.lock new file mode 100644 index 000000000..db4c5823b --- /dev/null +++ b/helm/library/cortex-shim/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 +digest: sha256:7643f231cc4ebda347fd12ec62fe4445c280e2b71d27eec555f3025290f5038f +generated: "2025-08-26T10:55:05.888651+02:00" diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml new file mode 100644 index 000000000..5282dc655 --- /dev/null +++ b/helm/library/cortex-shim/Chart.yaml @@ -0,0 +1,8 @@ +apiVersion: v2 +name: cortex-shim +description: A Helm chart to distribute cortex shims. 
+type: application
+version: 0.0.1
+appVersion: "sha-3e56acea"
+icon: "https://example.com/icon.png"
+dependencies: []
diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl
new file mode 100644
index 000000000..782e14eef
--- /dev/null
+++ b/helm/library/cortex-shim/templates/_helpers.tpl
@@ -0,0 +1,50 @@
+{{- define "chart.name" -}}
+{{- if .Chart }}
+  {{- if .Chart.Name }}
+    {{- .Chart.Name | trunc 63 | trimSuffix "-" }}
+  {{- else if .Values.nameOverride }}
+    {{ .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+  {{- else }}
+    scheduling
+  {{- end }}
+{{- else }}
+  scheduling
+{{- end }}
+{{- end }}
+
+
+{{- define "chart.labels" -}}
+{{- if .Chart.AppVersion -}}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- if .Chart.Version }}
+helm.sh/chart: {{ .Chart.Version | quote }}
+{{- end }}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+
+{{- define "chart.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+
+{{- define "chart.hasMutatingWebhooks" -}}
+{{- $hasMutating := false }}
+{{- range . }}
+  {{- if eq .type "mutating" }}
+    {{- $hasMutating = true }}{{- end }}
+{{- end }}
+{{- $hasMutating -}}{{- end }}
+
+
+{{- define "chart.hasValidatingWebhooks" -}}
+{{- $hasValidating := false }}
+{{- range . }}
+  {{- if eq .type "validating" }}
+    {{- $hasValidating = true }}{{- end }}
+{{- end }}
+{{- $hasValidating -}}{{- end }}
diff --git a/helm/library/cortex-shim/templates/clusterrole.yaml b/helm/library/cortex-shim/templates/clusterrole.yaml
new file mode 100644
index 000000000..74f8e7ad4
--- /dev/null
+++ b/helm/library/cortex-shim/templates/clusterrole.yaml
@@ -0,0 +1,100 @@
+# Roles that grant the shims access to cortex crds.
+{{- if .Values.rbac.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-role +rules: +- apiGroups: + - cortex.cloud + resources: + - knowledges + - datasources + - reservations + - decisions + - deschedulings + - pipelines + - kpis + - histories + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cortex.cloud + resources: + - knowledges/finalizers + - datasources/finalizers + - reservations/finalizers + - decisions/finalizers + - deschedulings/finalizers + - pipelines/finalizers + - kpis/finalizers + - histories/finalizers + verbs: + - update +- apiGroups: + - cortex.cloud + resources: + - knowledges/status + - datasources/status + - reservations/status + - decisions/status + - deschedulings/status + - pipelines/status + - kpis/status + - histories/status + verbs: + - get + - patch + - update +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - create + - patch +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-reader +rules: +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/clusterrolebinding.yaml b/helm/library/cortex-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..ca82a0119 --- /dev/null +++ b/helm/library/cortex-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,34 @@ +{{- if .Values.rbac.enable }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-shim-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-metrics-auth-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/deployment.yaml b/helm/library/cortex-shim/templates/deployment.yaml new file mode 100644 index 000000000..b38eb3c02 --- /dev/null +++ b/helm/library/cortex-shim/templates/deployment.yaml @@ -0,0 +1,112 @@ +# This file is safe from kubebuilder edit --plugins=helm/v1-alpha +# If you want to re-generate, add the --force flag. 
+ +{{- if .Values.deployment.enable }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Values.namePrefix }}-shim + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: shim + labels: + {{- include "chart.labels" . | nindent 8 }} + {{- if and .Values.deployment.pod .Values.deployment.pod.labels }} + {{- range $key, $value := .Values.deployment.pod.labels }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + spec: + containers: + - name: shim + args: + {{- range .Values.deployment.container.args }} + - {{ . }} + {{- end }} + ports: + - name: api + containerPort: 8080 + protocol: TCP + - name: metrics + containerPort: 2112 + protocol: TCP + command: + - /main + image: {{ .Values.deployment.container.image.repository }}:{{ .Values.deployment.container.image.tag | default .Chart.AppVersion }} + {{- if .Values.deployment.container.image.pullPolicy }} + imagePullPolicy: {{ .Values.deployment.container.image.pullPolicy }} + {{- end }} + {{- if .Values.deployment.container.env }} + env: + {{- range $key, $value := .Values.deployment.container.env }} + - name: {{ $key }} + value: {{ $value }} + {{- end }} + {{- end }} + livenessProbe: + {{- toYaml .Values.deployment.container.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.deployment.container.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.deployment.container.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.deployment.container.securityContext | nindent 12 }} + volumeMounts: + - name: shim-config-volume + mountPath: /etc/config + - name: shim-secrets-volume + mountPath: /etc/secrets + readOnly: true + securityContext: + {{- toYaml .Values.deployment.securityContext | nindent 8 }} + serviceAccountName: 
{{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + terminationGracePeriodSeconds: {{ .Values.deployment.terminationGracePeriodSeconds }} + volumes: + # Custom values to configure the shim. + - name: shim-config-volume + configMap: + name: {{ .Values.namePrefix }}-shim-config + - name: shim-secrets-volume + secret: + secretName: {{ .Values.namePrefix }}-shim-secrets +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.namePrefix }}-shim-config +data: + conf.json: |- + {{- $mergedConf := dict }} + {{- if .Values.global.conf }} + {{- $mergedConf = .Values.global.conf }} + {{- end }} + {{- if .Values.conf }} + {{- $mergedConf = mergeOverwrite .Values.conf $mergedConf }} + {{- end }} + {{ toJson $mergedConf }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.namePrefix }}-shim-secrets +type: Opaque +data: + secrets.json: |- + {{- $mergedSecrets := dict }} + {{- if .Values.global.secrets }} + {{- $mergedSecrets = .Values.global.secrets }} + {{- end }} + {{- if .Values.secrets }} + {{- $mergedSecrets = mergeOverwrite .Values.secrets $mergedSecrets }} + {{- end }} + {{ toJson $mergedSecrets | b64enc }} +{{- end }} \ No newline at end of file diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml new file mode 100644 index 000000000..549ceed95 --- /dev/null +++ b/helm/library/cortex-shim/templates/service.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + ports: + - port: 8080 + targetPort: api + protocol: TCP + name: api + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- if .Values.metrics.enable }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-metrics-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . 
| nindent 4 }} +spec: + ports: + - port: 2112 + targetPort: metrics + protocol: TCP + name: metrics + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- end }} diff --git a/helm/library/cortex-shim/templates/serviceaccount.yaml b/helm/library/cortex-shim/templates/serviceaccount.yaml new file mode 100644 index 000000000..ea0789dd0 --- /dev/null +++ b/helm/library/cortex-shim/templates/serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if .Values.rbac.enable }} +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + {{- if and .Values.deployment.serviceAccount .Values.deployment.serviceAccount.annotations }} + annotations: + {{- range $key, $value := .Values.deployment.serviceAccount.annotations }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} diff --git a/helm/library/cortex-shim/templates/servicemonitor.yaml b/helm/library/cortex-shim/templates/servicemonitor.yaml new file mode 100644 index 000000000..803e66dd5 --- /dev/null +++ b/helm/library/cortex-shim/templates/servicemonitor.yaml @@ -0,0 +1,16 @@ +# To integrate with Prometheus. +{{- if .Values.prometheus.enable }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-metrics-monitor + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - port: metrics + selector: + matchLabels: + app.kubernetes.io/name: {{ include "chart.name" . 
}} +{{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml new file mode 100644 index 000000000..6434e823a --- /dev/null +++ b/helm/library/cortex-shim/values.yaml @@ -0,0 +1,68 @@ +deployment: + enable: true + replicas: 1 + container: + image: + repository: ghcr.io/cobaltcore-dev/cortex-shim + args: + - "--metrics-bind-address=:2112" + - "--health-probe-bind-address=:8081" + - "--metrics-secure=false" + resources: + limits: + cpu: 500m + memory: 2048Mi + requests: + cpu: 10m + memory: 64Mi + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 20 + httpGet: + path: /healthz + port: 8081 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + httpGet: + path: /readyz + port: 8081 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationGracePeriodSeconds: 10 + serviceAccountName: shim + +# [METRICS]: Set to true to generate manifests for exporting metrics. +# To disable metrics export set false, and ensure that the +# ControllerManager argument "--metrics-bind-address=:8443" is removed. +metrics: + enable: true + +# [RBAC]: To enable RBAC (Permissions) configurations +rbac: + enable: true + +# [PROMETHEUS]: To enable a ServiceMonitor to export metrics to Prometheus set true +prometheus: + enable: true + +global: + conf: {} + +# Use this to unambiguate multiple cortex deployments in the same cluster. +namePrefix: cortex +conf: + # The scheduling domain this operator is responsible for. + schedulingDomain: cortex + # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) + leaderElectionID: cortex-unknown + enabledControllers: + # The explanation controller is available for all decision resources. 
+ - explanation-controller diff --git a/helm/library/cortex/templates/manager/manager.yaml b/helm/library/cortex/templates/manager/manager.yaml index 73672164f..0c9f362aa 100644 --- a/helm/library/cortex/templates/manager/manager.yaml +++ b/helm/library/cortex/templates/manager/manager.yaml @@ -51,7 +51,7 @@ spec: protocol: TCP {{- end }} command: - - /manager + - /main image: {{ .Values.controllerManager.container.image.repository }}:{{ .Values.controllerManager.container.image.tag | default .Chart.AppVersion }} {{- if .Values.controllerManager.container.image.pullPolicy }} imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }} diff --git a/internal/shim/placement/.gitkeep b/internal/shim/placement/.gitkeep new file mode 100644 index 000000000..e69de29bb From b55ca3c98c694f8fc6d291a0e5b72ef3e99e987f Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:56:31 +0200 Subject: [PATCH 2/6] Add monitoring labels and scaffold manager (w/o leader election) --- cmd/shim/main.go | 248 +++++++++++++++++- .../bundles/cortex-placement-shim/values.yaml | 6 +- helm/library/cortex-shim/values.yaml | 9 +- 3 files changed, 252 insertions(+), 11 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 6b0634229..d59490c3c 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -3,7 +3,251 @@ package main +import ( + "context" + "crypto/tls" + "errors" + "flag" + "net/http" + "os" + "path/filepath" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/monitoring" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/sapcc/go-bits/httpext" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/certwatcher" + "sigs.k8s.io/controller-runtime/pkg/healthz" + 
"sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +var ( + // Scheme defines the scheme for the API types used by the shim. + scheme = runtime.NewScheme() + // setupLog is the logger used for setup operations in the shim. + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + // Bind the Kubernetes client-go scheme and the custom API types to the + // scheme used by the shim. + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha1.AddToScheme(scheme)) // Cortex crds + utilruntime.Must(hv1.AddToScheme(scheme)) // Hypervisor crd +} + func main() { - // TODO: this needs scaffolding, for now it just does nothing. - select {} + ctx := context.Background() + restConfig := ctrl.GetConfigOrDie() + + var metricsAddr string + var metricsCertPath, metricsCertName, metricsCertKey string + var webhookCertPath, webhookCertName, webhookCertKey string + // The shim does not require leader election, but this flag is provided to + // stay consistent with the kubebuilder scaffold. + var enableLeaderElection bool + var probeAddr string + var secureMetrics bool + var enableHTTP2 bool + var tlsOpts []func(*tls.Config) + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&enableLeaderElection, "leader-elect", false, + "Enable leader election for controller manager. "+ + "Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. 
Use --metrics-secure=false to use HTTP instead.") + flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") + flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") + flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") + flag.StringVar(&metricsCertPath, "metrics-cert-path", "", + "The directory that contains the metrics server certificate.") + flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") + flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") + flag.BoolVar(&enableHTTP2, "enable-http2", false, + "If set, HTTP/2 will be enabled for the metrics and webhook servers") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + // Check that we're really running this shim without leader election enabled. + if enableLeaderElection { + err := errors.New("leader election should not be enabled for the shim") + setupLog.Error(err, "invalid configuration") + os.Exit(1) + } + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // if the enable-http2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. 
For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + if !enableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + // Create watchers for metrics and webhooks certificates + var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher + + // Initial webhook TLS options + webhookTLSOpts := tlsOpts + + if webhookCertPath != "" { + setupLog.Info("Initializing webhook certificate watcher using provided certificates", + "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) + + var err error + webhookCertWatcher, err = certwatcher.New( + filepath.Join(webhookCertPath, webhookCertName), + filepath.Join(webhookCertPath, webhookCertKey), + ) + if err != nil { + setupLog.Error(err, "Failed to initialize webhook certificate watcher") + os.Exit(1) + } + + webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { + config.GetCertificate = webhookCertWatcher.GetCertificate + }) + } + + webhookServer := webhook.NewServer(webhook.Options{ + TLSOpts: webhookTLSOpts, + }) + + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. 
More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization + } + + // If the certificate is not specified, controller-runtime will automatically + // generate self-signed certificates for the metrics server. While convenient for development and testing, + // this setup is not recommended for production. + // + // If you enable certManager, uncomment the following lines: + // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates + // managed by cert-manager for the metrics server. + // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. + if metricsCertPath != "" { + setupLog.Info("Initializing metrics certificate watcher using provided certificates", + "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) + + var err error + metricsCertWatcher, err = certwatcher.New( + filepath.Join(metricsCertPath, metricsCertName), + filepath.Join(metricsCertPath, metricsCertKey), + ) + if err != nil { + setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + os.Exit(1) + } + + metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { + config.GetCertificate = metricsCertWatcher.GetCertificate + }) + } + + mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ + Scheme: scheme, + Metrics: metricsServerOptions, + WebhookServer: webhookServer, + HealthProbeBindAddress: probeAddr, + // Kept for consistency with kubebuilder scaffold, but the shim should + // always run with leader election disabled. + LeaderElection: enableLeaderElection, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + // TODO: Initialize multicluster client here. 
+ + // Our custom monitoring registry can add prometheus labels to all metrics. + // This is useful to distinguish metrics from different deployments. + metricsConfig := conf.GetConfigOrDie[monitoring.Config]() + metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig) + + // API endpoint. + mux := http.NewServeMux() + + // +kubebuilder:scaffold:builder + + if metricsCertWatcher != nil { + setupLog.Info("Adding metrics certificate watcher to manager") + if err := mgr.Add(metricsCertWatcher); err != nil { + setupLog.Error(err, "unable to add metrics certificate watcher to manager") + os.Exit(1) + } + } + + if webhookCertWatcher != nil { + setupLog.Info("Adding webhook certificate watcher to manager") + if err := mgr.Add(webhookCertWatcher); err != nil { + setupLog.Error(err, "unable to add webhook certificate watcher to manager") + os.Exit(1) + } + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) + } + + errchan := make(chan error) + go func() { + errchan <- func() error { + setupLog.Info("starting api server", "address", ":8080") + return httpext.ListenAndServeContext(ctx, ":8080", mux) + }() + }() + go func() { + if err := <-errchan; err != nil { + setupLog.Error(err, "problem running api server") + os.Exit(1) + } + }() + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } } diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml index 40aa9cb11..6dd793653 100644 --- a/helm/bundles/cortex-placement-shim/values.yaml +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -20,4 +20,8 @@ alerts: cortex-shim: namePrefix: cortex-placement - conf: {} # TODO + 
conf: + monitoring: + labels: + github_org: cobaltcore-dev + github_repo: cortex diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 6434e823a..1c45c2542 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -58,11 +58,4 @@ global: # Use this to unambiguate multiple cortex deployments in the same cluster. namePrefix: cortex -conf: - # The scheduling domain this operator is responsible for. - schedulingDomain: cortex - # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) - leaderElectionID: cortex-unknown - enabledControllers: - # The explanation controller is available for all decision resources. - - explanation-controller +conf: {} # No config for now that's needed by all the shims. From 5bd2e0491899ea8f3e44ada30ba2b3878ba0b93d Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 12:01:19 +0200 Subject: [PATCH 3/6] Remove alerts --- .../alerts/placement-shim.alerts.yaml | 735 +----------------- 1 file changed, 2 insertions(+), 733 deletions(-) diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml index 41bf29794..03aea7763 100644 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -1,734 +1,3 @@ groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. 
Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. - - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. 
This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. 
- - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. 
- - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - # Committed Resource Info API Alerts - - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource info API HTTP 500 errors too high" - description: > - The committed resource info API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems building service info, - such as invalid flavor group data. Limes will not be able to discover available - resources until the issue is resolved. - - # Committed Resource Change API Alerts - - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 400 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 4xx errors. 
This may happen when Limes sends a request with - an outdated info version (409), the API is temporarily unavailable, - or the request format is invalid. Limes will typically retry these - requests, so no immediate action is needed unless the errors persist. - - - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 500 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This is not expected and indicates that Cortex - is having an internal problem processing commitment changes. Limes will - continue to retry, but new commitments may not be fulfilled until the - issue is resolved. - - - alert: CortexNovaCommittedResourceLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API latency too high" - description: > - The committed resource change API (Limes LIQUID integration) is experiencing - high latency (p95 > 30s). This may indicate that the scheduling pipeline - is under heavy load or that reservation scheduling is taking longer than - expected. Limes requests may time out, causing commitment changes to fail. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource rejection rate too high" - description: > - More than 50% of commitment change requests are being rejected. - This may indicate insufficient capacity in the datacenter to fulfill - new commitments, or issues with the commitment scheduling logic. - Rejected commitments are rolled back, so Limes will see them as failed - and may retry or report the failure to users. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API timeouts too high" - description: > - The committed resource change API (Limes LIQUID integration) timed out - while waiting for reservations to become ready. This indicates that the - scheduling pipeline is overloaded or reservations are taking too long - to be scheduled. Affected commitment changes are rolled back and Limes - will see them as failed. Consider investigating the scheduler performance - or increasing the timeout configuration. 
- - # Committed Resource Usage API Alerts - - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 400 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate invalid project IDs or malformed - requests from Limes. Limes will typically retry these requests. - - - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 500 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems fetching reservations - or Nova server data. Limes may receive stale or incomplete usage data. - - - alert: CortexNovaCommittedResourceUsageLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API latency too high" - description: > - The committed resource usage API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). 
This may indicate slow Nova API responses or - database queries. Limes scrapes may time out, affecting quota reporting. - - # Committed Resource Capacity API Alerts - - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 400 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate malformed requests from Limes. - - - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 500 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems calculating cluster - capacity. Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API latency too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). This may indicate slow database queries or knowledge - CRD retrieval. Limes scrapes may time out, affecting capacity reporting. - - # Committed Resource Syncer Alerts - - alert: CortexNovaCommittedResourceSyncerErrorsHigh - expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 - for: 5m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer experiencing errors" - description: > - The committed resource syncer has encountered multiple errors in the last hour. - This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
- - - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0.05 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unit mismatch rate >5%" - description: > - More than 5% of commitments are being skipped due to unit mismatches between - Limes and Cortex flavor groups. This happens when Limes has not yet been - updated to use the new unit format after a flavor group change. The affected - commitments will keep their existing reservations until Limes notices the update. - Check the logs if this error persists for longer time. - - - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unknown flavor group rate >0%" - description: > - Some commitments reference flavor groups that don't exist in - Cortex Knowledge (anymore). This may indicate that flavor group configuration is - out of sync between Limes and Cortex, or that Knowledge extraction is failing. 
- Check the flavor group Knowledge CRD and history to see what was changed. - - - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh - expr: | - ( - ( - rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0.01 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer local change rate >1%" - description: > - More than 1% of synced commitments are requiring reservation changes - (creates, deletes, or repairs). This is higher than expected for steady-state - operation and may indicate data inconsistencies, external modifications to - reservations, or issues with the CRDs. Check Cortex logs for details. 
- - - alert: CortexNovaCommittedResourceSyncerRepairRateHigh - expr: | - ( - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer repair rate >0%" - description: > - Some commitments have reservations that needed repair - (wrong metadata like project ID or flavor group). This may indicate data - corruption, bugs in reservation creation, or external modifications. - Reservations are automatically repaired, but the root cause should be - investigated if this alert persists. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. - - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. 
This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. 
- - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
\ No newline at end of file +- name: cortex-placement-shim-alerts + rules: [] \ No newline at end of file From efcfef53f4e567024a0c98cd37c9cb29edc6204e Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:20:52 +0200 Subject: [PATCH 4/6] PR feedback --- .github/workflows/push-images.yaml | 7 ++++++- cmd/shim/main.go | 9 ++++----- helm/library/cortex-shim/templates/_helpers.tpl | 12 ++++++++---- helm/library/cortex-shim/templates/service.yaml | 4 ++-- helm/library/cortex-shim/values.yaml | 4 ++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 3085b503b..f3be685ce 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -81,6 +81,11 @@ jobs: files: | cmd/shim/** internal/shim/** + api/** + pkg/** + go.mod + go.sum + Dockerfile - name: Docker Meta (Cortex Shim) if: steps.changed_shim_files.outputs.all_changed_files != '' id: meta_cortex_shim @@ -99,7 +104,7 @@ jobs: id: push_cortex_shim uses: docker/build-push-action@v7 with: - context: cmd/shim + context: . platforms: linux/amd64,linux/arm64 push: true tags: ${{ steps.meta_cortex_shim.outputs.tags }} diff --git a/cmd/shim/main.go b/cmd/shim/main.go index d59490c3c..970c8c934 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -4,7 +4,6 @@ package main import ( - "context" "crypto/tls" "errors" "flag" @@ -46,7 +45,7 @@ func init() { } func main() { - ctx := context.Background() + ctx := ctrl.SetupSignalHandler() restConfig := ctrl.GetConfigOrDie() var metricsAddr string @@ -110,7 +109,7 @@ func main() { var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher // Initial webhook TLS options - webhookTLSOpts := tlsOpts + webhookTLSOpts := append([]func(*tls.Config){}, tlsOpts...) 
if webhookCertPath != "" { setupLog.Info("Initializing webhook certificate watcher using provided certificates", @@ -142,7 +141,7 @@ func main() { metricsServerOptions := metricsserver.Options{ BindAddress: metricsAddr, SecureServing: secureMetrics, - TLSOpts: tlsOpts, + TLSOpts: append([]func(*tls.Config){}, tlsOpts...), } if secureMetrics { @@ -246,7 +245,7 @@ func main() { }() setupLog.Info("starting manager") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) } diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl index 782e14eef..cca33d701 100644 --- a/helm/library/cortex-shim/templates/_helpers.tpl +++ b/helm/library/cortex-shim/templates/_helpers.tpl @@ -36,15 +36,19 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- $hasMutating := false }} {{- range . }} {{- if eq .type "mutating" }} - $hasMutating = true }}{{- end }} + {{- $hasMutating = true -}} + {{- end }} +{{- end }} +{{ $hasMutating }} {{- end }} -{{ $hasMutating }}}}{{- end }} {{- define "chart.hasValidatingWebhooks" -}} {{- $hasValidating := false }} {{- range . }} {{- if eq .type "validating" }} - $hasValidating = true }}{{- end }} + {{- $hasValidating = true -}} + {{- end }} +{{- end }} +{{ $hasValidating }} {{- end }} -{{ $hasValidating }}}}{{- end }} diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml index 549ceed95..faf3082a3 100644 --- a/helm/library/cortex-shim/templates/service.yaml +++ b/helm/library/cortex-shim/templates/service.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP name: api selector: - app.kubernetes.io/name: {{ include "chart.name" . }} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- if .Values.metrics.enable }} --- apiVersion: v1 @@ -29,5 +29,5 @@ spec: protocol: TCP name: metrics selector: - app.kubernetes.io/name: {{ include "chart.name" . 
}} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1c45c2542..1d1bc844c 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -40,8 +40,8 @@ deployment: serviceAccountName: shim # [METRICS]: Set to true to generate manifests for exporting metrics. -# To disable metrics export set false, and ensure that the -# ControllerManager argument "--metrics-bind-address=:8443" is removed. +# To disable metrics export set false, and remove the container args +# "--metrics-bind-address=:2112" and "--metrics-secure=false". metrics: enable: true From 653cf8591e6ec3ea401a1e9c476fb996e5dafc0a Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:36:10 +0200 Subject: [PATCH 5/6] PR feedback --- cmd/shim/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 970c8c934..9feea8d5f 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -170,7 +170,7 @@ func main() { filepath.Join(metricsCertPath, metricsCertKey), ) if err != nil { - setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + setupLog.Error(err, "Failed to initialize metrics certificate watcher") os.Exit(1) } From 88b2cb49ac0e481320725ed0c923547bf8c36eb9 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:42:55 +0200 Subject: [PATCH 6/6] 3 replicas by default --- helm/library/cortex-shim/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1d1bc844c..63574fbe4 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -1,6 +1,6 @@ deployment: enable: true - replicas: 1 + replicas: 3 container: image: repository: ghcr.io/cobaltcore-dev/cortex-shim