From 6771b30e17bd409c263519a76a58b006692b8090 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:27:28 +0200 Subject: [PATCH 1/6] Scaffold cortex placement api shim --- .github/workflows/push-charts.yaml | 19 - .github/workflows/push-images.yaml | 45 ++ .github/workflows/update-appversion.yml | 21 + .gitignore | 1 + AGENTS.md | 3 +- Dockerfile | 8 +- Tiltfile | 24 +- cmd/{ => manager}/main.go | 0 cmd/shim/main.go | 9 + helm/bundles/cortex-placement-shim/Chart.yaml | 20 + .../alerts/placement-shim.alerts.yaml | 734 ++++++++++++++++++ .../templates/alerts.yaml | 17 + .../templates/clusterrole.yaml | 23 + .../templates/clusterrolebinding.yaml | 14 + .../bundles/cortex-placement-shim/values.yaml | 23 + helm/library/cortex-shim/Chart.lock | 6 + helm/library/cortex-shim/Chart.yaml | 8 + .../cortex-shim/templates/_helpers.tpl | 50 ++ .../cortex-shim/templates/clusterrole.yaml | 100 +++ .../templates/clusterrolebinding.yaml | 34 + .../cortex-shim/templates/deployment.yaml | 112 +++ .../cortex-shim/templates/service.yaml | 33 + .../cortex-shim/templates/serviceaccount.yaml | 15 + .../cortex-shim/templates/servicemonitor.yaml | 16 + helm/library/cortex-shim/values.yaml | 68 ++ .../cortex/templates/manager/manager.yaml | 2 +- internal/shim/placement/.gitkeep | 0 27 files changed, 1379 insertions(+), 26 deletions(-) rename cmd/{ => manager}/main.go (100%) create mode 100644 cmd/shim/main.go create mode 100644 helm/bundles/cortex-placement-shim/Chart.yaml create mode 100644 helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrole.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml create mode 100644 helm/bundles/cortex-placement-shim/values.yaml create mode 100644 helm/library/cortex-shim/Chart.lock create mode 100644 helm/library/cortex-shim/Chart.yaml 
create mode 100644 helm/library/cortex-shim/templates/_helpers.tpl create mode 100644 helm/library/cortex-shim/templates/clusterrole.yaml create mode 100644 helm/library/cortex-shim/templates/clusterrolebinding.yaml create mode 100644 helm/library/cortex-shim/templates/deployment.yaml create mode 100644 helm/library/cortex-shim/templates/service.yaml create mode 100644 helm/library/cortex-shim/templates/serviceaccount.yaml create mode 100644 helm/library/cortex-shim/templates/servicemonitor.yaml create mode 100644 helm/library/cortex-shim/values.yaml create mode 100644 internal/shim/placement/.gitkeep diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml index 2e3577275..a4559d15a 100644 --- a/.github/workflows/push-charts.yaml +++ b/.github/workflows/push-charts.yaml @@ -27,25 +27,6 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Get all changed helm/library/cortex Chart.yaml files - id: changed-chart-yaml-files-core - uses: tj-actions/changed-files@v47 - with: - files: | - helm/library/cortex/Chart.yaml - - name: Push cortex core charts to registry - if: steps.changed-chart-yaml-files-core.outputs.all_changed_files != '' - shell: bash - env: - ALL_CHANGED_FILES: ${{ steps.changed-chart-yaml-files-core.outputs.all_changed_files }} - run: | - for CHART_FILE in ${ALL_CHANGED_FILES}; do - CHART_DIR=$(dirname $CHART_FILE) - helm package $CHART_DIR --dependency-update --destination $CHART_DIR - CHART_PACKAGE=$(ls $CHART_DIR/*.tgz) - helm push $CHART_PACKAGE oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/ - done - - name: Get all changed library Chart.yaml files id: changed-chart-yaml-files-library uses: tj-actions/changed-files@v47 diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 997595976..3085b503b 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -72,6 +72,50 @@ jobs: subject-digest: ${{ 
steps.push_cortex_postgres.outputs.digest }} push-to-registry: true + # Only build and push the cortex-shim image if there are changes related + # to the cortex shims (e.g., in cmd/shim or internal/shim). + - name: Get all changed shim/ files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + cmd/shim/** + internal/shim/** + - name: Docker Meta (Cortex Shim) + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: meta_cortex_shim + uses: docker/metadata-action@v6 + with: + images: ${{ env.REGISTRY }}/${{ github.repository }}-shim + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha + type=raw,value=latest + env: + DOCKER_METADATA_SHORT_SHA_LENGTH: 8 + - name: Build and Push Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: push_cortex_shim + uses: docker/build-push-action@v7 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta_cortex_shim.outputs.tags }} + labels: ${{ steps.meta_cortex_shim.outputs.labels }} + build-args: | + GIT_TAG=${{ github.ref_name }} + GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/shim/main.go + - name: Generate Artifact Attestation for Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + uses: actions/attest-build-provenance@v4 + with: + subject-name: ${{ env.REGISTRY }}/${{ github.repository }}-shim + subject-digest: ${{ steps.push_cortex_shim.outputs.digest }} + push-to-registry: true + # Build & push new cortex image - name: Docker Meta (Cortex) id: meta_cortex @@ -98,6 +142,7 @@ jobs: build-args: | GIT_TAG=${{ github.ref_name }} GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/manager/main.go - name: Generate Artifact Attestation for Cortex uses: actions/attest-build-provenance@v4 with: diff --git a/.github/workflows/update-appversion.yml b/.github/workflows/update-appversion.yml index cc5ccdc9f..20087fa80 100644 --- a/.github/workflows/update-appversion.yml +++ 
b/.github/workflows/update-appversion.yml @@ -44,6 +44,27 @@ jobs: git commit -m "Bump cortex-postgres chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" git push origin HEAD:main + # Only bumped if there are changes in shim-related directories + - name: Get all changed shim files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + internal/shim/** + cmd/shim/** + - name: Update appVersion in cortex-shim Chart.yaml + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-shim/Chart.yaml + - name: Commit and push changes for cortex-shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add helm/library/cortex-shim/Chart.yaml + git commit -m "Bump cortex-shim chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" + git push origin HEAD:main + - name: Update appVersion in helm/library/cortex/Chart.yaml run: | sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex/Chart.yaml diff --git a/.gitignore b/.gitignore index 04bac2d09..7e21248bc 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ cortex.secrets.yaml !.editorconfig !.gitignore !.github +!.gitkeep !.golangci.yaml !.license-scan-overrides.jsonl !.license-scan-rules.json diff --git a/AGENTS.md b/AGENTS.md index 6f2e12a17..59747bd8c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,7 +50,8 @@ Helm charts: ## Repository Structure Code: -- `cmd/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/manager/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/shim/main.go` is the entry point for cortex shims exposing cortex capabilities 
over REST endpoints - `api/v1alpha1` is where the CRD specs of cortex lives - `api/external` contains messages sent to cortex via http from external openstack services - `internal/scheduling` contains the logic for scheduling in different cloud domains diff --git a/Dockerfile b/Dockerfile index 6f7e79bea..2580e9637 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ ARG TARGETARCH ARG GO_MOD_PATH=. ARG GOCACHE=/root/.cache/go-build ENV GOCACHE=${GOCACHE} +ARG GOMAIN=cmd/manager/main.go +ENV GOMAIN=${GOMAIN} # Note: avoid using COPY to /lib which will lead to docker build errors. WORKDIR /workspace/${GO_MOD_PATH} @@ -29,13 +31,13 @@ ENV GOOS=${TARGETOS:-linux} ENV GOARCH=${TARGETARCH} RUN --mount=type=cache,target=/go/pkg/mod/ \ --mount=type=cache,target=${GOCACHE} \ - go build -a -o /manager cmd/main.go + go build -a -o /main ${GOMAIN} # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details FROM gcr.io/distroless/static:nonroot WORKDIR / -COPY --from=builder /manager . +COPY --from=builder /main . USER 65532:65532 -ENTRYPOINT ["/manager"] +ENTRYPOINT ["/main"] diff --git a/Tiltfile b/Tiltfile index 6871d18b3..bc87f4d30 100644 --- a/Tiltfile +++ b/Tiltfile @@ -7,7 +7,10 @@ analytics_settings(False) # Use the ACTIVE_DEPLOYMENTS env var to select which Cortex bundles to deploy. 
-ACTIVE_DEPLOYMENTS_ENV = os.getenv('ACTIVE_DEPLOYMENTS', 'nova,manila,cinder,ironcore,pods') +ACTIVE_DEPLOYMENTS_ENV = os.getenv( + 'ACTIVE_DEPLOYMENTS', + 'nova,manila,cinder,ironcore,pods,placement', +) if ACTIVE_DEPLOYMENTS_ENV == "": ACTIVE_DEPLOYMENTS = [] # Catch "".split(",") = [""] else: @@ -78,13 +81,22 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml' local('curl -L ' + url + ' | kubectl apply -f -') -########### Cortex Operator & CRDs +########### Cortex Manager & CRDs docker_build('ghcr.io/cobaltcore-dev/cortex', '.', dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/manager/main.go'}, only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], ) local('sh helm/sync.sh helm/library/cortex') +########### Cortex Shim +docker_build('ghcr.io/cobaltcore-dev/cortex-shim', '.', + dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/shim/main.go'}, + only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], +) +local('sh helm/sync.sh helm/library/cortex-shim') + ########### Cortex Bundles docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres') @@ -98,6 +110,7 @@ bundle_charts = [ ('helm/bundles/cortex-cinder', 'cortex-cinder'), ('helm/bundles/cortex-ironcore', 'cortex-ironcore'), ('helm/bundles/cortex-pods', 'cortex-pods'), + ('helm/bundles/cortex-placement-shim', 'cortex-placement-shim'), ] dep_charts = { 'cortex-crds': [ @@ -123,6 +136,9 @@ dep_charts = { ('helm/library/cortex-postgres', 'cortex-postgres'), ('helm/library/cortex', 'cortex'), ], + 'cortex-placement-shim': [ + ('helm/library/cortex-shim', 'cortex-shim'), + ], } for (bundle_chart_path, bundle_chart_name) in bundle_charts: @@ -255,6 +271,10 @@ if 'pods' in ACTIVE_DEPLOYMENTS: k8s_yaml('samples/pods/pod.yaml') k8s_resource('test-pod', 
labels=['Cortex-Pods']) +if 'placement' in ACTIVE_DEPLOYMENTS: + print("Activating Cortex Placement Shim bundle") + k8s_yaml(helm('./helm/bundles/cortex-placement-shim', name='cortex-placement-shim', values=tilt_values, set=env_set_overrides)) + ########### Dev Dependencies local('sh helm/sync.sh helm/dev/cortex-prometheus-operator') k8s_yaml(helm('./helm/dev/cortex-prometheus-operator', name='cortex-prometheus-operator')) # Operator diff --git a/cmd/main.go b/cmd/manager/main.go similarity index 100% rename from cmd/main.go rename to cmd/manager/main.go diff --git a/cmd/shim/main.go b/cmd/shim/main.go new file mode 100644 index 000000000..6b0634229 --- /dev/null +++ b/cmd/shim/main.go @@ -0,0 +1,9 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package main + +func main() { + // TODO: this needs scaffolding, for now it just does nothing. + select {} +} diff --git a/helm/bundles/cortex-placement-shim/Chart.yaml b/helm/bundles/cortex-placement-shim/Chart.yaml new file mode 100644 index 000000000..7f53ed347 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: cortex-placement-shim +description: A Helm chart deploying the Cortex placement shim. +type: application +version: 0.0.1 +appVersion: 0.1.0 +dependencies: + # from: file://../../library/cortex-shim + - name: cortex-shim + repository: oci://ghcr.io/cobaltcore-dev/cortex/charts + version: 0.0.1 + # Owner info adds a configmap to the kubernetes cluster with information on + # the service owner. This makes it easier to find out who to contact in case + # of issues. 
See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info + - name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml new file mode 100644 index 000000000..41bf29794 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -0,0 +1,734 @@ +groups: +- name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: critical + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + for: 5m + labels: + context: descheduler + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. 
+ + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much memory" + description: > + `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much CPU" + description: > + `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is trying to connect to the database too often" + description: > + `{{$labels.component}}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` Sync not successful" + description: > + `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" + description: > + `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. 
+ + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
+ + # Committed Resource Info API Alerts + - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API HTTP 500 errors too high" + description: > + The committed resource info API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems building service info, + such as invalid flavor group data. Limes will not be able to discover available + resources until the issue is resolved. + + # Committed Resource Change API Alerts + - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 400 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may happen when Limes sends a request with + an outdated info version (409), the API is temporarily unavailable, + or the request format is invalid. Limes will typically retry these + requests, so no immediate action is needed unless the errors persist. 
+ + - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 500 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This is not expected and indicates that Cortex + is having an internal problem processing commitment changes. Limes will + continue to retry, but new commitments may not be fulfilled until the + issue is resolved. + + - alert: CortexNovaCommittedResourceLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API latency too high" + description: > + The committed resource change API (Limes LIQUID integration) is experiencing + high latency (p95 > 30s). This may indicate that the scheduling pipeline + is under heavy load or that reservation scheduling is taking longer than + expected. Limes requests may time out, causing commitment changes to fail. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high" + description: > + More than 50% of commitment change requests are being rejected. + This may indicate insufficient capacity in the datacenter to fulfill + new commitments, or issues with the commitment scheduling logic. + Rejected commitments are rolled back, so Limes will see them as failed + and may retry or report the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API timeouts too high" + description: > + The committed resource change API (Limes LIQUID integration) timed out + while waiting for reservations to become ready. This indicates that the + scheduling pipeline is overloaded or reservations are taking too long + to be scheduled. Affected commitment changes are rolled back and Limes + will see them as failed. Consider investigating the scheduler performance + or increasing the timeout configuration. 
+ + # Committed Resource Usage API Alerts + - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 400 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate invalid project IDs or malformed + requests from Limes. Limes will typically retry these requests. + + - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 500 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems fetching reservations + or Nova server data. Limes may receive stale or incomplete usage data. + + - alert: CortexNovaCommittedResourceUsageLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API latency too high" + description: > + The committed resource usage API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). 
This may indicate slow Nova API responses or + database queries. Limes scrapes may time out, affecting quota reporting. + + # Committed Resource Capacity API Alerts + - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 400 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate malformed requests from Limes. + + - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 500 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems calculating cluster + capacity. Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API latency too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). This may indicate slow database queries or knowledge + CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
+ + - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh + expr: | + ( + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) + ) > 0.05 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unit mismatch rate >5%" + description: > + More than 5% of commitments are being skipped due to unit mismatches between + Limes and Cortex flavor groups. This happens when Limes has not yet been + updated to use the new unit format after a flavor group change. The affected + commitments will keep their existing reservations until Limes notices the update. + Check the logs if this error persists for longer time. + + - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh + expr: | + ( + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) + ) > 0 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer unknown flavor group rate >0%" + description: > + Some commitments reference flavor groups that don't exist in + Cortex Knowledge (anymore). This may indicate that flavor group configuration is + out of sync between Limes and Cortex, or that Knowledge extraction is failing. 
+ Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0.01 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. 
+ + - alert: CortexNovaCommittedResourceSyncerRepairRateHigh + expr: | + ( + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer repair rate >0%" + description: > + Some commitments have reservations that needed repair + (wrong metadata like project ID or flavor group). This may indicate data + corruption, bugs in reservation creation, or external modifications. + Reservations are automatically repaired, but the root cause should be + investigated if this alert persists. + + - alert: CortexNovaDoesntFindValidKVMHosts + expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 + for: 5m + labels: + context: scheduling + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova scheduling cannot find valid KVM hosts" + description: > + Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling + failed to find a valid `{{$labels.hvtype}}` host. This may indicate + capacity issues, misconfigured filters, or resource constraints in the + datacenter. Investigate the affected VMs and hypervisor availability. 
+ + - alert: CortexNovaNewDatasourcesNotReconciling + expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 + for: 60m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "New datasource `{{$labels.datasource}}` has not reconciled" + description: > + A new datasource `{{$labels.datasource}}` has been added but has not + completed its first reconciliation yet. This may indicate issues with + the datasource controller's workqueue overprioritizing other datasources. + + - alert: CortexNovaExistingDatasourcesLackingBehind + expr: | + sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 + and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 + for: 10m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" + description: > + An existing datasource `{{$labels.datasource}}` has been queued for + reconciliation for more than 10 minutes. This may indicate issues with + the datasource controller's workqueue or that this or another datasource + is taking an unusually long time to reconcile. + + - alert: CortexNovaReconcileErrorsHigh + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-errors + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller reconcile error rate >10%" + description: > + More than 10% of controller reconciles are resulting in errors. 
This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources. + + - alert: CortexNovaReconcileDurationHigher10Min + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 + for: 15m + labels: + context: controller-duration + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" + description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller {{ $labels.name }}'s backlog is not being drained." + description: > + The workqueue for controller {{ $labels.name }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details. 
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} latency is high" + description: > + The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" + description: > + The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
\ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml new file mode 100644 index 000000000..7db3b96e6 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml @@ -0,0 +1,17 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.alerts.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cortex-placement-shim-alerts + labels: + type: alerting-rules + prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} +spec: + {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} + {{- range $path, $file := $files }} + {{ $file | toString | nindent 2 }} + {{- end }} +{{- end }} diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml new file mode 100644 index 000000000..489878c89 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: cortex-placement-shim-role-hypervisor +rules: +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors/status + verbs: + - get \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..0388373f9 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: cortex-placement-shim-rolebinding-hypervisor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cortex-placement-shim-role-hypervisor +subjects: +- kind: ServiceAccount + name: cortex-placement-shim + namespace: {{ .Release.Namespace }} \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml new file mode 100644 index 000000000..40aa9cb11 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -0,0 +1,23 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +owner-info: + enabled: true + helm-chart-url: "https://github.com/cobaltcore-dev/cortex/helm/bundles/cortex-placement-shim" + maintainers: + - "arno.uhlig@sap.com" + - "julius.clausnitzer@sap.com" + - "malte.viering@sap.com" + - "marcel.gute@sap.com" + - "markus.wieland@sap.com" + - "p.matthes@sap.com" + support-group: "workload-management" + service: "cortex-placement-shim" + +alerts: + enabled: true + prometheus: openstack + +cortex-shim: + namePrefix: cortex-placement + conf: {} # TODO diff --git a/helm/library/cortex-shim/Chart.lock b/helm/library/cortex-shim/Chart.lock new file mode 100644 index 000000000..db4c5823b --- /dev/null +++ b/helm/library/cortex-shim/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 +digest: sha256:7643f231cc4ebda347fd12ec62fe4445c280e2b71d27eec555f3025290f5038f +generated: "2025-08-26T10:55:05.888651+02:00" diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml new file mode 100644 index 000000000..5282dc655 --- /dev/null +++ b/helm/library/cortex-shim/Chart.yaml @@ -0,0 +1,8 @@ +apiVersion: v2 +name: cortex-shim +description: A Helm chart to distribute cortex shims. 
+type: application
+version: 0.0.1
+appVersion: "sha-3e56acea"
+icon: "https://example.com/icon.png"
+dependencies: []
diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl
new file mode 100644
index 000000000..782e14eef
--- /dev/null
+++ b/helm/library/cortex-shim/templates/_helpers.tpl
@@ -0,0 +1,50 @@
+{{- define "chart.name" -}}
+{{- if .Chart }}
+  {{- if .Chart.Name }}
+    {{- .Chart.Name | trunc 63 | trimSuffix "-" }}
+  {{- else if .Values.nameOverride }}
+    {{ .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+  {{- else }}
+    scheduling
+  {{- end }}
+{{- else }}
+  scheduling
+{{- end }}
+{{- end }}
+
+
+{{- define "chart.labels" -}}
+{{- if .Chart.AppVersion -}}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- if .Chart.Version }}
+helm.sh/chart: {{ .Chart.Version | quote }}
+{{- end }}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+
+{{- define "chart.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+
+{{- define "chart.hasMutatingWebhooks" -}}
+{{- $hasMutating := false }}
+{{- range . }}
+  {{- if eq .type "mutating" }}
+    {{- $hasMutating = true }}{{- end }}
+{{- end }}
+{{- $hasMutating -}}{{- end }}
+
+
+{{- define "chart.hasValidatingWebhooks" -}}
+{{- $hasValidating := false }}
+{{- range . }}
+  {{- if eq .type "validating" }}
+    {{- $hasValidating = true }}{{- end }}
+{{- end }}
+{{- $hasValidating -}}{{- end }}
diff --git a/helm/library/cortex-shim/templates/clusterrole.yaml b/helm/library/cortex-shim/templates/clusterrole.yaml
new file mode 100644
index 000000000..74f8e7ad4
--- /dev/null
+++ b/helm/library/cortex-shim/templates/clusterrole.yaml
@@ -0,0 +1,100 @@
+# Roles that grant the shims access to cortex crds.
+{{- if .Values.rbac.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-role +rules: +- apiGroups: + - cortex.cloud + resources: + - knowledges + - datasources + - reservations + - decisions + - deschedulings + - pipelines + - kpis + - histories + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cortex.cloud + resources: + - knowledges/finalizers + - datasources/finalizers + - reservations/finalizers + - decisions/finalizers + - deschedulings/finalizers + - pipelines/finalizers + - kpis/finalizers + - histories/finalizers + verbs: + - update +- apiGroups: + - cortex.cloud + resources: + - knowledges/status + - datasources/status + - reservations/status + - decisions/status + - deschedulings/status + - pipelines/status + - kpis/status + - histories/status + verbs: + - get + - patch + - update +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - create + - patch +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-reader +rules: +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/clusterrolebinding.yaml b/helm/library/cortex-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..ca82a0119 --- /dev/null +++ b/helm/library/cortex-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,34 @@ +{{- if .Values.rbac.enable }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-shim-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-metrics-auth-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/deployment.yaml b/helm/library/cortex-shim/templates/deployment.yaml new file mode 100644 index 000000000..b38eb3c02 --- /dev/null +++ b/helm/library/cortex-shim/templates/deployment.yaml @@ -0,0 +1,112 @@ +# This file is safe from kubebuilder edit --plugins=helm/v1-alpha +# If you want to re-generate, add the --force flag. 
+ +{{- if .Values.deployment.enable }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Values.namePrefix }}-shim + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: shim + labels: + {{- include "chart.labels" . | nindent 8 }} + {{- if and .Values.deployment.pod .Values.deployment.pod.labels }} + {{- range $key, $value := .Values.deployment.pod.labels }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + spec: + containers: + - name: shim + args: + {{- range .Values.deployment.container.args }} + - {{ . }} + {{- end }} + ports: + - name: api + containerPort: 8080 + protocol: TCP + - name: metrics + containerPort: 2112 + protocol: TCP + command: + - /main + image: {{ .Values.deployment.container.image.repository }}:{{ .Values.deployment.container.image.tag | default .Chart.AppVersion }} + {{- if .Values.deployment.container.image.pullPolicy }} + imagePullPolicy: {{ .Values.deployment.container.image.pullPolicy }} + {{- end }} + {{- if .Values.deployment.container.env }} + env: + {{- range $key, $value := .Values.deployment.container.env }} + - name: {{ $key }} + value: {{ $value }} + {{- end }} + {{- end }} + livenessProbe: + {{- toYaml .Values.deployment.container.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.deployment.container.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.deployment.container.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.deployment.container.securityContext | nindent 12 }} + volumeMounts: + - name: shim-config-volume + mountPath: /etc/config + - name: shim-secrets-volume + mountPath: /etc/secrets + readOnly: true + securityContext: + {{- toYaml .Values.deployment.securityContext | nindent 8 }} + serviceAccountName: 
{{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + terminationGracePeriodSeconds: {{ .Values.deployment.terminationGracePeriodSeconds }} + volumes: + # Custom values to configure the shim. + - name: shim-config-volume + configMap: + name: {{ .Values.namePrefix }}-shim-config + - name: shim-secrets-volume + secret: + secretName: {{ .Values.namePrefix }}-shim-secrets +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.namePrefix }}-shim-config +data: + conf.json: |- + {{- $mergedConf := dict }} + {{- if .Values.global.conf }} + {{- $mergedConf = .Values.global.conf }} + {{- end }} + {{- if .Values.conf }} + {{- $mergedConf = mergeOverwrite .Values.conf $mergedConf }} + {{- end }} + {{ toJson $mergedConf }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.namePrefix }}-shim-secrets +type: Opaque +data: + secrets.json: |- + {{- $mergedSecrets := dict }} + {{- if .Values.global.secrets }} + {{- $mergedSecrets = .Values.global.secrets }} + {{- end }} + {{- if .Values.secrets }} + {{- $mergedSecrets = mergeOverwrite .Values.secrets $mergedSecrets }} + {{- end }} + {{ toJson $mergedSecrets | b64enc }} +{{- end }} \ No newline at end of file diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml new file mode 100644 index 000000000..549ceed95 --- /dev/null +++ b/helm/library/cortex-shim/templates/service.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + ports: + - port: 8080 + targetPort: api + protocol: TCP + name: api + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- if .Values.metrics.enable }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-metrics-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . 
| nindent 4 }} +spec: + ports: + - port: 2112 + targetPort: metrics + protocol: TCP + name: metrics + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- end }} diff --git a/helm/library/cortex-shim/templates/serviceaccount.yaml b/helm/library/cortex-shim/templates/serviceaccount.yaml new file mode 100644 index 000000000..ea0789dd0 --- /dev/null +++ b/helm/library/cortex-shim/templates/serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if .Values.rbac.enable }} +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + {{- if and .Values.deployment.serviceAccount .Values.deployment.serviceAccount.annotations }} + annotations: + {{- range $key, $value := .Values.deployment.serviceAccount.annotations }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} diff --git a/helm/library/cortex-shim/templates/servicemonitor.yaml b/helm/library/cortex-shim/templates/servicemonitor.yaml new file mode 100644 index 000000000..803e66dd5 --- /dev/null +++ b/helm/library/cortex-shim/templates/servicemonitor.yaml @@ -0,0 +1,16 @@ +# To integrate with Prometheus. +{{- if .Values.prometheus.enable }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-metrics-monitor + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - port: metrics + selector: + matchLabels: + app.kubernetes.io/name: {{ include "chart.name" . 
}} +{{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml new file mode 100644 index 000000000..6434e823a --- /dev/null +++ b/helm/library/cortex-shim/values.yaml @@ -0,0 +1,68 @@ +deployment: + enable: true + replicas: 1 + container: + image: + repository: ghcr.io/cobaltcore-dev/cortex-shim + args: + - "--metrics-bind-address=:2112" + - "--health-probe-bind-address=:8081" + - "--metrics-secure=false" + resources: + limits: + cpu: 500m + memory: 2048Mi + requests: + cpu: 10m + memory: 64Mi + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 20 + httpGet: + path: /healthz + port: 8081 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + httpGet: + path: /readyz + port: 8081 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationGracePeriodSeconds: 10 + serviceAccountName: shim + +# [METRICS]: Set to true to generate manifests for exporting metrics. +# To disable metrics export set false, and ensure that the +# ControllerManager argument "--metrics-bind-address=:8443" is removed. +metrics: + enable: true + +# [RBAC]: To enable RBAC (Permissions) configurations +rbac: + enable: true + +# [PROMETHEUS]: To enable a ServiceMonitor to export metrics to Prometheus set true +prometheus: + enable: true + +global: + conf: {} + +# Use this to unambiguate multiple cortex deployments in the same cluster. +namePrefix: cortex +conf: + # The scheduling domain this operator is responsible for. + schedulingDomain: cortex + # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) + leaderElectionID: cortex-unknown + enabledControllers: + # The explanation controller is available for all decision resources. 
+ - explanation-controller diff --git a/helm/library/cortex/templates/manager/manager.yaml b/helm/library/cortex/templates/manager/manager.yaml index 73672164f..0c9f362aa 100644 --- a/helm/library/cortex/templates/manager/manager.yaml +++ b/helm/library/cortex/templates/manager/manager.yaml @@ -51,7 +51,7 @@ spec: protocol: TCP {{- end }} command: - - /manager + - /main image: {{ .Values.controllerManager.container.image.repository }}:{{ .Values.controllerManager.container.image.tag | default .Chart.AppVersion }} {{- if .Values.controllerManager.container.image.pullPolicy }} imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }} diff --git a/internal/shim/placement/.gitkeep b/internal/shim/placement/.gitkeep new file mode 100644 index 000000000..e69de29bb From b55ca3c98c694f8fc6d291a0e5b72ef3e99e987f Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:56:31 +0200 Subject: [PATCH 2/6] Add monitoring labels and scaffold manager (w/o leader election) --- cmd/shim/main.go | 248 +++++++++++++++++- .../bundles/cortex-placement-shim/values.yaml | 6 +- helm/library/cortex-shim/values.yaml | 9 +- 3 files changed, 252 insertions(+), 11 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 6b0634229..d59490c3c 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -3,7 +3,251 @@ package main +import ( + "context" + "crypto/tls" + "errors" + "flag" + "net/http" + "os" + "path/filepath" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/monitoring" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/sapcc/go-bits/httpext" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/certwatcher" + "sigs.k8s.io/controller-runtime/pkg/healthz" + 
"sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +var ( + // Scheme defines the scheme for the API types used by the shim. + scheme = runtime.NewScheme() + // setupLog is the logger used for setup operations in the shim. + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + // Bind the Kubernetes client-go scheme and the custom API types to the + // scheme used by the shim. + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha1.AddToScheme(scheme)) // Cortex crds + utilruntime.Must(hv1.AddToScheme(scheme)) // Hypervisor crd +} + func main() { - // TODO: this needs scaffolding, for now it just does nothing. - select {} + ctx := context.Background() + restConfig := ctrl.GetConfigOrDie() + + var metricsAddr string + var metricsCertPath, metricsCertName, metricsCertKey string + var webhookCertPath, webhookCertName, webhookCertKey string + // The shim does not require leader election, but this flag is provided to + // stay consistent with the kubebuilder scaffold. + var enableLeaderElection bool + var probeAddr string + var secureMetrics bool + var enableHTTP2 bool + var tlsOpts []func(*tls.Config) + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&enableLeaderElection, "leader-elect", false, + "Enable leader election for controller manager. "+ + "Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. 
Use --metrics-secure=false to use HTTP instead.") + flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") + flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") + flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") + flag.StringVar(&metricsCertPath, "metrics-cert-path", "", + "The directory that contains the metrics server certificate.") + flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") + flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") + flag.BoolVar(&enableHTTP2, "enable-http2", false, + "If set, HTTP/2 will be enabled for the metrics and webhook servers") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + // Check that we're really running this shim without leader election enabled. + if enableLeaderElection { + err := errors.New("leader election should not be enabled for the shim") + setupLog.Error(err, "invalid configuration") + os.Exit(1) + } + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // if the enable-http2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. 
For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + if !enableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + // Create watchers for metrics and webhooks certificates + var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher + + // Initial webhook TLS options + webhookTLSOpts := tlsOpts + + if webhookCertPath != "" { + setupLog.Info("Initializing webhook certificate watcher using provided certificates", + "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) + + var err error + webhookCertWatcher, err = certwatcher.New( + filepath.Join(webhookCertPath, webhookCertName), + filepath.Join(webhookCertPath, webhookCertKey), + ) + if err != nil { + setupLog.Error(err, "Failed to initialize webhook certificate watcher") + os.Exit(1) + } + + webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { + config.GetCertificate = webhookCertWatcher.GetCertificate + }) + } + + webhookServer := webhook.NewServer(webhook.Options{ + TLSOpts: webhookTLSOpts, + }) + + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. 
More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization + } + + // If the certificate is not specified, controller-runtime will automatically + // generate self-signed certificates for the metrics server. While convenient for development and testing, + // this setup is not recommended for production. + // + // If you enable certManager, uncomment the following lines: + // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates + // managed by cert-manager for the metrics server. + // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. + if metricsCertPath != "" { + setupLog.Info("Initializing metrics certificate watcher using provided certificates", + "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) + + var err error + metricsCertWatcher, err = certwatcher.New( + filepath.Join(metricsCertPath, metricsCertName), + filepath.Join(metricsCertPath, metricsCertKey), + ) + if err != nil { + setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + os.Exit(1) + } + + metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { + config.GetCertificate = metricsCertWatcher.GetCertificate + }) + } + + mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ + Scheme: scheme, + Metrics: metricsServerOptions, + WebhookServer: webhookServer, + HealthProbeBindAddress: probeAddr, + // Kept for consistency with kubebuilder scaffold, but the shim should + // always run with leader election disabled. + LeaderElection: enableLeaderElection, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + // TODO: Initialize multicluster client here. 
+ + // Our custom monitoring registry can add prometheus labels to all metrics. + // This is useful to distinguish metrics from different deployments. + metricsConfig := conf.GetConfigOrDie[monitoring.Config]() + metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig) + + // API endpoint. + mux := http.NewServeMux() + + // +kubebuilder:scaffold:builder + + if metricsCertWatcher != nil { + setupLog.Info("Adding metrics certificate watcher to manager") + if err := mgr.Add(metricsCertWatcher); err != nil { + setupLog.Error(err, "unable to add metrics certificate watcher to manager") + os.Exit(1) + } + } + + if webhookCertWatcher != nil { + setupLog.Info("Adding webhook certificate watcher to manager") + if err := mgr.Add(webhookCertWatcher); err != nil { + setupLog.Error(err, "unable to add webhook certificate watcher to manager") + os.Exit(1) + } + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) + } + + errchan := make(chan error) + go func() { + errchan <- func() error { + setupLog.Info("starting api server", "address", ":8080") + return httpext.ListenAndServeContext(ctx, ":8080", mux) + }() + }() + go func() { + if err := <-errchan; err != nil { + setupLog.Error(err, "problem running api server") + os.Exit(1) + } + }() + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } } diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml index 40aa9cb11..6dd793653 100644 --- a/helm/bundles/cortex-placement-shim/values.yaml +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -20,4 +20,8 @@ alerts: cortex-shim: namePrefix: cortex-placement - conf: {} # TODO + 
conf: + monitoring: + labels: + github_org: cobaltcore-dev + github_repo: cortex diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 6434e823a..1c45c2542 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -58,11 +58,4 @@ global: # Use this to unambiguate multiple cortex deployments in the same cluster. namePrefix: cortex -conf: - # The scheduling domain this operator is responsible for. - schedulingDomain: cortex - # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) - leaderElectionID: cortex-unknown - enabledControllers: - # The explanation controller is available for all decision resources. - - explanation-controller +conf: {} # No config for now that's needed by all the shims. From 5bd2e0491899ea8f3e44ada30ba2b3878ba0b93d Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 12:01:19 +0200 Subject: [PATCH 3/6] Remove alerts --- .../alerts/placement-shim.alerts.yaml | 735 +----------------- 1 file changed, 2 insertions(+), 733 deletions(-) diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml index 41bf29794..03aea7763 100644 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -1,734 +1,3 @@ groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. 
Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. - - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. 
This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. 
- - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. 
- - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - # Committed Resource Info API Alerts - - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource info API HTTP 500 errors too high" - description: > - The committed resource info API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems building service info, - such as invalid flavor group data. Limes will not be able to discover available - resources until the issue is resolved. - - # Committed Resource Change API Alerts - - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 400 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 4xx errors. 
This may happen when Limes sends a request with - an outdated info version (409), the API is temporarily unavailable, - or the request format is invalid. Limes will typically retry these - requests, so no immediate action is needed unless the errors persist. - - - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 500 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This is not expected and indicates that Cortex - is having an internal problem processing commitment changes. Limes will - continue to retry, but new commitments may not be fulfilled until the - issue is resolved. - - - alert: CortexNovaCommittedResourceLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API latency too high" - description: > - The committed resource change API (Limes LIQUID integration) is experiencing - high latency (p95 > 30s). This may indicate that the scheduling pipeline - is under heavy load or that reservation scheduling is taking longer than - expected. Limes requests may time out, causing commitment changes to fail. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource rejection rate too high" - description: > - More than 50% of commitment change requests are being rejected. - This may indicate insufficient capacity in the datacenter to fulfill - new commitments, or issues with the commitment scheduling logic. - Rejected commitments are rolled back, so Limes will see them as failed - and may retry or report the failure to users. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API timeouts too high" - description: > - The committed resource change API (Limes LIQUID integration) timed out - while waiting for reservations to become ready. This indicates that the - scheduling pipeline is overloaded or reservations are taking too long - to be scheduled. Affected commitment changes are rolled back and Limes - will see them as failed. Consider investigating the scheduler performance - or increasing the timeout configuration. 
- - # Committed Resource Usage API Alerts - - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 400 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate invalid project IDs or malformed - requests from Limes. Limes will typically retry these requests. - - - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 500 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems fetching reservations - or Nova server data. Limes may receive stale or incomplete usage data. - - - alert: CortexNovaCommittedResourceUsageLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API latency too high" - description: > - The committed resource usage API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). 
This may indicate slow Nova API responses or - database queries. Limes scrapes may time out, affecting quota reporting. - - # Committed Resource Capacity API Alerts - - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 400 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate malformed requests from Limes. - - - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 500 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems calculating cluster - capacity. Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API latency too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). This may indicate slow database queries or knowledge - CRD retrieval. Limes scrapes may time out, affecting capacity reporting. - - # Committed Resource Syncer Alerts - - alert: CortexNovaCommittedResourceSyncerErrorsHigh - expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 - for: 5m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer experiencing errors" - description: > - The committed resource syncer has encountered multiple errors in the last hour. - This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
- - - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0.05 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unit mismatch rate >5%" - description: > - More than 5% of commitments are being skipped due to unit mismatches between - Limes and Cortex flavor groups. This happens when Limes has not yet been - updated to use the new unit format after a flavor group change. The affected - commitments will keep their existing reservations until Limes notices the update. - Check the logs if this error persists for longer time. - - - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unknown flavor group rate >0%" - description: > - Some commitments reference flavor groups that don't exist in - Cortex Knowledge (anymore). This may indicate that flavor group configuration is - out of sync between Limes and Cortex, or that Knowledge extraction is failing. 
- Check the flavor group Knowledge CRD and history to see what was changed. - - - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh - expr: | - ( - ( - rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0.01 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer local change rate >1%" - description: > - More than 1% of synced commitments are requiring reservation changes - (creates, deletes, or repairs). This is higher than expected for steady-state - operation and may indicate data inconsistencies, external modifications to - reservations, or issues with the CRDs. Check Cortex logs for details. 
- - - alert: CortexNovaCommittedResourceSyncerRepairRateHigh - expr: | - ( - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer repair rate >0%" - description: > - Some commitments have reservations that needed repair - (wrong metadata like project ID or flavor group). This may indicate data - corruption, bugs in reservation creation, or external modifications. - Reservations are automatically repaired, but the root cause should be - investigated if this alert persists. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. - - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. 
This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. 
- - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
\ No newline at end of file +- name: cortex-placement-shim-alerts + rules: [] \ No newline at end of file From efcfef53f4e567024a0c98cd37c9cb29edc6204e Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:20:52 +0200 Subject: [PATCH 4/6] PR feedback --- .github/workflows/push-images.yaml | 7 ++++++- cmd/shim/main.go | 9 ++++----- helm/library/cortex-shim/templates/_helpers.tpl | 12 ++++++++---- helm/library/cortex-shim/templates/service.yaml | 4 ++-- helm/library/cortex-shim/values.yaml | 4 ++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 3085b503b..f3be685ce 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -81,6 +81,11 @@ jobs: files: | cmd/shim/** internal/shim/** + api/** + pkg/** + go.mod + go.sum + Dockerfile - name: Docker Meta (Cortex Shim) if: steps.changed_shim_files.outputs.all_changed_files != '' id: meta_cortex_shim @@ -99,7 +104,7 @@ jobs: id: push_cortex_shim uses: docker/build-push-action@v7 with: - context: cmd/shim + context: . platforms: linux/amd64,linux/arm64 push: true tags: ${{ steps.meta_cortex_shim.outputs.tags }} diff --git a/cmd/shim/main.go b/cmd/shim/main.go index d59490c3c..970c8c934 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -4,7 +4,6 @@ package main import ( - "context" "crypto/tls" "errors" "flag" @@ -46,7 +45,7 @@ func init() { } func main() { - ctx := context.Background() + ctx := ctrl.SetupSignalHandler() restConfig := ctrl.GetConfigOrDie() var metricsAddr string @@ -110,7 +109,7 @@ func main() { var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher // Initial webhook TLS options - webhookTLSOpts := tlsOpts + webhookTLSOpts := append([]func(*tls.Config){}, tlsOpts...) 
if webhookCertPath != "" { setupLog.Info("Initializing webhook certificate watcher using provided certificates", @@ -142,7 +141,7 @@ func main() { metricsServerOptions := metricsserver.Options{ BindAddress: metricsAddr, SecureServing: secureMetrics, - TLSOpts: tlsOpts, + TLSOpts: append([]func(*tls.Config){}, tlsOpts...), } if secureMetrics { @@ -246,7 +245,7 @@ func main() { }() setupLog.Info("starting manager") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) } diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl index 782e14eef..cca33d701 100644 --- a/helm/library/cortex-shim/templates/_helpers.tpl +++ b/helm/library/cortex-shim/templates/_helpers.tpl @@ -36,15 +36,19 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- $hasMutating := false }} {{- range . }} {{- if eq .type "mutating" }} - $hasMutating = true }}{{- end }} + {{- $hasMutating = true -}} + {{- end }} +{{- end }} +{{ $hasMutating }} {{- end }} -{{ $hasMutating }}}}{{- end }} {{- define "chart.hasValidatingWebhooks" -}} {{- $hasValidating := false }} {{- range . }} {{- if eq .type "validating" }} - $hasValidating = true }}{{- end }} + {{- $hasValidating = true -}} + {{- end }} +{{- end }} +{{ $hasValidating }} {{- end }} -{{ $hasValidating }}}}{{- end }} diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml index 549ceed95..faf3082a3 100644 --- a/helm/library/cortex-shim/templates/service.yaml +++ b/helm/library/cortex-shim/templates/service.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP name: api selector: - app.kubernetes.io/name: {{ include "chart.name" . }} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- if .Values.metrics.enable }} --- apiVersion: v1 @@ -29,5 +29,5 @@ spec: protocol: TCP name: metrics selector: - app.kubernetes.io/name: {{ include "chart.name" . 
}} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1c45c2542..1d1bc844c 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -40,8 +40,8 @@ deployment: serviceAccountName: shim # [METRICS]: Set to true to generate manifests for exporting metrics. -# To disable metrics export set false, and ensure that the -# ControllerManager argument "--metrics-bind-address=:8443" is removed. +# To disable metrics export set false, and remove the container args +# "--metrics-bind-address=:2112" and "--metrics-secure=false". metrics: enable: true From 653cf8591e6ec3ea401a1e9c476fb996e5dafc0a Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:36:10 +0200 Subject: [PATCH 5/6] PR feedback --- cmd/shim/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 970c8c934..9feea8d5f 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -170,7 +170,7 @@ func main() { filepath.Join(metricsCertPath, metricsCertKey), ) if err != nil { - setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + setupLog.Error(err, "Failed to initialize metrics certificate watcher") os.Exit(1) } From 88b2cb49ac0e481320725ed0c923547bf8c36eb9 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:42:55 +0200 Subject: [PATCH 6/6] 3 replicas by default --- helm/library/cortex-shim/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1d1bc844c..63574fbe4 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -1,6 +1,6 @@ deployment: enable: true - replicas: 1 + replicas: 3 container: image: repository: ghcr.io/cobaltcore-dev/cortex-shim