From 8b9272e8fc1e1aa0798c99b28497d9d2a94c7321 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:27:28 +0200 Subject: [PATCH 01/17] Scaffold cortex placement api shim --- .github/workflows/push-charts.yaml | 19 - .github/workflows/push-images.yaml | 45 ++ .github/workflows/update-appversion.yml | 21 + .gitignore | 1 + AGENTS.md | 3 +- Dockerfile | 8 +- Tiltfile | 24 +- cmd/{ => manager}/main.go | 0 cmd/shim/main.go | 9 + helm/bundles/cortex-placement-shim/Chart.yaml | 20 + .../alerts/placement-shim.alerts.yaml | 734 ++++++++++++++++++ .../templates/alerts.yaml | 17 + .../templates/clusterrole.yaml | 23 + .../templates/clusterrolebinding.yaml | 14 + .../bundles/cortex-placement-shim/values.yaml | 23 + helm/library/cortex-shim/Chart.lock | 6 + helm/library/cortex-shim/Chart.yaml | 8 + .../cortex-shim/templates/_helpers.tpl | 50 ++ .../cortex-shim/templates/clusterrole.yaml | 100 +++ .../templates/clusterrolebinding.yaml | 34 + .../cortex-shim/templates/deployment.yaml | 112 +++ .../cortex-shim/templates/service.yaml | 33 + .../cortex-shim/templates/serviceaccount.yaml | 15 + .../cortex-shim/templates/servicemonitor.yaml | 16 + helm/library/cortex-shim/values.yaml | 68 ++ .../cortex/templates/manager/manager.yaml | 2 +- internal/shim/placement/.gitkeep | 0 27 files changed, 1379 insertions(+), 26 deletions(-) rename cmd/{ => manager}/main.go (100%) create mode 100644 cmd/shim/main.go create mode 100644 helm/bundles/cortex-placement-shim/Chart.yaml create mode 100644 helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/alerts.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrole.yaml create mode 100644 helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml create mode 100644 helm/bundles/cortex-placement-shim/values.yaml create mode 100644 helm/library/cortex-shim/Chart.lock create mode 100644 helm/library/cortex-shim/Chart.yaml 
create mode 100644 helm/library/cortex-shim/templates/_helpers.tpl create mode 100644 helm/library/cortex-shim/templates/clusterrole.yaml create mode 100644 helm/library/cortex-shim/templates/clusterrolebinding.yaml create mode 100644 helm/library/cortex-shim/templates/deployment.yaml create mode 100644 helm/library/cortex-shim/templates/service.yaml create mode 100644 helm/library/cortex-shim/templates/serviceaccount.yaml create mode 100644 helm/library/cortex-shim/templates/servicemonitor.yaml create mode 100644 helm/library/cortex-shim/values.yaml create mode 100644 internal/shim/placement/.gitkeep diff --git a/.github/workflows/push-charts.yaml b/.github/workflows/push-charts.yaml index 2e3577275..a4559d15a 100644 --- a/.github/workflows/push-charts.yaml +++ b/.github/workflows/push-charts.yaml @@ -27,25 +27,6 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Get all changed helm/library/cortex Chart.yaml files - id: changed-chart-yaml-files-core - uses: tj-actions/changed-files@v47 - with: - files: | - helm/library/cortex/Chart.yaml - - name: Push cortex core charts to registry - if: steps.changed-chart-yaml-files-core.outputs.all_changed_files != '' - shell: bash - env: - ALL_CHANGED_FILES: ${{ steps.changed-chart-yaml-files-core.outputs.all_changed_files }} - run: | - for CHART_FILE in ${ALL_CHANGED_FILES}; do - CHART_DIR=$(dirname $CHART_FILE) - helm package $CHART_DIR --dependency-update --destination $CHART_DIR - CHART_PACKAGE=$(ls $CHART_DIR/*.tgz) - helm push $CHART_PACKAGE oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/ - done - - name: Get all changed library Chart.yaml files id: changed-chart-yaml-files-library uses: tj-actions/changed-files@v47 diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 997595976..3085b503b 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -72,6 +72,50 @@ jobs: subject-digest: ${{ 
steps.push_cortex_postgres.outputs.digest }} push-to-registry: true + # Only build and push the cortex-shim image if there are changes related + # to the cortex shims (e.g., in cmd/shim or internal/shim). + - name: Get all changed shim/ files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + cmd/shim/** + internal/shim/** + - name: Docker Meta (Cortex Shim) + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: meta_cortex_shim + uses: docker/metadata-action@v6 + with: + images: ${{ env.REGISTRY }}/${{ github.repository }}-shim + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha + latest + env: + DOCKER_METADATA_SHORT_SHA_LENGTH: 8 + - name: Build and Push Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + id: push_cortex_shim + uses: docker/build-push-action@v7 + with: + context: . + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta_cortex_shim.outputs.tags }} + labels: ${{ steps.meta_cortex_shim.outputs.labels }} + build-args: | + GIT_TAG=${{ github.ref_name }} + GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/shim/main.go + - name: Generate Artifact Attestation for Cortex Shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + uses: actions/attest-build-provenance@v4 + with: + subject-name: ${{ env.REGISTRY }}/${{ github.repository }}-shim + subject-digest: ${{ steps.push_cortex_shim.outputs.digest }} + push-to-registry: true + # Build & push new cortex image - name: Docker Meta (Cortex) id: meta_cortex @@ -98,6 +142,7 @@ jobs: build-args: | GIT_TAG=${{ github.ref_name }} GIT_COMMIT=${{ github.sha }} + GOMAIN=cmd/manager/main.go - name: Generate Artifact Attestation for Cortex uses: actions/attest-build-provenance@v4 with: diff --git a/.github/workflows/update-appversion.yml b/.github/workflows/update-appversion.yml index cc5ccdc9f..20087fa80 100644 ---
b/.github/workflows/update-appversion.yml @@ -44,6 +44,27 @@ jobs: git commit -m "Bump cortex-postgres chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" git push origin HEAD:main + # Only bumped if there are changes in shim-related directories + - name: Get all changed shim files + id: changed_shim_files + uses: tj-actions/changed-files@v47 + with: + files: | + internal/shim/** + cmd/shim/** + - name: Update appVersion in cortex-shim Chart.yaml + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex-shim/Chart.yaml + - name: Commit and push changes for cortex-shim + if: steps.changed_shim_files.outputs.all_changed_files != '' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add helm/library/cortex-shim/Chart.yaml + git commit -m "Bump cortex-shim chart appVersions to ${{ steps.vars.outputs.sha }} [skip ci]" || echo "No changes to commit" + git push origin HEAD:main + - name: Update appVersion in helm/library/cortex/Chart.yaml run: | sed -i 's/^\([ ]*appVersion:[ ]*\).*/\1"${{ steps.vars.outputs.sha }}"/' helm/library/cortex/Chart.yaml diff --git a/.gitignore b/.gitignore index 04bac2d09..7e21248bc 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ cortex.secrets.yaml !.editorconfig !.gitignore !.github +!.gitkeep !.golangci.yaml !.license-scan-overrides.jsonl !.license-scan-rules.json diff --git a/AGENTS.md b/AGENTS.md index 6f2e12a17..59747bd8c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -50,7 +50,8 @@ Helm charts: ## Repository Structure Code: -- `cmd/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/manager/main.go` is the entry point for the manager, which starts the controllers and webhooks +- `cmd/shim/main.go` is the entry point for cortex shims exposing cortex capabilities 
over REST endpoints - `api/v1alpha1` is where the CRD specs of cortex lives - `api/external` contains messages sent to cortex via http from external openstack services - `internal/scheduling` contains the logic for scheduling in different cloud domains diff --git a/Dockerfile b/Dockerfile index 6f7e79bea..2580e9637 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ ARG TARGETARCH ARG GO_MOD_PATH=. ARG GOCACHE=/root/.cache/go-build ENV GOCACHE=${GOCACHE} +ARG GOMAIN=cmd/manager/main.go +ENV GOMAIN=${GOMAIN} # Note: avoid using COPY to /lib which will lead to docker build errors. WORKDIR /workspace/${GO_MOD_PATH} @@ -29,13 +31,13 @@ ENV GOOS=${TARGETOS:-linux} ENV GOARCH=${TARGETARCH} RUN --mount=type=cache,target=/go/pkg/mod/ \ --mount=type=cache,target=${GOCACHE} \ - go build -a -o /manager cmd/main.go + go build -a -o /main ${GOMAIN} # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details FROM gcr.io/distroless/static:nonroot WORKDIR / -COPY --from=builder /manager . +COPY --from=builder /main . USER 65532:65532 -ENTRYPOINT ["/manager"] +ENTRYPOINT ["/main"] diff --git a/Tiltfile b/Tiltfile index 6871d18b3..bc87f4d30 100644 --- a/Tiltfile +++ b/Tiltfile @@ -7,7 +7,10 @@ analytics_settings(False) # Use the ACTIVE_DEPLOYMENTS env var to select which Cortex bundles to deploy. 
-ACTIVE_DEPLOYMENTS_ENV = os.getenv('ACTIVE_DEPLOYMENTS', 'nova,manila,cinder,ironcore,pods') +ACTIVE_DEPLOYMENTS_ENV = os.getenv( + 'ACTIVE_DEPLOYMENTS', + 'nova,manila,cinder,ironcore,pods,placement', +) if ACTIVE_DEPLOYMENTS_ENV == "": ACTIVE_DEPLOYMENTS = [] # Catch "".split(",") = [""] else: @@ -78,13 +81,22 @@ local('kubectl wait --namespace cert-manager --for=condition=available deploymen url = 'https://raw.githubusercontent.com/cobaltcore-dev/openstack-hypervisor-operator/refs/heads/main/charts/openstack-hypervisor-operator/crds/kvm.cloud.sap_hypervisors.yaml' local('curl -L ' + url + ' | kubectl apply -f -') -########### Cortex Operator & CRDs +########### Cortex Manager & CRDs docker_build('ghcr.io/cobaltcore-dev/cortex', '.', dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/manager/main.go'}, only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], ) local('sh helm/sync.sh helm/library/cortex') +########### Cortex Shim +docker_build('ghcr.io/cobaltcore-dev/cortex-shim', '.', + dockerfile='Dockerfile', + build_args={'GOMAIN': 'cmd/shim/main.go'}, + only=['internal/', 'cmd/', 'api/', 'pkg', 'go.mod', 'go.sum', 'Dockerfile'], +) +local('sh helm/sync.sh helm/library/cortex-shim') + ########### Cortex Bundles docker_build('ghcr.io/cobaltcore-dev/cortex-postgres', 'postgres') @@ -98,6 +110,7 @@ bundle_charts = [ ('helm/bundles/cortex-cinder', 'cortex-cinder'), ('helm/bundles/cortex-ironcore', 'cortex-ironcore'), ('helm/bundles/cortex-pods', 'cortex-pods'), + ('helm/bundles/cortex-placement-shim', 'cortex-placement-shim'), ] dep_charts = { 'cortex-crds': [ @@ -123,6 +136,9 @@ dep_charts = { ('helm/library/cortex-postgres', 'cortex-postgres'), ('helm/library/cortex', 'cortex'), ], + 'cortex-placement-shim': [ + ('helm/library/cortex-shim', 'cortex-shim'), + ], } for (bundle_chart_path, bundle_chart_name) in bundle_charts: @@ -255,6 +271,10 @@ if 'pods' in ACTIVE_DEPLOYMENTS: k8s_yaml('samples/pods/pod.yaml') k8s_resource('test-pod', 
labels=['Cortex-Pods']) +if 'placement' in ACTIVE_DEPLOYMENTS: + print("Activating Cortex Placement Shim bundle") + k8s_yaml(helm('./helm/bundles/cortex-placement-shim', name='cortex-placement-shim', values=tilt_values, set=env_set_overrides)) + ########### Dev Dependencies local('sh helm/sync.sh helm/dev/cortex-prometheus-operator') k8s_yaml(helm('./helm/dev/cortex-prometheus-operator', name='cortex-prometheus-operator')) # Operator diff --git a/cmd/main.go b/cmd/manager/main.go similarity index 100% rename from cmd/main.go rename to cmd/manager/main.go diff --git a/cmd/shim/main.go b/cmd/shim/main.go new file mode 100644 index 000000000..6b0634229 --- /dev/null +++ b/cmd/shim/main.go @@ -0,0 +1,9 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package main + +func main() { + // TODO: this needs scaffolding, for now it just does nothing. + select {} +} diff --git a/helm/bundles/cortex-placement-shim/Chart.yaml b/helm/bundles/cortex-placement-shim/Chart.yaml new file mode 100644 index 000000000..7f53ed347 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: cortex-placement-shim +description: A Helm chart deploying the Cortex placement shim. +type: application +version: 0.0.1 +appVersion: 0.1.0 +dependencies: + # from: file://../../library/cortex-shim + - name: cortex-shim + repository: oci://ghcr.io/cobaltcore-dev/cortex/charts + version: 0.0.1 + # Owner info adds a configmap to the kubernetes cluster with information on + # the service owner. This makes it easier to find out who to contact in case + # of issues. 
See: https://github.com/sapcc/helm-charts/pkgs/container/helm-charts%2Fowner-info + - name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml new file mode 100644 index 000000000..41bf29794 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -0,0 +1,734 @@ +groups: +- name: cortex-nova-alerts + rules: + - alert: CortexNovaSchedulingDown + expr: | + up{pod=~"cortex-nova-scheduling-.*"} != 1 or + absent(up{pod=~"cortex-nova-scheduling-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: critical + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Scheduling for Nova is down" + description: > + The Cortex scheduling service is down. Scheduling requests from Nova will + not be served. This is non-critical for vmware virtual machines, but + blocks kvm virtual machines from being scheduled. Thus, it is + recommended to immediately investigate and resolve the issue. + + - alert: CortexNovaKnowledgeDown + expr: | + up{pod=~"cortex-nova-knowledge-.*"} != 1 or + absent(up{pod=~"cortex-nova-knowledge-.*"}) + for: 5m + labels: + context: liveness + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + playbook: docs/support/playbook/cortex/down + annotations: + summary: "Cortex Knowledge for Nova is down" + description: > + The Cortex Knowledge service is down. This is no immediate problem, + since cortex is still able to process requests, + but the quality of the responses may be affected. 
+ + - alert: CortexNovaDeschedulerPipelineErroring + expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 + for: 5m + labels: + context: descheduler + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Descheduler pipeline is erroring." + description: > + The Cortex descheduler pipeline is encountering errors during its execution. + This may indicate issues with the descheduling logic or the underlying infrastructure. + It is recommended to investigate the descheduler logs and the state of the VMs being processed. + + - alert: CortexNovaHttpRequest400sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 400 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 4xx + errors. This is expected when the scheduling request cannot be served + by Cortex. However, it could also indicate that the request format has + changed and Cortex is unable to parse it. + + - alert: CortexNovaSchedulingHttpRequest500sTooHigh + expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 + for: 5m + labels: + context: api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Nova Scheduler HTTP request 500 errors too high" + description: > + Nova Scheduler is responding to placement requests with HTTP 5xx errors. + This is not expected and indicates that Cortex is having some internal problem. + Nova will continue to place new VMs, but the placement will be less desirable. + Thus, no immediate action is needed. 
+ + - alert: CortexNovaHighMemoryUsage + expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 + for: 5m + labels: + context: memory + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much memory" + description: > + `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it + should use much less, so there may be a memory leak or other changes + that are causing the memory usage to increase significantly. + + - alert: CortexNovaHighCPUUsage + expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 + for: 5m + labels: + context: cpu + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` uses too much CPU" + description: > + `{{$labels.component}}` should not be using more than 50% of a single CPU core. Usually + it should use much less, so there may be a CPU leak or other changes + that are causing the CPU usage to increase significantly. + + - alert: CortexNovaTooManyDBConnectionAttempts + expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 + for: 5m + labels: + context: db + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is trying to connect to the database too often" + description: > + `{{$labels.component}}` is trying to connect to the database too often. This may happen + when the database is down or the connection parameters are misconfigured. 
+ + - alert: CortexNovaSyncNotSuccessful + expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 + for: 5m + labels: + context: syncstatus + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` Sync not successful" + description: > + `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may + happen when the datasource (OpenStack, Prometheus, etc.) is down or + the sync module is misconfigured. No immediate action is needed, since + the sync module will retry the sync operation and the currently synced + data will be kept. However, when this problem persists for a longer + time the service will have a less recent view of the datacenter. + + - alert: CortexNovaSyncObjectsDroppedToZero + expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 + for: 60m + labels: + context: syncobjects + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" + description: > + `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen + when the datasource (OpenStack, Prometheus, etc.) is down or the sync + module is misconfigured. No immediate action is needed, since the sync + module will retry the sync operation and the currently synced data will + be kept. However, when this problem persists for a longer time the + service will have a less recent view of the datacenter. 
+ + - alert: CortexNovaDatasourceUnready + expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: datasources + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the datasource + connectivity or configuration. It is recommended to investigate the + datasource status and logs for more details. + + - alert: CortexNovaKnowledgeUnready + expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: knowledge + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the knowledge + configuration. It is recommended to investigate the + knowledge status and logs for more details. + + - alert: CortexNovaDecisionsWithErrors + expr: cortex_decision_state{domain="nova",state="error"} > 0 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Some decisions are in error state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline generated decisions that are in error state. + This may indicate issues with the decision logic or the underlying infrastructure. + It is recommended to investigate the decision logs and the state of the + VMs being processed. 
+ + - alert: CortexNovaTooManyDecisionsWaiting + expr: cortex_decision_state{domain="nova",state="waiting"} > 10 + for: 5m + labels: + context: decisions + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" + description: > + The cortex scheduling pipeline has a high number of decisions for which + no target host has been assigned yet. + + This may indicate a backlog in processing or issues with the decision logic. + It is recommended to investigate the decision logs and the state of the + VMs being processed. + + - alert: CortexNovaKPIUnready + expr: | + cortex_kpi_state{domain="nova",state!="ready"} != 0 + for: 60m + labels: + context: kpis + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the KPI + configuration. It is recommended to investigate the + KPI status and logs for more details. + + - alert: CortexNovaPipelineUnready + expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 + for: 5m + labels: + context: pipelines + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" + description: > + This may indicate issues with the pipeline + configuration. It is recommended to investigate the + pipeline status and logs for more details. 
+ + # Committed Resource Info API Alerts + - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource info API HTTP 500 errors too high" + description: > + The committed resource info API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems building service info, + such as invalid flavor group data. Limes will not be able to discover available + resources until the issue is resolved. + + # Committed Resource Change API Alerts + - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 400 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may happen when Limes sends a request with + an outdated info version (409), the API is temporarily unavailable, + or the request format is invalid. Limes will typically retry these + requests, so no immediate action is needed unless the errors persist. 
+ + - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API HTTP 500 errors too high" + description: > + The committed resource change API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This is not expected and indicates that Cortex + is having an internal problem processing commitment changes. Limes will + continue to retry, but new commitments may not be fulfilled until the + issue is resolved. + + - alert: CortexNovaCommittedResourceLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API latency too high" + description: > + The committed resource change API (Limes LIQUID integration) is experiencing + high latency (p95 > 30s). This may indicate that the scheduling pipeline + is under heavy load or that reservation scheduling is taking longer than + expected. Limes requests may time out, causing commitment changes to fail. 
+ + - alert: CortexNovaCommittedResourceRejectionRateTooHigh + expr: | + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) + / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource rejection rate too high" + description: > + More than 50% of commitment change requests are being rejected. + This may indicate insufficient capacity in the datacenter to fulfill + new commitments, or issues with the commitment scheduling logic. + Rejected commitments are rolled back, so Limes will see them as failed + and may retry or report the failure to users. + + - alert: CortexNovaCommittedResourceTimeoutsTooHigh + expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource change API timeouts too high" + description: > + The committed resource change API (Limes LIQUID integration) timed out + while waiting for reservations to become ready. This indicates that the + scheduling pipeline is overloaded or reservations are taking too long + to be scheduled. Affected commitment changes are rolled back and Limes + will see them as failed. Consider investigating the scheduler performance + or increasing the timeout configuration. 
+ + # Committed Resource Usage API Alerts + - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 400 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate invalid project IDs or malformed + requests from Limes. Limes will typically retry these requests. + + - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API HTTP 500 errors too high" + description: > + The committed resource usage API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems fetching reservations + or Nova server data. Limes may receive stale or incomplete usage data. + + - alert: CortexNovaCommittedResourceUsageLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource usage API latency too high" + description: > + The committed resource usage API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). 
This may indicate slow Nova API responses or + database queries. Limes scrapes may time out, affecting quota reporting. + + # Committed Resource Capacity API Alerts + - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 400 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 4xx errors. This may indicate malformed requests from Limes. + + - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh + expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API HTTP 500 errors too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is responding + with HTTP 5xx errors. This indicates internal problems calculating cluster + capacity. Limes may receive stale or incomplete capacity data. 
+ + - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh + expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 + for: 5m + labels: + context: committed-resource-api + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource capacity API latency too high" + description: > + The committed resource capacity API (Limes LIQUID integration) is experiencing + high latency (p95 > 5s). This may indicate slow database queries or knowledge + CRD retrieval. Limes scrapes may time out, affecting capacity reporting. + + # Committed Resource Syncer Alerts + - alert: CortexNovaCommittedResourceSyncerErrorsHigh + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer experiencing errors" + description: > + The committed resource syncer has encountered multiple errors in the last hour. + This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
+
+      - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
+        expr: |
+          (
+            sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
+            / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
+          ) > 0.05
+          and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
+        for: 15m
+        labels:
+          context: committed-resource-syncer
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Committed Resource syncer unit mismatch rate >5%"
+          description: >
+            More than 5% of commitments are being skipped due to unit mismatches between
+            Limes and Cortex flavor groups. This happens when Limes has not yet been
+            updated to use the new unit format after a flavor group change. The affected
+            commitments will keep their existing reservations until Limes notices the update.
+            Check the logs if this error persists for a longer time.
+
+      - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
+        expr: |
+          (
+            sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
+            / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
+          ) > 0
+          and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
+        for: 15m
+        labels:
+          context: committed-resource-syncer
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Committed Resource syncer unknown flavor group rate >0%"
+          description: >
+            Some commitments reference flavor groups that don't exist in
+            Cortex Knowledge (anymore). This may indicate that flavor group configuration is
+            out of sync between Limes and Cortex, or that Knowledge extraction is failing.
+ Check the flavor group Knowledge CRD and history to see what was changed. + + - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh + expr: | + ( + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0.01 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + for: 15m + labels: + context: committed-resource-syncer + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Committed Resource syncer local change rate >1%" + description: > + More than 1% of synced commitments are requiring reservation changes + (creates, deletes, or repairs). This is higher than expected for steady-state + operation and may indicate data inconsistencies, external modifications to + reservations, or issues with the CRDs. Check Cortex logs for details. 
+
+      - alert: CortexNovaCommittedResourceSyncerRepairRateHigh
+        expr: |
+          (
+            rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
+            / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
+          ) > 0
+          and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
+        for: 15m
+        labels:
+          context: committed-resource-syncer
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Committed Resource syncer repair rate >0%"
+          description: >
+            Some commitments have reservations that needed repair
+            (wrong metadata like project ID or flavor group). This may indicate data
+            corruption, bugs in reservation creation, or external modifications.
+            Reservations are automatically repaired, but the root cause should be
+            investigated if this alert persists.
+
+      - alert: CortexNovaDoesntFindValidKVMHosts
+        expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
+        for: 5m
+        labels:
+          context: scheduling
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Nova scheduling cannot find valid KVM hosts"
+          description: >
+            Cortex is seeing faulty VMs in `{{$labels.az}}` where Nova scheduling
+            failed to find a valid `{{$labels.hvtype}}` host. This may indicate
+            capacity issues, misconfigured filters, or resource constraints in the
+            datacenter. Investigate the affected VMs and hypervisor availability.
+
+      - alert: CortexNovaNewDatasourcesNotReconciling
+        expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0
+        for: 60m
+        labels:
+          context: datasources
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "New datasource `{{$labels.datasource}}` has not reconciled"
+          description: >
+            A new datasource `{{$labels.datasource}}` has been added but has not
+            completed its first reconciliation yet. This may indicate issues with
+            the datasource controller's workqueue overprioritizing other datasources.
+
+      - alert: CortexNovaExistingDatasourcesLaggingBehind
+        expr: |
+          sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600
+          and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1
+        for: 10m
+        labels:
+          context: datasources
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Existing datasource `{{$labels.datasource}}` is lagging behind"
+          description: >
+            An existing datasource `{{$labels.datasource}}` has been queued for
+            reconciliation for more than 10 minutes. This may indicate issues with
+            the datasource controller's workqueue or that this or another datasource
+            is taking an unusually long time to reconcile.
+
+      - alert: CortexNovaReconcileErrorsHigh
+        expr: |
+          (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m])))
+          / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1
+        for: 15m
+        labels:
+          context: controller-errors
+          dashboard: cortex/cortex
+          service: cortex
+          severity: warning
+          support_group: workload-management
+        annotations:
+          summary: "Controller reconcile error rate >10%"
+          description: >
+            More than 10% of controller reconciles are resulting in errors. 
This may + indicate issues with the controller logic, connectivity problems, or + external factors causing failures. Check the controller logs for error + details and investigate the affected resources. + + - alert: CortexNovaReconcileDurationHigher10Min + expr: | + (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) + / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 + for: 15m + labels: + context: controller-duration + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" + description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" + + - alert: CortexNovaWorkqueueNotDrained + expr: | + sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 + for: 60m + labels: + context: controller-workqueue + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller {{ $labels.name }}'s backlog is not being drained." + description: > + The workqueue for controller {{ $labels.name }} has a backlog that is + not being drained. This may indicate that the controller is overwhelmed + with work or is stuck on certain resources. Check the controller logs + and the state of the resources it manages for more details. 
+ + - alert: CortexNovaWebhookLatencyHigh + expr: | + histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} latency is high" + description: > + The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). + This may indicate performance issues with the webhook server or the logic it executes. + Check the webhook server logs and monitor its resource usage for more insights. + + - alert: CortexNovaWebhookErrorsHigh + expr: | + (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) + / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 + for: 15m + labels: + context: controller-webhook + dashboard: cortex/cortex + service: cortex + severity: warning + support_group: workload-management + annotations: + summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" + description: > + The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. + This may indicate issues with the webhook logic, connectivity problems, or + external factors causing failures. Check the webhook server logs for error + details and investigate the affected resources. 
\ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/alerts.yaml b/helm/bundles/cortex-placement-shim/templates/alerts.yaml new file mode 100644 index 000000000..7db3b96e6 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/alerts.yaml @@ -0,0 +1,17 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.alerts.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: cortex-placement-shim-alerts + labels: + type: alerting-rules + prometheus: {{ required ".Values.alerts.prometheus missing" .Values.alerts.prometheus | quote }} +spec: + {{- $files := .Files.Glob "alerts/*.alerts.yaml" }} + {{- range $path, $file := $files }} + {{ $file | toString | nindent 2 }} + {{- end }} +{{- end }} diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml new file mode 100644 index 000000000..489878c89 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: cortex-placement-shim-role-hypervisor +rules: +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: + - kvm.cloud.sap + resources: + - hypervisors/status + verbs: + - get \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..0388373f9 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,14 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: cortex-placement-shim-rolebinding-hypervisor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cortex-placement-shim-role-hypervisor +subjects: +- kind: ServiceAccount + name: cortex-placement-shim + namespace: {{ .Release.Namespace }} \ No newline at end of file diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml new file mode 100644 index 000000000..40aa9cb11 --- /dev/null +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -0,0 +1,23 @@ +# Copyright SAP SE +# SPDX-License-Identifier: Apache-2.0 + +owner-info: + enabled: true + helm-chart-url: "https://github.com/cobaltcore-dev/cortex/helm/bundles/cortex-placement-shim" + maintainers: + - "arno.uhlig@sap.com" + - "julius.clausnitzer@sap.com" + - "malte.viering@sap.com" + - "marcel.gute@sap.com" + - "markus.wieland@sap.com" + - "p.matthes@sap.com" + support-group: "workload-management" + service: "cortex-placement-shim" + +alerts: + enabled: true + prometheus: openstack + +cortex-shim: + namePrefix: cortex-placement + conf: {} # TODO diff --git a/helm/library/cortex-shim/Chart.lock b/helm/library/cortex-shim/Chart.lock new file mode 100644 index 000000000..db4c5823b --- /dev/null +++ b/helm/library/cortex-shim/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: owner-info + repository: oci://ghcr.io/sapcc/helm-charts + version: 1.0.0 +digest: sha256:7643f231cc4ebda347fd12ec62fe4445c280e2b71d27eec555f3025290f5038f +generated: "2025-08-26T10:55:05.888651+02:00" diff --git a/helm/library/cortex-shim/Chart.yaml b/helm/library/cortex-shim/Chart.yaml new file mode 100644 index 000000000..5282dc655 --- /dev/null +++ b/helm/library/cortex-shim/Chart.yaml @@ -0,0 +1,8 @@ +apiVersion: v2 +name: cortex-shim +description: A Helm chart to distribute cortex shims. 
+type: application
+version: 0.0.1
+appVersion: "sha-3e56acea"
+icon: "https://example.com/icon.png"
+dependencies: []
diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl
new file mode 100644
index 000000000..782e14eef
--- /dev/null
+++ b/helm/library/cortex-shim/templates/_helpers.tpl
@@ -0,0 +1,50 @@
+{{- define "chart.name" -}}
+{{- if .Chart }}
+  {{- if .Chart.Name }}
+    {{- .Chart.Name | trunc 63 | trimSuffix "-" }}
+  {{- else if .Values.nameOverride }}
+    {{ .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+  {{- else }}
+    scheduling
+  {{- end }}
+{{- else }}
+  scheduling
+{{- end }}
+{{- end }}
+
+
+{{- define "chart.labels" -}}
+{{- if .Chart.AppVersion -}}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+{{- if .Chart.Version }}
+helm.sh/chart: {{ .Chart.Version | quote }}
+{{- end }}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+
+{{- define "chart.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "chart.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+
+{{- define "chart.hasMutatingWebhooks" -}}
+{{- $hasMutating := false }}
+{{- range . }}
+  {{- if eq .type "mutating" }}
+  {{- $hasMutating = true }}{{- end }}
+{{- end }}
+{{ $hasMutating }}{{- end }}
+
+
+{{- define "chart.hasValidatingWebhooks" -}}
+{{- $hasValidating := false }}
+{{- range . }}
+  {{- if eq .type "validating" }}
+  {{- $hasValidating = true }}{{- end }}
+{{- end }}
+{{ $hasValidating }}{{- end }}
diff --git a/helm/library/cortex-shim/templates/clusterrole.yaml b/helm/library/cortex-shim/templates/clusterrole.yaml
new file mode 100644
index 000000000..74f8e7ad4
--- /dev/null
+++ b/helm/library/cortex-shim/templates/clusterrole.yaml
@@ -0,0 +1,100 @@
+# Roles that grant the shims access to cortex crds. 
+{{- if .Values.rbac.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-role +rules: +- apiGroups: + - cortex.cloud + resources: + - knowledges + - datasources + - reservations + - decisions + - deschedulings + - pipelines + - kpis + - histories + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cortex.cloud + resources: + - knowledges/finalizers + - datasources/finalizers + - reservations/finalizers + - decisions/finalizers + - deschedulings/finalizers + - pipelines/finalizers + - kpis/finalizers + - histories/finalizers + verbs: + - update +- apiGroups: + - cortex.cloud + resources: + - knowledges/status + - datasources/status + - reservations/status + - decisions/status + - deschedulings/status + - pipelines/status + - kpis/status + - histories/status + verbs: + - get + - patch + - update +- apiGroups: + - events.k8s.io + resources: + - events + verbs: + - create + - patch +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-reader +rules: +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-role +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/clusterrolebinding.yaml b/helm/library/cortex-shim/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..ca82a0119 --- /dev/null +++ b/helm/library/cortex-shim/templates/clusterrolebinding.yaml @@ -0,0 +1,34 @@ +{{- if .Values.rbac.enable }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-shim-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} +{{- if and .Values.rbac.enable .Values.metrics.enable }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-metrics-auth-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Values.namePrefix }}-metrics-auth-role +subjects: +- kind: ServiceAccount + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} + diff --git a/helm/library/cortex-shim/templates/deployment.yaml b/helm/library/cortex-shim/templates/deployment.yaml new file mode 100644 index 000000000..b38eb3c02 --- /dev/null +++ b/helm/library/cortex-shim/templates/deployment.yaml @@ -0,0 +1,112 @@ +# This file is safe from kubebuilder edit --plugins=helm/v1-alpha +# If you want to re-generate, add the --force flag. 
+ +{{- if .Values.deployment.enable }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Values.namePrefix }}-shim + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.deployment.replicas }} + selector: + matchLabels: + {{- include "chart.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: shim + labels: + {{- include "chart.labels" . | nindent 8 }} + {{- if and .Values.deployment.pod .Values.deployment.pod.labels }} + {{- range $key, $value := .Values.deployment.pod.labels }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + spec: + containers: + - name: shim + args: + {{- range .Values.deployment.container.args }} + - {{ . }} + {{- end }} + ports: + - name: api + containerPort: 8080 + protocol: TCP + - name: metrics + containerPort: 2112 + protocol: TCP + command: + - /main + image: {{ .Values.deployment.container.image.repository }}:{{ .Values.deployment.container.image.tag | default .Chart.AppVersion }} + {{- if .Values.deployment.container.image.pullPolicy }} + imagePullPolicy: {{ .Values.deployment.container.image.pullPolicy }} + {{- end }} + {{- if .Values.deployment.container.env }} + env: + {{- range $key, $value := .Values.deployment.container.env }} + - name: {{ $key }} + value: {{ $value }} + {{- end }} + {{- end }} + livenessProbe: + {{- toYaml .Values.deployment.container.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.deployment.container.readinessProbe | nindent 12 }} + resources: + {{- toYaml .Values.deployment.container.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.deployment.container.securityContext | nindent 12 }} + volumeMounts: + - name: shim-config-volume + mountPath: /etc/config + - name: shim-secrets-volume + mountPath: /etc/secrets + readOnly: true + securityContext: + {{- toYaml .Values.deployment.securityContext | nindent 8 }} + serviceAccountName: 
{{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + terminationGracePeriodSeconds: {{ .Values.deployment.terminationGracePeriodSeconds }} + volumes: + # Custom values to configure the shim. + - name: shim-config-volume + configMap: + name: {{ .Values.namePrefix }}-shim-config + - name: shim-secrets-volume + secret: + secretName: {{ .Values.namePrefix }}-shim-secrets +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.namePrefix }}-shim-config +data: + conf.json: |- + {{- $mergedConf := dict }} + {{- if .Values.global.conf }} + {{- $mergedConf = .Values.global.conf }} + {{- end }} + {{- if .Values.conf }} + {{- $mergedConf = mergeOverwrite .Values.conf $mergedConf }} + {{- end }} + {{ toJson $mergedConf }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.namePrefix }}-shim-secrets +type: Opaque +data: + secrets.json: |- + {{- $mergedSecrets := dict }} + {{- if .Values.global.secrets }} + {{- $mergedSecrets = .Values.global.secrets }} + {{- end }} + {{- if .Values.secrets }} + {{- $mergedSecrets = mergeOverwrite .Values.secrets $mergedSecrets }} + {{- end }} + {{ toJson $mergedSecrets | b64enc }} +{{- end }} \ No newline at end of file diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml new file mode 100644 index 000000000..549ceed95 --- /dev/null +++ b/helm/library/cortex-shim/templates/service.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + ports: + - port: 8080 + targetPort: api + protocol: TCP + name: api + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- if .Values.metrics.enable }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ .Values.namePrefix }}-shim-metrics-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "chart.labels" . 
| nindent 4 }} +spec: + ports: + - port: 2112 + targetPort: metrics + protocol: TCP + name: metrics + selector: + app.kubernetes.io/name: {{ include "chart.name" . }} +{{- end }} diff --git a/helm/library/cortex-shim/templates/serviceaccount.yaml b/helm/library/cortex-shim/templates/serviceaccount.yaml new file mode 100644 index 000000000..ea0789dd0 --- /dev/null +++ b/helm/library/cortex-shim/templates/serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if .Values.rbac.enable }} +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + {{- if and .Values.deployment.serviceAccount .Values.deployment.serviceAccount.annotations }} + annotations: + {{- range $key, $value := .Values.deployment.serviceAccount.annotations }} + {{ $key }}: {{ $value }} + {{- end }} + {{- end }} + name: {{ .Values.namePrefix }}-{{ .Values.deployment.serviceAccountName }} + namespace: {{ .Release.Namespace }} +{{- end -}} diff --git a/helm/library/cortex-shim/templates/servicemonitor.yaml b/helm/library/cortex-shim/templates/servicemonitor.yaml new file mode 100644 index 000000000..803e66dd5 --- /dev/null +++ b/helm/library/cortex-shim/templates/servicemonitor.yaml @@ -0,0 +1,16 @@ +# To integrate with Prometheus. +{{- if .Values.prometheus.enable }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + {{- include "chart.labels" . | nindent 4 }} + name: {{ .Values.namePrefix }}-shim-metrics-monitor + namespace: {{ .Release.Namespace }} +spec: + endpoints: + - port: metrics + selector: + matchLabels: + app.kubernetes.io/name: {{ include "chart.name" . 
}} +{{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml new file mode 100644 index 000000000..6434e823a --- /dev/null +++ b/helm/library/cortex-shim/values.yaml @@ -0,0 +1,68 @@ +deployment: + enable: true + replicas: 1 + container: + image: + repository: ghcr.io/cobaltcore-dev/cortex-shim + args: + - "--metrics-bind-address=:2112" + - "--health-probe-bind-address=:8081" + - "--metrics-secure=false" + resources: + limits: + cpu: 500m + memory: 2048Mi + requests: + cpu: 10m + memory: 64Mi + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 20 + httpGet: + path: /healthz + port: 8081 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + httpGet: + path: /readyz + port: 8081 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationGracePeriodSeconds: 10 + serviceAccountName: shim + +# [METRICS]: Set to true to generate manifests for exporting metrics. +# To disable metrics export set false, and ensure that the +# ControllerManager argument "--metrics-bind-address=:8443" is removed. +metrics: + enable: true + +# [RBAC]: To enable RBAC (Permissions) configurations +rbac: + enable: true + +# [PROMETHEUS]: To enable a ServiceMonitor to export metrics to Prometheus set true +prometheus: + enable: true + +global: + conf: {} + +# Use this to unambiguate multiple cortex deployments in the same cluster. +namePrefix: cortex +conf: + # The scheduling domain this operator is responsible for. + schedulingDomain: cortex + # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) + leaderElectionID: cortex-unknown + enabledControllers: + # The explanation controller is available for all decision resources. 
+ - explanation-controller diff --git a/helm/library/cortex/templates/manager/manager.yaml b/helm/library/cortex/templates/manager/manager.yaml index 73672164f..0c9f362aa 100644 --- a/helm/library/cortex/templates/manager/manager.yaml +++ b/helm/library/cortex/templates/manager/manager.yaml @@ -51,7 +51,7 @@ spec: protocol: TCP {{- end }} command: - - /manager + - /main image: {{ .Values.controllerManager.container.image.repository }}:{{ .Values.controllerManager.container.image.tag | default .Chart.AppVersion }} {{- if .Values.controllerManager.container.image.pullPolicy }} imagePullPolicy: {{ .Values.controllerManager.container.image.pullPolicy }} diff --git a/internal/shim/placement/.gitkeep b/internal/shim/placement/.gitkeep new file mode 100644 index 000000000..e69de29bb From 1f7b644c1cd15776b62cf2f4b686dae30ec46d2a Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 11:56:31 +0200 Subject: [PATCH 02/17] Add monitoring labels and scaffold manager (w/o leader election) --- cmd/shim/main.go | 248 +++++++++++++++++- .../bundles/cortex-placement-shim/values.yaml | 6 +- helm/library/cortex-shim/values.yaml | 9 +- 3 files changed, 252 insertions(+), 11 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 6b0634229..d59490c3c 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -3,7 +3,251 @@ package main +import ( + "context" + "crypto/tls" + "errors" + "flag" + "net/http" + "os" + "path/filepath" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/monitoring" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/sapcc/go-bits/httpext" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/certwatcher" + "sigs.k8s.io/controller-runtime/pkg/healthz" + 
"sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" +) + +var ( + // Scheme defines the scheme for the API types used by the shim. + scheme = runtime.NewScheme() + // setupLog is the logger used for setup operations in the shim. + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + // Bind the Kubernetes client-go scheme and the custom API types to the + // scheme used by the shim. + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha1.AddToScheme(scheme)) // Cortex crds + utilruntime.Must(hv1.AddToScheme(scheme)) // Hypervisor crd +} + func main() { - // TODO: this needs scaffolding, for now it just does nothing. - select {} + ctx := context.Background() + restConfig := ctrl.GetConfigOrDie() + + var metricsAddr string + var metricsCertPath, metricsCertName, metricsCertKey string + var webhookCertPath, webhookCertName, webhookCertKey string + // The shim does not require leader election, but this flag is provided to + // stay consistent with the kubebuilder scaffold. + var enableLeaderElection bool + var probeAddr string + var secureMetrics bool + var enableHTTP2 bool + var tlsOpts []func(*tls.Config) + flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ + "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") + flag.BoolVar(&enableLeaderElection, "leader-elect", false, + "Enable leader election for controller manager. "+ + "Enabling this will ensure there is only one active controller manager.") + flag.BoolVar(&secureMetrics, "metrics-secure", true, + "If set, the metrics endpoint is served securely via HTTPS. 
Use --metrics-secure=false to use HTTP instead.") + flag.StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") + flag.StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") + flag.StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") + flag.StringVar(&metricsCertPath, "metrics-cert-path", "", + "The directory that contains the metrics server certificate.") + flag.StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") + flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") + flag.BoolVar(&enableHTTP2, "enable-http2", false, + "If set, HTTP/2 will be enabled for the metrics and webhook servers") + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + // Check that we're really running this shim without leader election enabled. + if enableLeaderElection { + err := errors.New("leader election should not be enabled for the shim") + setupLog.Error(err, "invalid configuration") + os.Exit(1) + } + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // if the enable-http2 flag is false (the default), http/2 should be disabled + // due to its vulnerabilities. More specifically, disabling http/2 will + // prevent from being vulnerable to the HTTP/2 Stream Cancellation and + // Rapid Reset CVEs. 
For more information see: + // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 + // - https://github.com/advisories/GHSA-4374-p667-p6c8 + disableHTTP2 := func(c *tls.Config) { + setupLog.Info("disabling http/2") + c.NextProtos = []string{"http/1.1"} + } + + if !enableHTTP2 { + tlsOpts = append(tlsOpts, disableHTTP2) + } + + // Create watchers for metrics and webhooks certificates + var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher + + // Initial webhook TLS options + webhookTLSOpts := tlsOpts + + if webhookCertPath != "" { + setupLog.Info("Initializing webhook certificate watcher using provided certificates", + "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) + + var err error + webhookCertWatcher, err = certwatcher.New( + filepath.Join(webhookCertPath, webhookCertName), + filepath.Join(webhookCertPath, webhookCertKey), + ) + if err != nil { + setupLog.Error(err, "Failed to initialize webhook certificate watcher") + os.Exit(1) + } + + webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { + config.GetCertificate = webhookCertWatcher.GetCertificate + }) + } + + webhookServer := webhook.NewServer(webhook.Options{ + TLSOpts: webhookTLSOpts, + }) + + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: metricsAddr, + SecureServing: secureMetrics, + TLSOpts: tlsOpts, + } + + if secureMetrics { + // FilterProvider is used to protect the metrics endpoint with authn/authz. + // These configurations ensure that only authorized users and service accounts + // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. 
More info: + // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/metrics/filters#WithAuthenticationAndAuthorization + metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization + } + + // If the certificate is not specified, controller-runtime will automatically + // generate self-signed certificates for the metrics server. While convenient for development and testing, + // this setup is not recommended for production. + // + // If you enable certManager, uncomment the following lines: + // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates + // managed by cert-manager for the metrics server. + // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. + if metricsCertPath != "" { + setupLog.Info("Initializing metrics certificate watcher using provided certificates", + "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) + + var err error + metricsCertWatcher, err = certwatcher.New( + filepath.Join(metricsCertPath, metricsCertName), + filepath.Join(metricsCertPath, metricsCertKey), + ) + if err != nil { + setupLog.Error(err, "Failed to initialize metrics certificate watcher") + os.Exit(1) + } + + metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { + config.GetCertificate = metricsCertWatcher.GetCertificate + }) + } + + mgr, err := ctrl.NewManager(restConfig, ctrl.Options{ + Scheme: scheme, + Metrics: metricsServerOptions, + WebhookServer: webhookServer, + HealthProbeBindAddress: probeAddr, + // Kept for consistency with kubebuilder scaffold, but the shim should + // always run with leader election disabled. + LeaderElection: enableLeaderElection, + }) + if err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + + // TODO: Initialize multicluster client here. 
+ + // Our custom monitoring registry can add prometheus labels to all metrics. + // This is useful to distinguish metrics from different deployments. + metricsConfig := conf.GetConfigOrDie[monitoring.Config]() + metrics.Registry = monitoring.WrapRegistry(metrics.Registry, metricsConfig) + + // API endpoint. + mux := http.NewServeMux() + + // +kubebuilder:scaffold:builder + + if metricsCertWatcher != nil { + setupLog.Info("Adding metrics certificate watcher to manager") + if err := mgr.Add(metricsCertWatcher); err != nil { + setupLog.Error(err, "unable to add metrics certificate watcher to manager") + os.Exit(1) + } + } + + if webhookCertWatcher != nil { + setupLog.Info("Adding webhook certificate watcher to manager") + if err := mgr.Add(webhookCertWatcher); err != nil { + setupLog.Error(err, "unable to add webhook certificate watcher to manager") + os.Exit(1) + } + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) + } + + errchan := make(chan error) + go func() { + errchan <- func() error { + setupLog.Info("starting api server", "address", ":8080") + return httpext.ListenAndServeContext(ctx, ":8080", mux) + }() + }() + go func() { + if err := <-errchan; err != nil { + setupLog.Error(err, "problem running api server") + os.Exit(1) + } + }() + + setupLog.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "problem running manager") + os.Exit(1) + } } diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml index 40aa9cb11..6dd793653 100644 --- a/helm/bundles/cortex-placement-shim/values.yaml +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -20,4 +20,8 @@ alerts: cortex-shim: namePrefix: cortex-placement - conf: {} # TODO + 
conf: + monitoring: + labels: + github_org: cobaltcore-dev + github_repo: cortex diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 6434e823a..1c45c2542 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -58,11 +58,4 @@ global: # Use this to unambiguate multiple cortex deployments in the same cluster. namePrefix: cortex -conf: - # The scheduling domain this operator is responsible for. - schedulingDomain: cortex - # Used to differentiate different cortex deployments in the same cluster (e.g. leader election ID) - leaderElectionID: cortex-unknown - enabledControllers: - # The explanation controller is available for all decision resources. - - explanation-controller +conf: {} # No config for now that's needed by all the shims. From 03fc00764340f722b9992c8062e573182ec0eeca Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 12:01:19 +0200 Subject: [PATCH 03/17] Remove alerts --- .../alerts/placement-shim.alerts.yaml | 735 +----------------- 1 file changed, 2 insertions(+), 733 deletions(-) diff --git a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml index 41bf29794..03aea7763 100644 --- a/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml +++ b/helm/bundles/cortex-placement-shim/alerts/placement-shim.alerts.yaml @@ -1,734 +1,3 @@ groups: -- name: cortex-nova-alerts - rules: - - alert: CortexNovaSchedulingDown - expr: | - up{pod=~"cortex-nova-scheduling-.*"} != 1 or - absent(up{pod=~"cortex-nova-scheduling-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: critical - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Scheduling for Nova is down" - description: > - The Cortex scheduling service is down. 
Scheduling requests from Nova will - not be served. This is non-critical for vmware virtual machines, but - blocks kvm virtual machines from being scheduled. Thus, it is - recommended to immediately investigate and resolve the issue. - - - alert: CortexNovaKnowledgeDown - expr: | - up{pod=~"cortex-nova-knowledge-.*"} != 1 or - absent(up{pod=~"cortex-nova-knowledge-.*"}) - for: 5m - labels: - context: liveness - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - playbook: docs/support/playbook/cortex/down - annotations: - summary: "Cortex Knowledge for Nova is down" - description: > - The Cortex Knowledge service is down. This is no immediate problem, - since cortex is still able to process requests, - but the quality of the responses may be affected. - - - alert: CortexNovaDeschedulerPipelineErroring - expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0 - for: 5m - labels: - context: descheduler - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Descheduler pipeline is erroring." - description: > - The Cortex descheduler pipeline is encountering errors during its execution. - This may indicate issues with the descheduling logic or the underlying infrastructure. - It is recommended to investigate the descheduler logs and the state of the VMs being processed. - - - alert: CortexNovaHttpRequest400sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 400 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 4xx - errors. 
This is expected when the scheduling request cannot be served - by Cortex. However, it could also indicate that the request format has - changed and Cortex is unable to parse it. - - - alert: CortexNovaSchedulingHttpRequest500sTooHigh - expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+" }[5m]) > 0.1 - for: 5m - labels: - context: api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova Scheduler HTTP request 500 errors too high" - description: > - Nova Scheduler is responding to placement requests with HTTP 5xx errors. - This is not expected and indicates that Cortex is having some internal problem. - Nova will continue to place new VMs, but the placement will be less desirable. - Thus, no immediate action is needed. - - - alert: CortexNovaHighMemoryUsage - expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024 - for: 5m - labels: - context: memory - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much memory" - description: > - `{{$labels.component}}` should not be using more than 6000 MiB of memory. Usually it - should use much less, so there may be a memory leak or other changes - that are causing the memory usage to increase significantly. - - - alert: CortexNovaHighCPUUsage - expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5 - for: 5m - labels: - context: cpu - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` uses too much CPU" - description: > - `{{$labels.component}}` should not be using more than 50% of a single CPU core. 
Usually - it should use much less, so there may be a CPU leak or other changes - that are causing the CPU usage to increase significantly. - - - alert: CortexNovaTooManyDBConnectionAttempts - expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1 - for: 5m - labels: - context: db - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is trying to connect to the database too often" - description: > - `{{$labels.component}}` is trying to connect to the database too often. This may happen - when the database is down or the connection parameters are misconfigured. - - - alert: CortexNovaSyncNotSuccessful - expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0 - for: 5m - labels: - context: syncstatus - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` Sync not successful" - description: > - `{{$labels.component}}` experienced an issue syncing data from the datasource `{{$labels.datasource}}`. This may - happen when the datasource (OpenStack, Prometheus, etc.) is down or - the sync module is misconfigured. No immediate action is needed, since - the sync module will retry the sync operation and the currently synced - data will be kept. However, when this problem persists for a longer - time the service will have a less recent view of the datacenter. 
- - - alert: CortexNovaSyncObjectsDroppedToZero - expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0 - for: 60m - labels: - context: syncobjects - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`" - description: > - `{{$labels.component}}` is not syncing any objects from the datasource `{{$labels.datasource}}`. This may happen - when the datasource (OpenStack, Prometheus, etc.) is down or the sync - module is misconfigured. No immediate action is needed, since the sync - module will retry the sync operation and the currently synced data will - be kept. However, when this problem persists for a longer time the - service will have a less recent view of the datacenter. - - - alert: CortexNovaDatasourceUnready - expr: cortex_datasource_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the datasource - connectivity or configuration. It is recommended to investigate the - datasource status and logs for more details. - - - alert: CortexNovaKnowledgeUnready - expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: knowledge - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the knowledge - configuration. It is recommended to investigate the - knowledge status and logs for more details. 
- - - alert: CortexNovaDecisionsWithErrors - expr: cortex_decision_state{domain="nova",state="error"} > 0 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Some decisions are in error state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline generated decisions that are in error state. - This may indicate issues with the decision logic or the underlying infrastructure. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaTooManyDecisionsWaiting - expr: cortex_decision_state{domain="nova",state="waiting"} > 10 - for: 5m - labels: - context: decisions - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`" - description: > - The cortex scheduling pipeline has a high number of decisions for which - no target host has been assigned yet. - - This may indicate a backlog in processing or issues with the decision logic. - It is recommended to investigate the decision logs and the state of the - VMs being processed. - - - alert: CortexNovaKPIUnready - expr: | - cortex_kpi_state{domain="nova",state!="ready"} != 0 - for: 60m - labels: - context: kpis - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the KPI - configuration. It is recommended to investigate the - KPI status and logs for more details. 
- - - alert: CortexNovaPipelineUnready - expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0 - for: 5m - labels: - context: pipelines - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state" - description: > - This may indicate issues with the pipeline - configuration. It is recommended to investigate the - pipeline status and logs for more details. - - # Committed Resource Info API Alerts - - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource info API HTTP 500 errors too high" - description: > - The committed resource info API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems building service info, - such as invalid flavor group data. Limes will not be able to discover available - resources until the issue is resolved. - - # Committed Resource Change API Alerts - - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 400 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 4xx errors. 
This may happen when Limes sends a request with - an outdated info version (409), the API is temporarily unavailable, - or the request format is invalid. Limes will typically retry these - requests, so no immediate action is needed unless the errors persist. - - - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API HTTP 500 errors too high" - description: > - The committed resource change API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This is not expected and indicates that Cortex - is having an internal problem processing commitment changes. Limes will - continue to retry, but new commitments may not be fulfilled until the - issue is resolved. - - - alert: CortexNovaCommittedResourceLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API latency too high" - description: > - The committed resource change API (Limes LIQUID integration) is experiencing - high latency (p95 > 30s). This may indicate that the scheduling pipeline - is under heavy load or that reservation scheduling is taking longer than - expected. Limes requests may time out, causing commitment changes to fail. 
- - - alert: CortexNovaCommittedResourceRejectionRateTooHigh - expr: | - sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) - / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource rejection rate too high" - description: > - More than 50% of commitment change requests are being rejected. - This may indicate insufficient capacity in the datacenter to fulfill - new commitments, or issues with the commitment scheduling logic. - Rejected commitments are rolled back, so Limes will see them as failed - and may retry or report the failure to users. - - - alert: CortexNovaCommittedResourceTimeoutsTooHigh - expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource change API timeouts too high" - description: > - The committed resource change API (Limes LIQUID integration) timed out - while waiting for reservations to become ready. This indicates that the - scheduling pipeline is overloaded or reservations are taking too long - to be scheduled. Affected commitment changes are rolled back and Limes - will see them as failed. Consider investigating the scheduler performance - or increasing the timeout configuration. 
- - # Committed Resource Usage API Alerts - - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 400 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate invalid project IDs or malformed - requests from Limes. Limes will typically retry these requests. - - - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API HTTP 500 errors too high" - description: > - The committed resource usage API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems fetching reservations - or Nova server data. Limes may receive stale or incomplete usage data. - - - alert: CortexNovaCommittedResourceUsageLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource usage API latency too high" - description: > - The committed resource usage API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). 
This may indicate slow Nova API responses or - database queries. Limes scrapes may time out, affecting quota reporting. - - # Committed Resource Capacity API Alerts - - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 400 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 4xx errors. This may indicate malformed requests from Limes. - - - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh - expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API HTTP 500 errors too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is responding - with HTTP 5xx errors. This indicates internal problems calculating cluster - capacity. Limes may receive stale or incomplete capacity data. 
- - - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh - expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5 - for: 5m - labels: - context: committed-resource-api - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource capacity API latency too high" - description: > - The committed resource capacity API (Limes LIQUID integration) is experiencing - high latency (p95 > 5s). This may indicate slow database queries or knowledge - CRD retrieval. Limes scrapes may time out, affecting capacity reporting. - - # Committed Resource Syncer Alerts - - alert: CortexNovaCommittedResourceSyncerErrorsHigh - expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 - for: 5m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer experiencing errors" - description: > - The committed resource syncer has encountered multiple errors in the last hour. - This may indicate connectivity issues with Limes. Check the syncer logs for error details. 
- - - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0.05 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unit mismatch rate >5%" - description: > - More than 5% of commitments are being skipped due to unit mismatches between - Limes and Cortex flavor groups. This happens when Limes has not yet been - updated to use the new unit format after a flavor group change. The affected - commitments will keep their existing reservations until Limes notices the update. - Check the logs if this error persists for longer time. - - - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unknown flavor group rate >0%" - description: > - Some commitments reference flavor groups that don't exist in - Cortex Knowledge (anymore). This may indicate that flavor group configuration is - out of sync between Limes and Cortex, or that Knowledge extraction is failing. 
- Check the flavor group Knowledge CRD and history to see what was changed. - - - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh - expr: | - ( - ( - rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0.01 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer local change rate >1%" - description: > - More than 1% of synced commitments are requiring reservation changes - (creates, deletes, or repairs). This is higher than expected for steady-state - operation and may indicate data inconsistencies, external modifications to - reservations, or issues with the CRDs. Check Cortex logs for details. 
- - - alert: CortexNovaCommittedResourceSyncerRepairRateHigh - expr: | - ( - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer repair rate >0%" - description: > - Some commitments have reservations that needed repair - (wrong metadata like project ID or flavor group). This may indicate data - corruption, bugs in reservation creation, or external modifications. - Reservations are automatically repaired, but the root cause should be - investigated if this alert persists. - - - alert: CortexNovaDoesntFindValidKVMHosts - expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 - for: 5m - labels: - context: scheduling - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Nova scheduling cannot find valid KVM hosts" - description: > - Cortex is seeing faulty vms in `{{$labels.az}}` where Nova scheduling - failed to find a valid `{{$labels.hvtype}}` host. This may indicate - capacity issues, misconfigured filters, or resource constraints in the - datacenter. Investigate the affected VMs and hypervisor availability. 
- - - alert: CortexNovaNewDatasourcesNotReconciling - expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0 - for: 60m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "New datasource `{{$labels.datasource}}` has not reconciled" - description: > - A new datasource `{{$labels.datasource}}` has been added but has not - completed its first reconciliation yet. This may indicate issues with - the datasource controller's workqueue overprioritizing other datasources. - - - alert: CortexNovaExistingDatasourcesLackingBehind - expr: | - sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600 - and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1 - for: 10m - labels: - context: datasources - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Existing datasource `{{$labels.datasource}}` is lacking behind" - description: > - An existing datasource `{{$labels.datasource}}` has been queued for - reconciliation for more than 10 minutes. This may indicate issues with - the datasource controller's workqueue or that this or another datasource - is taking an unusually long time to reconcile. - - - alert: CortexNovaReconcileErrorsHigh - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-errors - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconcile error rate >10%" - description: > - More than 10% of controller reconciles are resulting in errors. 
This may - indicate issues with the controller logic, connectivity problems, or - external factors causing failures. Check the controller logs for error - details and investigate the affected resources. - - - alert: CortexNovaReconcileDurationHigher10Min - expr: | - (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m]))) - / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600 - for: 15m - labels: - context: controller-duration - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})" - description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}" - - - alert: CortexNovaWorkqueueNotDrained - expr: | - sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0 - for: 60m - labels: - context: controller-workqueue - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller {{ $labels.name }}'s backlog is not being drained." - description: > - The workqueue for controller {{ $labels.name }} has a backlog that is - not being drained. This may indicate that the controller is overwhelmed - with work or is stuck on certain resources. Check the controller logs - and the state of the resources it manages for more details. 
- - - alert: CortexNovaWebhookLatencyHigh - expr: | - histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} latency is high" - description: > - The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms). - This may indicate performance issues with the webhook server or the logic it executes. - Check the webhook server logs and monitor its resource usage for more insights. - - - alert: CortexNovaWebhookErrorsHigh - expr: | - (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m]))) - / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1 - for: 15m - labels: - context: controller-webhook - dashboard: cortex/cortex - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Controller webhook {{ $labels.webhook }} is experiencing errors" - description: > - The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes. - This may indicate issues with the webhook logic, connectivity problems, or - external factors causing failures. Check the webhook server logs for error - details and investigate the affected resources. 
\ No newline at end of file +- name: cortex-placement-shim-alerts + rules: [] \ No newline at end of file From 154338f9560e497ba9ff9191257f2dd79311a6c7 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:20:52 +0200 Subject: [PATCH 04/17] PR feedback --- .github/workflows/push-images.yaml | 7 ++++++- cmd/shim/main.go | 9 ++++----- helm/library/cortex-shim/templates/_helpers.tpl | 12 ++++++++---- helm/library/cortex-shim/templates/service.yaml | 4 ++-- helm/library/cortex-shim/values.yaml | 4 ++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/push-images.yaml b/.github/workflows/push-images.yaml index 3085b503b..f3be685ce 100644 --- a/.github/workflows/push-images.yaml +++ b/.github/workflows/push-images.yaml @@ -81,6 +81,11 @@ jobs: files: | cmd/shim/** internal/shim/** + api/** + pkg/** + go.mod + go.sum + Dockerfile - name: Docker Meta (Cortex Shim) if: steps.changed_shim_files.outputs.all_changed_files != '' id: meta_cortex_shim @@ -99,7 +104,7 @@ jobs: id: push_cortex_shim uses: docker/build-push-action@v7 with: - context: cmd/shim + context: . platforms: linux/amd64,linux/arm64 push: true tags: ${{ steps.meta_cortex_shim.outputs.tags }} diff --git a/cmd/shim/main.go b/cmd/shim/main.go index d59490c3c..970c8c934 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -4,7 +4,6 @@ package main import ( - "context" "crypto/tls" "errors" "flag" @@ -46,7 +45,7 @@ func init() { } func main() { - ctx := context.Background() + ctx := ctrl.SetupSignalHandler() restConfig := ctrl.GetConfigOrDie() var metricsAddr string @@ -110,7 +109,7 @@ func main() { var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher // Initial webhook TLS options - webhookTLSOpts := tlsOpts + webhookTLSOpts := append([]func(*tls.Config){}, tlsOpts...) 
if webhookCertPath != "" { setupLog.Info("Initializing webhook certificate watcher using provided certificates", @@ -142,7 +141,7 @@ func main() { metricsServerOptions := metricsserver.Options{ BindAddress: metricsAddr, SecureServing: secureMetrics, - TLSOpts: tlsOpts, + TLSOpts: append([]func(*tls.Config){}, tlsOpts...), } if secureMetrics { @@ -246,7 +245,7 @@ func main() { }() setupLog.Info("starting manager") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) } diff --git a/helm/library/cortex-shim/templates/_helpers.tpl b/helm/library/cortex-shim/templates/_helpers.tpl index 782e14eef..cca33d701 100644 --- a/helm/library/cortex-shim/templates/_helpers.tpl +++ b/helm/library/cortex-shim/templates/_helpers.tpl @@ -36,15 +36,19 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- $hasMutating := false }} {{- range . }} {{- if eq .type "mutating" }} - $hasMutating = true }}{{- end }} + {{- $hasMutating = true -}} + {{- end }} +{{- end }} +{{ $hasMutating }} {{- end }} -{{ $hasMutating }}}}{{- end }} {{- define "chart.hasValidatingWebhooks" -}} {{- $hasValidating := false }} {{- range . }} {{- if eq .type "validating" }} - $hasValidating = true }}{{- end }} + {{- $hasValidating = true -}} + {{- end }} +{{- end }} +{{ $hasValidating }} {{- end }} -{{ $hasValidating }}}}{{- end }} diff --git a/helm/library/cortex-shim/templates/service.yaml b/helm/library/cortex-shim/templates/service.yaml index 549ceed95..faf3082a3 100644 --- a/helm/library/cortex-shim/templates/service.yaml +++ b/helm/library/cortex-shim/templates/service.yaml @@ -12,7 +12,7 @@ spec: protocol: TCP name: api selector: - app.kubernetes.io/name: {{ include "chart.name" . }} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- if .Values.metrics.enable }} --- apiVersion: v1 @@ -29,5 +29,5 @@ spec: protocol: TCP name: metrics selector: - app.kubernetes.io/name: {{ include "chart.name" . 
}} + {{- include "chart.selectorLabels" . | nindent 4 }} {{- end }} diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1c45c2542..1d1bc844c 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -40,8 +40,8 @@ deployment: serviceAccountName: shim # [METRICS]: Set to true to generate manifests for exporting metrics. -# To disable metrics export set false, and ensure that the -# ControllerManager argument "--metrics-bind-address=:8443" is removed. +# To disable metrics export set false, and remove the container args +# "--metrics-bind-address=:2112" and "--metrics-secure=false". metrics: enable: true From 2305af318d99847d11ecae5598ae305f0a2d4092 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:36:10 +0200 Subject: [PATCH 05/17] PR feedback --- cmd/shim/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 970c8c934..9feea8d5f 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -170,7 +170,7 @@ func main() { filepath.Join(metricsCertPath, metricsCertKey), ) if err != nil { - setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) + setupLog.Error(err, "Failed to initialize metrics certificate watcher") os.Exit(1) } From 6252f5c6d4fa944e71b9f3cbf7d25c2d8f93dcba Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 13:42:55 +0200 Subject: [PATCH 06/17] 3 replicas by default --- helm/library/cortex-shim/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 1d1bc844c..63574fbe4 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -1,6 +1,6 @@ deployment: enable: true - replicas: 1 + replicas: 3 container: image: repository: ghcr.io/cobaltcore-dev/cortex-shim From 33d181f84176b10128175381862621482568190d Mon Sep 17 
00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 14:20:37 +0200 Subject: [PATCH 07/17] Scaffold empty placement api handlers --- cmd/shim/main.go | 7 ++ .../handlers/allocation_candidates.go | 40 +++++++ .../shim/placement/handlers/allocations.go | 78 +++++++++++++ internal/shim/placement/handlers/reshaper.go | 30 +++++ .../placement/handlers/resource_classes.go | 72 ++++++++++++ .../handlers/resource_provider_aggregates.go | 45 ++++++++ .../handlers/resource_provider_allocations.go | 24 ++++ .../handlers/resource_provider_inventories.go | 103 ++++++++++++++++++ .../handlers/resource_provider_traits.go | 54 +++++++++ .../handlers/resource_provider_usages.go | 24 ++++ .../placement/handlers/resource_providers.go | 84 ++++++++++++++ internal/shim/placement/handlers/root.go | 23 ++++ internal/shim/placement/handlers/traits.go | 60 ++++++++++ internal/shim/placement/handlers/usages.go | 27 +++++ internal/shim/placement/handlers/zz_index.go | 73 +++++++++++++ 15 files changed, 744 insertions(+) create mode 100644 internal/shim/placement/handlers/allocation_candidates.go create mode 100644 internal/shim/placement/handlers/allocations.go create mode 100644 internal/shim/placement/handlers/reshaper.go create mode 100644 internal/shim/placement/handlers/resource_classes.go create mode 100644 internal/shim/placement/handlers/resource_provider_aggregates.go create mode 100644 internal/shim/placement/handlers/resource_provider_allocations.go create mode 100644 internal/shim/placement/handlers/resource_provider_inventories.go create mode 100644 internal/shim/placement/handlers/resource_provider_traits.go create mode 100644 internal/shim/placement/handlers/resource_provider_usages.go create mode 100644 internal/shim/placement/handlers/resource_providers.go create mode 100644 internal/shim/placement/handlers/root.go create mode 100644 internal/shim/placement/handlers/traits.go create mode 100644 internal/shim/placement/handlers/usages.go create mode 100644 
internal/shim/placement/handlers/zz_index.go diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 9feea8d5f..41be2c084 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -12,6 +12,7 @@ import ( "path/filepath" "github.com/cobaltcore-dev/cortex/api/v1alpha1" + placementhandlers "github.com/cobaltcore-dev/cortex/internal/shim/placement/handlers" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" @@ -57,6 +58,7 @@ func main() { var probeAddr string var secureMetrics bool var enableHTTP2 bool + var enablePlacementShim bool var tlsOpts []func(*tls.Config) flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") @@ -75,6 +77,8 @@ func main() { flag.StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") flag.BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers") + flag.BoolVar(&enablePlacementShim, "placement-shim", false, + "If set, the placement API shim handlers are registered on the API server.") opts := zap.Options{ Development: true, } @@ -202,6 +206,9 @@ func main() { // API endpoint. 
mux := http.NewServeMux() + if enablePlacementShim { + placementhandlers.RegisterRoutes(mux) + } // +kubebuilder:scaffold:builder diff --git a/internal/shim/placement/handlers/allocation_candidates.go b/internal/shim/placement/handlers/allocation_candidates.go new file mode 100644 index 000000000..8d864a9a4 --- /dev/null +++ b/internal/shim/placement/handlers/allocation_candidates.go @@ -0,0 +1,40 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListAllocationCandidates handles GET /allocation_candidates requests. +// +// Returns a collection of allocation requests and resource provider summaries +// that can satisfy a given set of resource and trait requirements. This is the +// primary endpoint used by Nova's scheduler to find suitable hosts for +// instance placement. +// +// The resources query parameter specifies required capacity as a comma- +// separated list (e.g. VCPU:4,MEMORY_MB:2048,DISK_GB:64). The required +// parameter filters by traits, supporting forbidden traits via ! prefix +// (since 1.22) and the in: syntax for any-of semantics (since 1.39). +// The member_of parameter filters by aggregate membership with support for +// forbidden aggregates via ! prefix (since 1.32). +// +// Since microversion 1.25, granular request groups are supported via numbered +// suffixes (resourcesN, requiredN, member_ofN) to express requirements that +// may be satisfied by different providers. The group_policy parameter (1.26+) +// controls whether groups must each be satisfied by a single provider or may +// span multiple. The in_tree parameter (1.31+) constrains results to a +// specific provider tree. +// +// Each returned allocation request is directly usable as the body for +// PUT /allocations/{consumer_uuid}. The provider_summaries section includes +// inventory capacity and usage for informed decision-making. 
Available since +// microversion 1.10. +func HandleListAllocationCandidates(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} diff --git a/internal/shim/placement/handlers/allocations.go b/internal/shim/placement/handlers/allocations.go new file mode 100644 index 000000000..aa4ba99f2 --- /dev/null +++ b/internal/shim/placement/handlers/allocations.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleManageAllocations handles POST /allocations requests. +// +// Atomically creates, updates, or deletes allocations for multiple consumers +// in a single request. This is the primary mechanism for operations that must +// modify allocations across several consumers atomically, such as live +// migrations and move operations where resources are transferred from one +// consumer to another. Available since microversion 1.13. +// +// The request body is keyed by consumer UUID, each containing an allocations +// dictionary (keyed by resource provider UUID), along with project_id and +// user_id. Since microversion 1.28, consumer_generation enables consumer- +// level concurrency control. Since microversion 1.38, a consumer_type field +// (e.g. INSTANCE, MIGRATION) is supported. Returns 204 No Content on +// success, or 409 Conflict if inventory is insufficient or a concurrent +// update is detected (error code: placement.concurrent_update). +func HandleManageAllocations(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleListAllocations handles GET /allocations/{consumer_uuid} requests. +// +// Returns all allocation records for the consumer identified by +// {consumer_uuid}, across all resource providers. 
The response contains an +// allocations dictionary keyed by resource provider UUID. If the consumer has +// no allocations, an empty dictionary is returned. +// +// The response has grown across microversions: project_id and user_id were +// added at 1.12, consumer_generation at 1.28, and consumer_type at 1.38. +// The consumer_generation and consumer_type fields are absent when the +// consumer has no allocations. +func HandleListAllocations(w http.ResponseWriter, r *http.Request) { + consumerUUID := r.PathValue("consumer_uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "consumer_uuid", consumerUUID) +} + +// HandleUpdateAllocations handles PUT /allocations/{consumer_uuid} requests. +// +// Creates or replaces all allocation records for a single consumer. If +// allocations already exist for this consumer, they are entirely replaced +// by the new set. The request format changed at microversion 1.12 from an +// array-based layout to an object keyed by resource provider UUID. +// Microversion 1.28 added consumer_generation for concurrency control, +// and 1.38 introduced consumer_type. +// +// Returns 204 No Content on success. Returns 409 Conflict if there is +// insufficient inventory or if a concurrent update was detected. +func HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { + consumerUUID := r.PathValue("consumer_uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "consumer_uuid", consumerUUID) +} + +// HandleDeleteAllocations handles DELETE /allocations/{consumer_uuid} requests. +// +// Removes all allocation records for the consumer across all resource +// providers. Returns 204 No Content on success, or 404 Not Found if the +// consumer has no existing allocations. 
+func HandleDeleteAllocations(w http.ResponseWriter, r *http.Request) { + consumerUUID := r.PathValue("consumer_uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "consumer_uuid", consumerUUID) +} diff --git a/internal/shim/placement/handlers/reshaper.go b/internal/shim/placement/handlers/reshaper.go new file mode 100644 index 000000000..59556c94a --- /dev/null +++ b/internal/shim/placement/handlers/reshaper.go @@ -0,0 +1,30 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandlePostReshaper handles POST /reshaper requests. +// +// Atomically migrates resource provider inventories and associated allocations +// in a single transaction. This endpoint is used when a provider tree needs to +// be restructured — for example, moving inventory from a root provider into +// newly created child providers — without leaving allocations in an +// inconsistent state during the transition. +// +// The request body contains the complete set of inventories (keyed by +// resource provider UUID) and allocations (keyed by consumer UUID) that +// should exist after the operation. The Placement service validates all +// inputs atomically and applies them in a single database transaction. +// Returns 204 No Content on success. Returns 409 Conflict if any referenced +// resource provider does not exist or if inventory/allocation constraints +// would be violated. Available since microversion 1.30. 
+func HandlePostReshaper(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} diff --git a/internal/shim/placement/handlers/resource_classes.go b/internal/shim/placement/handlers/resource_classes.go new file mode 100644 index 000000000..81548b3b5 --- /dev/null +++ b/internal/shim/placement/handlers/resource_classes.go @@ -0,0 +1,72 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceClasses handles GET /resource_classes requests. +// +// Returns the complete list of all resource classes, including both standard +// classes (e.g. VCPU, MEMORY_MB, DISK_GB, PCI_DEVICE, SRIOV_NET_VF) and +// deployer-defined custom classes prefixed with CUSTOM_. Resource classes +// categorize the types of resources that resource providers can offer as +// inventory. Available since microversion 1.2. +func HandleListResourceClasses(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleCreateResourceClass handles POST /resource_classes requests. +// +// Creates a new custom resource class. The name must be prefixed with CUSTOM_ +// to distinguish it from standard resource classes. Returns 201 Created with +// a Location header on success. Returns 400 Bad Request if the CUSTOM_ prefix +// is missing, and 409 Conflict if a class with the same name already exists. +// Available since microversion 1.2. +func HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleShowResourceClass handles GET /resource_classes/{name} requests. +// +// Returns a representation of a single resource class identified by name. 
+// This can be used to verify the existence of a resource class. Returns 404 +// if the class does not exist. Available since microversion 1.2. +func HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} + +// HandleUpdateResourceClass handles PUT /resource_classes/{name} requests. +// +// Behavior differs by microversion. Since microversion 1.7, this endpoint +// creates or validates the existence of a single resource class: it returns +// 201 Created for a new class or 204 No Content if the class already exists. +// The name must carry the CUSTOM_ prefix. In earlier versions (1.2-1.6), the +// endpoint allowed renaming a class via a request body, but this usage is +// discouraged. Returns 400 Bad Request if the CUSTOM_ prefix is missing. +func HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} + +// HandleDeleteResourceClass handles DELETE /resource_classes/{name} requests. +// +// Deletes a custom resource class. Only custom classes (prefixed with CUSTOM_) +// may be deleted; attempting to delete a standard class returns 400 Bad +// Request. Returns 409 Conflict if any resource provider has inventory of this +// class, and 404 if the class does not exist. Returns 204 No Content on +// success. Available since microversion 1.2. 
+func HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} diff --git a/internal/shim/placement/handlers/resource_provider_aggregates.go b/internal/shim/placement/handlers/resource_provider_aggregates.go new file mode 100644 index 000000000..172c40b2f --- /dev/null +++ b/internal/shim/placement/handlers/resource_provider_aggregates.go @@ -0,0 +1,45 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviderAggregates handles +// GET /resource_providers/{uuid}/aggregates requests. +// +// Returns the list of aggregate UUIDs associated with the resource provider. +// Aggregates model relationships among providers such as shared storage, +// affinity/anti-affinity groups, and availability zones. Returns an empty +// list if the provider has no aggregate associations. Available since +// microversion 1.1. +// +// The response format changed at microversion 1.19: earlier versions return +// only a flat array of UUIDs, while 1.19+ returns an object that also +// includes the resource_provider_generation for concurrency tracking. Returns +// 404 if the provider does not exist. +func HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleUpdateResourceProviderAggregates handles +// PUT /resource_providers/{uuid}/aggregates requests. +// +// Replaces the complete set of aggregate associations for a resource provider. +// Any aggregate UUIDs that do not yet exist are created automatically. 
The +// request format changed at microversion 1.19: earlier versions accept a +// plain array of UUIDs, while 1.19+ expects an object containing an +// aggregates array and a resource_provider_generation for optimistic +// concurrency control. Returns 409 Conflict if the generation does not match +// (1.19+). Returns 200 with the updated aggregate list on success. +func HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} diff --git a/internal/shim/placement/handlers/resource_provider_allocations.go b/internal/shim/placement/handlers/resource_provider_allocations.go new file mode 100644 index 000000000..0e27ca7e6 --- /dev/null +++ b/internal/shim/placement/handlers/resource_provider_allocations.go @@ -0,0 +1,24 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviderAllocations handles +// GET /resource_providers/{uuid}/allocations requests. +// +// Returns all allocations made against the resource provider identified by +// {uuid}, keyed by consumer UUID. This provides a provider-centric view of +// consumption, complementing the consumer-centric GET /allocations/{consumer} +// endpoint. The response includes the resource_provider_generation. Returns +// 404 if the provider does not exist. 
+func HandleListResourceProviderAllocations(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} diff --git a/internal/shim/placement/handlers/resource_provider_inventories.go b/internal/shim/placement/handlers/resource_provider_inventories.go new file mode 100644 index 000000000..303370e69 --- /dev/null +++ b/internal/shim/placement/handlers/resource_provider_inventories.go @@ -0,0 +1,103 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviderInventories handles +// GET /resource_providers/{uuid}/inventories requests. +// +// Returns all inventory records for the resource provider identified by +// {uuid}. The response contains an inventories dictionary keyed by resource +// class, with each entry describing capacity constraints: total, reserved, +// min_unit, max_unit, step_size, and allocation_ratio. Also returns the +// resource_provider_generation, which is needed for subsequent update or +// delete operations. Returns 404 if the provider does not exist. +func HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleUpdateResourceProviderInventories handles +// PUT /resource_providers/{uuid}/inventories requests. +// +// Atomically replaces the entire set of inventory records for a provider. +// The request must include the resource_provider_generation for optimistic +// concurrency control — if the generation does not match, the request fails +// with 409 Conflict. The inventories field is a dictionary keyed by resource +// class, each specifying at minimum a total value. 
Omitted inventory classes +// are deleted. Returns 409 Conflict if allocations exceed the new capacity +// or if a concurrent update has occurred. +func HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleDeleteResourceProviderInventories handles +// DELETE /resource_providers/{uuid}/inventories requests. +// +// Deletes all inventory records for a resource provider. This operation is +// not safe for concurrent use; the recommended alternative for concurrent +// environments is PUT with an empty inventories dictionary. Returns 409 +// Conflict if allocations exist against any of the provider's inventories. +// Returns 404 if the provider does not exist. Available since microversion +// 1.5. +func HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleShowResourceProviderInventory handles +// GET /resource_providers/{uuid}/inventories/{resource_class} requests. +// +// Returns a single inventory record for one resource class on the specified +// provider. The response includes total, reserved, min_unit, max_unit, +// step_size, allocation_ratio, and the resource_provider_generation. Returns +// 404 if the provider or inventory for that class does not exist. 
+func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + resourceClass := r.PathValue("resource_class") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "uuid", uuid, "resource_class", resourceClass) +} + +// HandleUpdateResourceProviderInventory handles +// PUT /resource_providers/{uuid}/inventories/{resource_class} requests. +// +// Creates or replaces the inventory record for a single resource class on +// the provider. The request must include resource_provider_generation for +// concurrency control and a total value. Optional fields control allocation +// constraints (allocation_ratio, min_unit, max_unit, step_size, reserved). +// Since microversion 1.26, the reserved value must not exceed total. Returns +// 409 Conflict on generation mismatch or if allocations would be violated. +func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + resourceClass := r.PathValue("resource_class") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "uuid", uuid, "resource_class", resourceClass) +} + +// HandleDeleteResourceProviderInventory handles +// DELETE /resource_providers/{uuid}/inventories/{resource_class} requests. +// +// Deletes the inventory record for a specific resource class on the provider. +// Returns 409 Conflict if allocations exist against this provider and resource +// class combination, or if a concurrent update has occurred. Returns 404 if +// the provider or inventory does not exist. Returns 204 No Content on success. 
+func HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + resourceClass := r.PathValue("resource_class") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, + "uuid", uuid, "resource_class", resourceClass) +} diff --git a/internal/shim/placement/handlers/resource_provider_traits.go b/internal/shim/placement/handlers/resource_provider_traits.go new file mode 100644 index 000000000..915ccda05 --- /dev/null +++ b/internal/shim/placement/handlers/resource_provider_traits.go @@ -0,0 +1,54 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviderTraits handles +// GET /resource_providers/{uuid}/traits requests. +// +// Returns the list of traits associated with the resource provider identified +// by {uuid}. The response includes an array of trait name strings and the +// resource_provider_generation for concurrency tracking. Returns 404 if the +// provider does not exist. +func HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleUpdateResourceProviderTraits handles +// PUT /resource_providers/{uuid}/traits requests. +// +// Replaces the complete set of trait associations for a resource provider. +// The request body must include a traits array and the +// resource_provider_generation for optimistic concurrency control. All +// previously associated traits are removed and replaced by the specified set. +// Returns 400 Bad Request if any of the specified traits are invalid (i.e. +// not returned by GET /traits). Returns 409 Conflict if the generation does +// not match. 
+func HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleDeleteResourceProviderTraits handles +// DELETE /resource_providers/{uuid}/traits requests. +// +// Removes all trait associations from a resource provider. Because this +// endpoint does not accept a resource_provider_generation, it is not safe +// for concurrent use. In environments where multiple clients manage traits +// for the same provider, prefer PUT with an empty traits list instead. +// Returns 404 if the provider does not exist. Returns 409 Conflict on +// concurrent modification. Returns 204 No Content on success. +func HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} diff --git a/internal/shim/placement/handlers/resource_provider_usages.go b/internal/shim/placement/handlers/resource_provider_usages.go new file mode 100644 index 000000000..319da4a70 --- /dev/null +++ b/internal/shim/placement/handlers/resource_provider_usages.go @@ -0,0 +1,24 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviderUsages handles +// GET /resource_providers/{uuid}/usages requests. +// +// Returns aggregated resource consumption for the resource provider identified +// by {uuid}. The response contains a usages dictionary keyed by resource class +// with integer usage amounts, along with the resource_provider_generation. +// Unlike the provider allocations endpoint, this does not break down usage by +// individual consumer. Returns 404 if the provider does not exist. 
+func HandleListResourceProviderUsages(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} diff --git a/internal/shim/placement/handlers/resource_providers.go b/internal/shim/placement/handlers/resource_providers.go new file mode 100644 index 000000000..6f4f975fe --- /dev/null +++ b/internal/shim/placement/handlers/resource_providers.go @@ -0,0 +1,84 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListResourceProviders handles GET /resource_providers requests. +// +// Returns a filtered list of resource providers. Resource providers are +// entities that provide consumable inventory of one or more classes of +// resources (e.g. a compute node providing VCPU, MEMORY_MB, DISK_GB). +// +// Supports numerous filter parameters including name, uuid, member_of +// (aggregate membership), resources (capacity filtering), in_tree (provider +// tree membership), and required (trait filtering). Multiple filters are +// combined with boolean AND logic. Many of these filters were added in later +// microversions: resources filtering at 1.3, tree queries at 1.14, trait +// requirements at 1.18, forbidden traits at 1.22, forbidden aggregates at +// 1.32, and the in: syntax for required at 1.39. +func HandleListResourceProviders(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleCreateResourceProvider handles POST /resource_providers requests. +// +// Creates a new resource provider. The request must include a name and may +// optionally specify a UUID and a parent_provider_uuid (since 1.14) to place +// the provider in a hierarchical tree. If no UUID is supplied, one is +// generated. 
Before microversion 1.37, the parent of a resource provider +// could not be changed after creation. +// +// The response changed at microversion 1.20: earlier versions return only +// an HTTP 201 with a Location header, while 1.20+ returns the full resource +// provider object in the body. Returns 409 Conflict if a provider with the +// same name or UUID already exists. +func HandleCreateResourceProvider(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleShowResourceProvider handles GET /resource_providers/{uuid} requests. +// +// Returns a single resource provider identified by its UUID. The response +// includes the provider's name, generation (used for concurrency control in +// subsequent updates), and links. Starting at microversion 1.14, the response +// also includes parent_provider_uuid and root_provider_uuid to describe the +// provider's position in a hierarchical tree. Returns 404 if the provider +// does not exist. +func HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleUpdateResourceProvider handles PUT /resource_providers/{uuid} requests. +// +// Updates a resource provider's name and, starting at microversion 1.14, its +// parent_provider_uuid. Since microversion 1.37, the parent may be changed to +// any existing provider UUID that would not create a loop in the tree, or set +// to null to make the provider a root. Returns 409 Conflict if another +// provider already has the requested name. 
+func HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} + +// HandleDeleteResourceProvider handles DELETE /resource_providers/{uuid} requests. +// +// Deletes a resource provider and disassociates all its aggregates and +// inventories. The operation fails with 409 Conflict if there are any +// allocations against the provider's inventories or if the provider has +// child providers in a tree hierarchy. Returns 204 No Content on success. +func HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Request) { + uuid := r.PathValue("uuid") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) +} diff --git a/internal/shim/placement/handlers/root.go b/internal/shim/placement/handlers/root.go new file mode 100644 index 000000000..bfef909b4 --- /dev/null +++ b/internal/shim/placement/handlers/root.go @@ -0,0 +1,23 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleGetRoot handles GET / requests. +// +// Returns information about all known major versions of the Placement API, +// including the minimum and maximum supported microversions for each version. +// Currently only one major version (v1.0) exists. Each version entry includes +// its status (e.g. CURRENT), links for discovery, and the microversion range +// supported by the running service. Clients use this endpoint to discover API +// capabilities and negotiate microversions before making further requests. 
+func HandleGetRoot(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} diff --git a/internal/shim/placement/handlers/traits.go b/internal/shim/placement/handlers/traits.go new file mode 100644 index 000000000..381b109b2 --- /dev/null +++ b/internal/shim/placement/handlers/traits.go @@ -0,0 +1,60 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListTraits handles GET /traits requests. +// +// Returns a list of valid trait strings. Traits describe qualitative aspects +// of a resource provider (e.g. HW_CPU_X86_AVX2, STORAGE_DISK_SSD). The list +// includes both standard traits from the os-traits library and custom traits +// prefixed with CUSTOM_. +// +// Supports optional query parameters: name allows filtering by prefix +// (startswith:CUSTOM) or by an explicit list (in:TRAIT1,TRAIT2), and +// associated filters to only traits that are or are not associated with at +// least one resource provider. +func HandleListTraits(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} + +// HandleShowTrait handles GET /traits/{name} requests. +// +// Checks whether a trait with the given name exists. Returns 204 No Content +// (with no response body) if the trait is found, or 404 Not Found otherwise. +func HandleShowTrait(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} + +// HandleUpdateTrait handles PUT /traits/{name} requests. +// +// Creates a new custom trait. Only traits prefixed with CUSTOM_ may be +// created; standard traits are read-only. 
Returns 201 Created if the trait +// is newly inserted, or 204 No Content if it already exists. Returns 400 +// Bad Request if the name does not carry the CUSTOM_ prefix. +func HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} + +// HandleDeleteTrait handles DELETE /traits/{name} requests. +// +// Deletes a custom trait. Standard traits (those without the CUSTOM_ prefix) +// cannot be deleted and will return 400 Bad Request. Returns 409 Conflict if +// the trait is still associated with any resource provider. Returns 404 if +// the trait does not exist. Returns 204 No Content on success. +func HandleDeleteTrait(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) +} diff --git a/internal/shim/placement/handlers/usages.go b/internal/shim/placement/handlers/usages.go new file mode 100644 index 000000000..2fd5e66a6 --- /dev/null +++ b/internal/shim/placement/handlers/usages.go @@ -0,0 +1,27 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "net/http" + + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// HandleListUsages handles GET /usages requests. +// +// Returns a report of aggregated resource usage for a given project, and +// optionally a specific user within that project. The project_id query +// parameter is required; user_id is optional. +// +// The response format changed at microversion 1.38: earlier versions return +// a flat dictionary of resource class to usage totals, while 1.38+ groups +// usages by consumer_type (e.g. INSTANCE, MIGRATION, all, unknown), with +// each group containing resource totals and a consumer_count. 
Since +// microversion 1.38, an optional consumer_type query parameter allows +// filtering the results. Available since microversion 1.9. +func HandleListUsages(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + log.Info("placement request", "method", r.Method, "path", r.URL.Path) +} diff --git a/internal/shim/placement/handlers/zz_index.go b/internal/shim/placement/handlers/zz_index.go new file mode 100644 index 000000000..fba5d76b1 --- /dev/null +++ b/internal/shim/placement/handlers/zz_index.go @@ -0,0 +1,73 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import "net/http" + +// RegisterRoutes binds all Placement API handlers to the given mux. The +// route patterns use the Go 1.22+ ServeMux syntax with explicit HTTP methods +// and path wildcards. The routes mirror the OpenStack Placement API surface +// as documented at https://docs.openstack.org/api-ref/placement/. +func RegisterRoutes(mux *http.ServeMux) { + // Root + mux.HandleFunc("GET /{$}", HandleGetRoot) + + // Resource providers + mux.HandleFunc("GET /resource_providers", HandleListResourceProviders) + mux.HandleFunc("POST /resource_providers", HandleCreateResourceProvider) + mux.HandleFunc("GET /resource_providers/{uuid}", HandleShowResourceProvider) + mux.HandleFunc("PUT /resource_providers/{uuid}", HandleUpdateResourceProvider) + mux.HandleFunc("DELETE /resource_providers/{uuid}", HandleDeleteResourceProvider) + + // Resource classes + mux.HandleFunc("GET /resource_classes", HandleListResourceClasses) + mux.HandleFunc("POST /resource_classes", HandleCreateResourceClass) + mux.HandleFunc("GET /resource_classes/{name}", HandleShowResourceClass) + mux.HandleFunc("PUT /resource_classes/{name}", HandleUpdateResourceClass) + mux.HandleFunc("DELETE /resource_classes/{name}", HandleDeleteResourceClass) + + // Resource provider inventories + mux.HandleFunc("GET /resource_providers/{uuid}/inventories", 
HandleListResourceProviderInventories) + mux.HandleFunc("PUT /resource_providers/{uuid}/inventories", HandleUpdateResourceProviderInventories) + mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories", HandleDeleteResourceProviderInventories) + mux.HandleFunc("GET /resource_providers/{uuid}/inventories/{resource_class}", HandleShowResourceProviderInventory) + mux.HandleFunc("PUT /resource_providers/{uuid}/inventories/{resource_class}", HandleUpdateResourceProviderInventory) + mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories/{resource_class}", HandleDeleteResourceProviderInventory) + + // Resource provider aggregates + mux.HandleFunc("GET /resource_providers/{uuid}/aggregates", HandleListResourceProviderAggregates) + mux.HandleFunc("PUT /resource_providers/{uuid}/aggregates", HandleUpdateResourceProviderAggregates) + + // Traits + mux.HandleFunc("GET /traits", HandleListTraits) + mux.HandleFunc("GET /traits/{name}", HandleShowTrait) + mux.HandleFunc("PUT /traits/{name}", HandleUpdateTrait) + mux.HandleFunc("DELETE /traits/{name}", HandleDeleteTrait) + + // Resource provider traits + mux.HandleFunc("GET /resource_providers/{uuid}/traits", HandleListResourceProviderTraits) + mux.HandleFunc("PUT /resource_providers/{uuid}/traits", HandleUpdateResourceProviderTraits) + mux.HandleFunc("DELETE /resource_providers/{uuid}/traits", HandleDeleteResourceProviderTraits) + + // Allocations + mux.HandleFunc("POST /allocations", HandleManageAllocations) + mux.HandleFunc("GET /allocations/{consumer_uuid}", HandleListAllocations) + mux.HandleFunc("PUT /allocations/{consumer_uuid}", HandleUpdateAllocations) + mux.HandleFunc("DELETE /allocations/{consumer_uuid}", HandleDeleteAllocations) + + // Resource provider allocations + mux.HandleFunc("GET /resource_providers/{uuid}/allocations", HandleListResourceProviderAllocations) + + // Usages + mux.HandleFunc("GET /usages", HandleListUsages) + + // Resource provider usages + mux.HandleFunc("GET 
/resource_providers/{uuid}/usages", HandleListResourceProviderUsages) + + // Allocation candidates + mux.HandleFunc("GET /allocation_candidates", HandleListAllocationCandidates) + + // Reshaper + mux.HandleFunc("POST /reshaper", HandlePostReshaper) +} From 7ba4f545089e7c6f5087756fb7e568477cf85590 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 14:30:17 +0200 Subject: [PATCH 08/17] Add path parameter validation --- .../shim/placement/handlers/allocations.go | 15 +++++-- .../placement/handlers/resource_classes.go | 15 +++++-- .../handlers/resource_provider_aggregates.go | 10 ++++- .../handlers/resource_provider_allocations.go | 5 ++- .../handlers/resource_provider_inventories.go | 45 +++++++++++++++---- .../handlers/resource_provider_traits.go | 15 +++++-- .../handlers/resource_provider_usages.go | 5 ++- .../placement/handlers/resource_providers.go | 15 +++++-- internal/shim/placement/handlers/traits.go | 15 +++++-- .../shim/placement/handlers/validation.go | 38 ++++++++++++++++ 10 files changed, 150 insertions(+), 28 deletions(-) create mode 100644 internal/shim/placement/handlers/validation.go diff --git a/internal/shim/placement/handlers/allocations.go b/internal/shim/placement/handlers/allocations.go index aa4ba99f2..0e125e126 100644 --- a/internal/shim/placement/handlers/allocations.go +++ b/internal/shim/placement/handlers/allocations.go @@ -41,7 +41,10 @@ func HandleManageAllocations(w http.ResponseWriter, r *http.Request) { // The consumer_generation and consumer_type fields are absent when the // consumer has no allocations. 
func HandleListAllocations(w http.ResponseWriter, r *http.Request) { - consumerUUID := r.PathValue("consumer_uuid") + consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) @@ -59,7 +62,10 @@ func HandleListAllocations(w http.ResponseWriter, r *http.Request) { // Returns 204 No Content on success. Returns 409 Conflict if there is // insufficient inventory or if a concurrent update was detected. func HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { - consumerUUID := r.PathValue("consumer_uuid") + consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) @@ -71,7 +77,10 @@ func HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { // providers. Returns 204 No Content on success, or 404 Not Found if the // consumer has no existing allocations. func HandleDeleteAllocations(w http.ResponseWriter, r *http.Request) { - consumerUUID := r.PathValue("consumer_uuid") + consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) diff --git a/internal/shim/placement/handlers/resource_classes.go b/internal/shim/placement/handlers/resource_classes.go index 81548b3b5..f5f8453c0 100644 --- a/internal/shim/placement/handlers/resource_classes.go +++ b/internal/shim/placement/handlers/resource_classes.go @@ -39,7 +39,10 @@ func HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) { // This can be used to verify the existence of a resource class. Returns 404 // if the class does not exist. Available since microversion 1.2. 
func HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -53,7 +56,10 @@ func HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { // endpoint allowed renaming a class via a request body, but this usage is // discouraged. Returns 400 Bad Request if the CUSTOM_ prefix is missing. func HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -66,7 +72,10 @@ func HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { // class, and 404 if the class does not exist. Returns 204 No Content on // success. Available since microversion 1.2. func HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } diff --git a/internal/shim/placement/handlers/resource_provider_aggregates.go b/internal/shim/placement/handlers/resource_provider_aggregates.go index 172c40b2f..131969fb1 100644 --- a/internal/shim/placement/handlers/resource_provider_aggregates.go +++ b/internal/shim/placement/handlers/resource_provider_aggregates.go @@ -23,7 +23,10 @@ import ( // includes the resource_provider_generation for concurrency tracking. Returns // 404 if the provider does not exist. 
func HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -39,7 +42,10 @@ func HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request // concurrency control. Returns 409 Conflict if the generation does not match // (1.19+). Returns 200 with the updated aggregate list on success. func HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_allocations.go b/internal/shim/placement/handlers/resource_provider_allocations.go index 0e27ca7e6..c7b0dfe70 100644 --- a/internal/shim/placement/handlers/resource_provider_allocations.go +++ b/internal/shim/placement/handlers/resource_provider_allocations.go @@ -18,7 +18,10 @@ import ( // endpoint. The response includes the resource_provider_generation. Returns // 404 if the provider does not exist. 
func HandleListResourceProviderAllocations(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_inventories.go b/internal/shim/placement/handlers/resource_provider_inventories.go index 303370e69..9c23e56d3 100644 --- a/internal/shim/placement/handlers/resource_provider_inventories.go +++ b/internal/shim/placement/handlers/resource_provider_inventories.go @@ -19,7 +19,10 @@ import ( // resource_provider_generation, which is needed for subsequent update or // delete operations. Returns 404 if the provider does not exist. func HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -35,7 +38,10 @@ func HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Reques // are deleted. Returns 409 Conflict if allocations exceed the new capacity // or if a concurrent update has occurred. func HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -50,7 +56,10 @@ func HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Requ // Returns 404 if the provider does not exist. Available since microversion // 1.5. 
func HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -63,8 +72,14 @@ func HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Requ // step_size, allocation_ratio, and the resource_provider_generation. Returns // 404 if the provider or inventory for that class does not exist. func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") - resourceClass := r.PathValue("resource_class") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } + resourceClass, ok := requiredPathParam(w, r, "resource_class") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) @@ -80,8 +95,14 @@ func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) // Since microversion 1.26, the reserved value must not exceed total. Returns // 409 Conflict on generation mismatch or if allocations would be violated. func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") - resourceClass := r.PathValue("resource_class") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } + resourceClass, ok := requiredPathParam(w, r, "resource_class") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) @@ -95,8 +116,14 @@ func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Reques // class combination, or if a concurrent update has occurred. Returns 404 if // the provider or inventory does not exist. 
Returns 204 No Content on success. func HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") - resourceClass := r.PathValue("resource_class") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } + resourceClass, ok := requiredPathParam(w, r, "resource_class") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) diff --git a/internal/shim/placement/handlers/resource_provider_traits.go b/internal/shim/placement/handlers/resource_provider_traits.go index 915ccda05..a21b35ddc 100644 --- a/internal/shim/placement/handlers/resource_provider_traits.go +++ b/internal/shim/placement/handlers/resource_provider_traits.go @@ -17,7 +17,10 @@ import ( // resource_provider_generation for concurrency tracking. Returns 404 if the // provider does not exist. func HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -33,7 +36,10 @@ func HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { // not returned by GET /traits). Returns 409 Conflict if the generation does // not match. func HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -48,7 +54,10 @@ func HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) // Returns 404 if the provider does not exist. Returns 409 Conflict on // concurrent modification. Returns 204 No Content on success. 
func HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_usages.go b/internal/shim/placement/handlers/resource_provider_usages.go index 319da4a70..418862b4a 100644 --- a/internal/shim/placement/handlers/resource_provider_usages.go +++ b/internal/shim/placement/handlers/resource_provider_usages.go @@ -18,7 +18,10 @@ import ( // Unlike the provider allocations endpoint, this does not break down usage by // individual consumer. Returns 404 if the provider does not exist. func HandleListResourceProviderUsages(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_providers.go b/internal/shim/placement/handlers/resource_providers.go index 6f4f975fe..3ecc6428e 100644 --- a/internal/shim/placement/handlers/resource_providers.go +++ b/internal/shim/placement/handlers/resource_providers.go @@ -53,7 +53,10 @@ func HandleCreateResourceProvider(w http.ResponseWriter, r *http.Request) { // provider's position in a hierarchical tree. Returns 404 if the provider // does not exist. func HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -66,7 +69,10 @@ func HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { // to null to make the provider a root. 
Returns 409 Conflict if another // provider already has the requested name. func HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -78,7 +84,10 @@ func HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { // allocations against the provider's inventories or if the provider has // child providers in a tree hierarchy. Returns 204 No Content on success. func HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Request) { - uuid := r.PathValue("uuid") + uuid, ok := requiredUUIDPathParam(w, r, "uuid") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/traits.go b/internal/shim/placement/handlers/traits.go index 381b109b2..92ea4b017 100644 --- a/internal/shim/placement/handlers/traits.go +++ b/internal/shim/placement/handlers/traits.go @@ -30,7 +30,10 @@ func HandleListTraits(w http.ResponseWriter, r *http.Request) { // Checks whether a trait with the given name exists. Returns 204 No Content // (with no response body) if the trait is found, or 404 Not Found otherwise. func HandleShowTrait(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -42,7 +45,10 @@ func HandleShowTrait(w http.ResponseWriter, r *http.Request) { // is newly inserted, or 204 No Content if it already exists. Returns 400 // Bad Request if the name does not carry the CUSTOM_ prefix. 
func HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -54,7 +60,10 @@ func HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { // the trait is still associated with any resource provider. Returns 404 if // the trait does not exist. Returns 204 No Content on success. func HandleDeleteTrait(w http.ResponseWriter, r *http.Request) { - name := r.PathValue("name") + name, ok := requiredPathParam(w, r, "name") + if !ok { + return + } log := logf.FromContext(r.Context()) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } diff --git a/internal/shim/placement/handlers/validation.go b/internal/shim/placement/handlers/validation.go new file mode 100644 index 000000000..5f8bcf2b9 --- /dev/null +++ b/internal/shim/placement/handlers/validation.go @@ -0,0 +1,38 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "fmt" + "net/http" + + "github.com/google/uuid" +) + +// requiredPathParam extracts a path parameter by name and verifies that it is +// non-empty. If the value is missing, it writes a 400 response and returns +// an empty string. +func requiredPathParam(w http.ResponseWriter, r *http.Request, name string) (string, bool) { + v := r.PathValue(name) + if v == "" { + http.Error(w, fmt.Sprintf("missing path parameter: %s", name), http.StatusBadRequest) + return "", false + } + return v, true +} + +// requiredUUIDPathParam extracts a path parameter by name and verifies that it +// is a valid UUID. If the value is missing or not a valid UUID, it writes a +// 400 response and returns an empty string. 
+func requiredUUIDPathParam(w http.ResponseWriter, r *http.Request, name string) (string, bool) { + v, ok := requiredPathParam(w, r, name) + if !ok { + return "", false + } + if err := uuid.Validate(v); err != nil { + http.Error(w, fmt.Sprintf("invalid UUID in path parameter %s: %s", name, v), http.StatusBadRequest) + return "", false + } + return v, true +} From 41c28b775074aef29f71c781e05d88709edb5ad8 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 17:01:42 +0200 Subject: [PATCH 09/17] Add hypervisor informer cache --- cmd/shim/main.go | 9 +- internal/shim/placement/.gitkeep | 0 ...tes.go => handle_allocation_candidates.go} | 7 +- .../allocations.go => handle_allocations.go} | 22 ++-- .../reshaper.go => handle_reshaper.go} | 7 +- ..._classes.go => handle_resource_classes.go} | 27 ++-- ...=> handle_resource_provider_aggregates.go} | 12 +- ...> handle_resource_provider_allocations.go} | 7 +- ...> handle_resource_provider_inventories.go} | 32 +++-- ....go => handle_resource_provider_traits.go} | 17 +-- ....go => handle_resource_provider_usages.go} | 7 +- ...viders.go => handle_resource_providers.go} | 27 ++-- .../{handlers/root.go => handle_root.go} | 7 +- .../{handlers/traits.go => handle_traits.go} | 22 ++-- .../{handlers/usages.go => handle_usages.go} | 7 +- internal/shim/placement/handlers/zz_index.go | 73 ----------- internal/shim/placement/shim.go | 118 ++++++++++++++++++ .../placement/{handlers => }/validation.go | 2 +- 18 files changed, 244 insertions(+), 159 deletions(-) delete mode 100644 internal/shim/placement/.gitkeep rename internal/shim/placement/{handlers/allocation_candidates.go => handle_allocation_candidates.go} (91%) rename internal/shim/placement/{handlers/allocations.go => handle_allocations.go} (85%) rename internal/shim/placement/{handlers/reshaper.go => handle_reshaper.go} (88%) rename internal/shim/placement/{handlers/resource_classes.go => handle_resource_classes.go} (80%) rename 
internal/shim/placement/{handlers/resource_provider_aggregates.go => handle_resource_provider_aggregates.go} (85%) rename internal/shim/placement/{handlers/resource_provider_allocations.go => handle_resource_provider_allocations.go} (82%) rename internal/shim/placement/{handlers/resource_provider_inventories.go => handle_resource_provider_inventories.go} (83%) rename internal/shim/placement/{handlers/resource_provider_traits.go => handle_resource_provider_traits.go} (82%) rename internal/shim/placement/{handlers/resource_provider_usages.go => handle_resource_provider_usages.go} (83%) rename internal/shim/placement/{handlers/resource_providers.go => handle_resource_providers.go} (83%) rename internal/shim/placement/{handlers/root.go => handle_root.go} (83%) rename internal/shim/placement/{handlers/traits.go => handle_traits.go} (80%) rename internal/shim/placement/{handlers/usages.go => handle_usages.go} (86%) delete mode 100644 internal/shim/placement/handlers/zz_index.go create mode 100644 internal/shim/placement/shim.go rename internal/shim/placement/{handlers => }/validation.go (98%) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 41be2c084..ce81d6b44 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -12,7 +12,7 @@ import ( "path/filepath" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - placementhandlers "github.com/cobaltcore-dev/cortex/internal/shim/placement/handlers" + "github.com/cobaltcore-dev/cortex/internal/shim/placement" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" @@ -207,7 +207,12 @@ func main() { // API endpoint. 
mux := http.NewServeMux() if enablePlacementShim { - placementhandlers.RegisterRoutes(mux) + placementShim := &placement.Shim{Client: mgr.GetClient()} + if err := placementShim.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to set up placement shim") + os.Exit(1) + } + placementShim.RegisterRoutes(mux) } // +kubebuilder:scaffold:builder diff --git a/internal/shim/placement/.gitkeep b/internal/shim/placement/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/internal/shim/placement/handlers/allocation_candidates.go b/internal/shim/placement/handle_allocation_candidates.go similarity index 91% rename from internal/shim/placement/handlers/allocation_candidates.go rename to internal/shim/placement/handle_allocation_candidates.go index 8d864a9a4..b252d0026 100644 --- a/internal/shim/placement/handlers/allocation_candidates.go +++ b/internal/shim/placement/handle_allocation_candidates.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -34,7 +34,8 @@ import ( // PUT /allocations/{consumer_uuid}. The provider_summaries section includes // inventory capacity and usage for informed decision-making. Available since // microversion 1.10. 
-func HandleListAllocationCandidates(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleListAllocationCandidates(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } diff --git a/internal/shim/placement/handlers/allocations.go b/internal/shim/placement/handle_allocations.go similarity index 85% rename from internal/shim/placement/handlers/allocations.go rename to internal/shim/placement/handle_allocations.go index 0e125e126..f406ea5da 100644 --- a/internal/shim/placement/handlers/allocations.go +++ b/internal/shim/placement/handle_allocations.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -24,8 +24,9 @@ import ( // (e.g. INSTANCE, MIGRATION) is supported. Returns 204 No Content on // success, or 409 Conflict if inventory is insufficient or a concurrent // update is detected (error code: placement.concurrent_update). -func HandleManageAllocations(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleManageAllocations(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -40,12 +41,13 @@ func HandleManageAllocations(w http.ResponseWriter, r *http.Request) { // added at 1.12, consumer_generation at 1.28, and consumer_type at 1.38. // The consumer_generation and consumer_type fields are absent when the // consumer has no allocations. 
-func HandleListAllocations(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListAllocations(w http.ResponseWriter, r *http.Request) { consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) } @@ -61,12 +63,13 @@ func HandleListAllocations(w http.ResponseWriter, r *http.Request) { // // Returns 204 No Content on success. Returns 409 Conflict if there is // insufficient inventory or if a concurrent update was detected. -func HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) } @@ -76,12 +79,13 @@ func HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { // Removes all allocation records for the consumer across all resource // providers. Returns 204 No Content on success, or 404 Not Found if the // consumer has no existing allocations. 
-func HandleDeleteAllocations(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteAllocations(w http.ResponseWriter, r *http.Request) { consumerUUID, ok := requiredUUIDPathParam(w, r, "consumer_uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) } diff --git a/internal/shim/placement/handlers/reshaper.go b/internal/shim/placement/handle_reshaper.go similarity index 88% rename from internal/shim/placement/handlers/reshaper.go rename to internal/shim/placement/handle_reshaper.go index 59556c94a..fe0d85069 100644 --- a/internal/shim/placement/handlers/reshaper.go +++ b/internal/shim/placement/handle_reshaper.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -24,7 +24,8 @@ import ( // Returns 204 No Content on success. Returns 409 Conflict if any referenced // resource provider does not exist or if inventory/allocation constraints // would be violated. Available since microversion 1.30. 
-func HandlePostReshaper(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandlePostReshaper(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } diff --git a/internal/shim/placement/handlers/resource_classes.go b/internal/shim/placement/handle_resource_classes.go similarity index 80% rename from internal/shim/placement/handlers/resource_classes.go rename to internal/shim/placement/handle_resource_classes.go index f5f8453c0..554c43034 100644 --- a/internal/shim/placement/handlers/resource_classes.go +++ b/internal/shim/placement/handle_resource_classes.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -16,8 +16,9 @@ import ( // deployer-defined custom classes prefixed with CUSTOM_. Resource classes // categorize the types of resources that resource providers can offer as // inventory. Available since microversion 1.2. -func HandleListResourceClasses(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleListResourceClasses(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -28,8 +29,9 @@ func HandleListResourceClasses(w http.ResponseWriter, r *http.Request) { // a Location header on success. Returns 400 Bad Request if the CUSTOM_ prefix // is missing, and 409 Conflict if a class with the same name already exists. // Available since microversion 1.2. 
-func HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -38,12 +40,13 @@ func HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) { // Returns a representation of a single resource class identified by name. // This can be used to verify the existence of a resource class. Returns 404 // if the class does not exist. Available since microversion 1.2. -func HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -55,12 +58,13 @@ func HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { // The name must carry the CUSTOM_ prefix. In earlier versions (1.2-1.6), the // endpoint allowed renaming a class via a request body, but this usage is // discouraged. Returns 400 Bad Request if the CUSTOM_ prefix is missing. -func HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -71,11 +75,12 @@ func HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) { // Request. Returns 409 Conflict if any resource provider has inventory of this // class, and 404 if the class does not exist. Returns 204 No Content on // success. 
Available since microversion 1.2. -func HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } diff --git a/internal/shim/placement/handlers/resource_provider_aggregates.go b/internal/shim/placement/handle_resource_provider_aggregates.go similarity index 85% rename from internal/shim/placement/handlers/resource_provider_aggregates.go rename to internal/shim/placement/handle_resource_provider_aggregates.go index 131969fb1..ce8febe50 100644 --- a/internal/shim/placement/handlers/resource_provider_aggregates.go +++ b/internal/shim/placement/handle_resource_provider_aggregates.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -22,12 +22,13 @@ import ( // only a flat array of UUIDs, while 1.19+ returns an object that also // includes the resource_provider_generation for concurrency tracking. Returns // 404 if the provider does not exist. -func HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -41,11 +42,12 @@ func HandleListResourceProviderAggregates(w http.ResponseWriter, r *http.Request // aggregates array and a resource_provider_generation for optimistic // concurrency control. Returns 409 Conflict if the generation does not match // (1.19+). 
Returns 200 with the updated aggregate list on success. -func HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_allocations.go b/internal/shim/placement/handle_resource_provider_allocations.go similarity index 82% rename from internal/shim/placement/handlers/resource_provider_allocations.go rename to internal/shim/placement/handle_resource_provider_allocations.go index c7b0dfe70..b3f6dcd68 100644 --- a/internal/shim/placement/handlers/resource_provider_allocations.go +++ b/internal/shim/placement/handle_resource_provider_allocations.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -17,11 +17,12 @@ import ( // consumption, complementing the consumer-centric GET /allocations/{consumer} // endpoint. The response includes the resource_provider_generation. Returns // 404 if the provider does not exist. 
-func HandleListResourceProviderAllocations(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListResourceProviderAllocations(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_inventories.go b/internal/shim/placement/handle_resource_provider_inventories.go similarity index 83% rename from internal/shim/placement/handlers/resource_provider_inventories.go rename to internal/shim/placement/handle_resource_provider_inventories.go index 9c23e56d3..c79f924b6 100644 --- a/internal/shim/placement/handlers/resource_provider_inventories.go +++ b/internal/shim/placement/handle_resource_provider_inventories.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -18,12 +18,13 @@ import ( // min_unit, max_unit, step_size, and allocation_ratio. Also returns the // resource_provider_generation, which is needed for subsequent update or // delete operations. Returns 404 if the provider does not exist. -func HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -37,12 +38,13 @@ func HandleListResourceProviderInventories(w http.ResponseWriter, r *http.Reques // class, each specifying at minimum a total value. Omitted inventory classes // are deleted. 
Returns 409 Conflict if allocations exceed the new capacity // or if a concurrent update has occurred. -func HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -55,12 +57,13 @@ func HandleUpdateResourceProviderInventories(w http.ResponseWriter, r *http.Requ // Conflict if allocations exist against any of the provider's inventories. // Returns 404 if the provider does not exist. Available since microversion // 1.5. -func HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -71,7 +74,7 @@ func HandleDeleteResourceProviderInventories(w http.ResponseWriter, r *http.Requ // provider. The response includes total, reserved, min_unit, max_unit, // step_size, allocation_ratio, and the resource_provider_generation. Returns // 404 if the provider or inventory for that class does not exist. 
-func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return @@ -80,7 +83,8 @@ func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) } @@ -94,7 +98,7 @@ func HandleShowResourceProviderInventory(w http.ResponseWriter, r *http.Request) // constraints (allocation_ratio, min_unit, max_unit, step_size, reserved). // Since microversion 1.26, the reserved value must not exceed total. Returns // 409 Conflict on generation mismatch or if allocations would be violated. -func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return @@ -103,7 +107,8 @@ func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Reques if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) } @@ -115,7 +120,7 @@ func HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *http.Reques // Returns 409 Conflict if allocations exist against this provider and resource // class combination, or if a concurrent update has occurred. Returns 404 if // the provider or inventory does not exist. Returns 204 No Content on success. 
-func HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return @@ -124,7 +129,8 @@ func HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *http.Reques if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) } diff --git a/internal/shim/placement/handlers/resource_provider_traits.go b/internal/shim/placement/handle_resource_provider_traits.go similarity index 82% rename from internal/shim/placement/handlers/resource_provider_traits.go rename to internal/shim/placement/handle_resource_provider_traits.go index a21b35ddc..5c18bf85b 100644 --- a/internal/shim/placement/handlers/resource_provider_traits.go +++ b/internal/shim/placement/handle_resource_provider_traits.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -16,12 +16,13 @@ import ( // by {uuid}. The response includes an array of trait name strings and the // resource_provider_generation for concurrency tracking. Returns 404 if the // provider does not exist. -func HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -35,12 +36,13 @@ func HandleListResourceProviderTraits(w http.ResponseWriter, r *http.Request) { // Returns 400 Bad Request if any of the specified traits are invalid (i.e. 
// not returned by GET /traits). Returns 409 Conflict if the generation does // not match. -func HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -53,11 +55,12 @@ func HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http.Request) // for the same provider, prefer PUT with an empty traits list instead. // Returns 404 if the provider does not exist. Returns 409 Conflict on // concurrent modification. Returns 204 No Content on success. -func HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_provider_usages.go b/internal/shim/placement/handle_resource_provider_usages.go similarity index 83% rename from internal/shim/placement/handlers/resource_provider_usages.go rename to internal/shim/placement/handle_resource_provider_usages.go index 418862b4a..78c7cc450 100644 --- a/internal/shim/placement/handlers/resource_provider_usages.go +++ b/internal/shim/placement/handle_resource_provider_usages.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -17,11 +17,12 @@ import ( // with integer usage amounts, along with the resource_provider_generation. 
// Unlike the provider allocations endpoint, this does not break down usage by // individual consumer. Returns 404 if the provider does not exist. -func HandleListResourceProviderUsages(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleListResourceProviderUsages(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/resource_providers.go b/internal/shim/placement/handle_resource_providers.go similarity index 83% rename from internal/shim/placement/handlers/resource_providers.go rename to internal/shim/placement/handle_resource_providers.go index 3ecc6428e..de6213e43 100644 --- a/internal/shim/placement/handlers/resource_providers.go +++ b/internal/shim/placement/handle_resource_providers.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -22,8 +22,9 @@ import ( // microversions: resources filtering at 1.3, tree queries at 1.14, trait // requirements at 1.18, forbidden traits at 1.22, forbidden aggregates at // 1.32, and the in: syntax for required at 1.39. -func HandleListResourceProviders(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleListResourceProviders(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -39,8 +40,9 @@ func HandleListResourceProviders(w http.ResponseWriter, r *http.Request) { // an HTTP 201 with a Location header, while 1.20+ returns the full resource // provider object in the body. Returns 409 Conflict if a provider with the // same name or UUID already exists. 
-func HandleCreateResourceProvider(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleCreateResourceProvider(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -52,12 +54,13 @@ func HandleCreateResourceProvider(w http.ResponseWriter, r *http.Request) { // also includes parent_provider_uuid and root_provider_uuid to describe the // provider's position in a hierarchical tree. Returns 404 if the provider // does not exist. -func HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -68,12 +71,13 @@ func HandleShowResourceProvider(w http.ResponseWriter, r *http.Request) { // any existing provider UUID that would not create a loop in the tree, or set // to null to make the provider a root. Returns 409 Conflict if another // provider already has the requested name. -func HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } @@ -83,11 +87,12 @@ func HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Request) { // inventories. The operation fails with 409 Conflict if there are any // allocations against the provider's inventories or if the provider has // child providers in a tree hierarchy. 
Returns 204 No Content on success. -func HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Request) { uuid, ok := requiredUUIDPathParam(w, r, "uuid") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) } diff --git a/internal/shim/placement/handlers/root.go b/internal/shim/placement/handle_root.go similarity index 83% rename from internal/shim/placement/handlers/root.go rename to internal/shim/placement/handle_root.go index bfef909b4..d734d4528 100644 --- a/internal/shim/placement/handlers/root.go +++ b/internal/shim/placement/handle_root.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -17,7 +17,8 @@ import ( // its status (e.g. CURRENT), links for discovery, and the microversion range // supported by the running service. Clients use this endpoint to discover API // capabilities and negotiate microversions before making further requests. 
-func HandleGetRoot(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleGetRoot(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } diff --git a/internal/shim/placement/handlers/traits.go b/internal/shim/placement/handle_traits.go similarity index 80% rename from internal/shim/placement/handlers/traits.go rename to internal/shim/placement/handle_traits.go index 92ea4b017..0d6ba8e32 100644 --- a/internal/shim/placement/handlers/traits.go +++ b/internal/shim/placement/handle_traits.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -20,8 +20,9 @@ import ( // (startswith:CUSTOM) or by an explicit list (in:TRAIT1,TRAIT2), and // associated filters to only traits that are or are not associated with at // least one resource provider. -func HandleListTraits(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } @@ -29,12 +30,13 @@ func HandleListTraits(w http.ResponseWriter, r *http.Request) { // // Checks whether a trait with the given name exists. Returns 204 No Content // (with no response body) if the trait is found, or 404 Not Found otherwise. 
-func HandleShowTrait(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -44,12 +46,13 @@ func HandleShowTrait(w http.ResponseWriter, r *http.Request) { // created; standard traits are read-only. Returns 201 Created if the trait // is newly inserted, or 204 No Content if it already exists. Returns 400 // Bad Request if the name does not carry the CUSTOM_ prefix. -func HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } @@ -59,11 +62,12 @@ func HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { // cannot be deleted and will return 400 Bad Request. Returns 409 Conflict if // the trait is still associated with any resource provider. Returns 404 if // the trait does not exist. Returns 204 No Content on success. 
-func HandleDeleteTrait(w http.ResponseWriter, r *http.Request) { +func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) { name, ok := requiredPathParam(w, r, "name") if !ok { return } - log := logf.FromContext(r.Context()) + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) } diff --git a/internal/shim/placement/handlers/usages.go b/internal/shim/placement/handle_usages.go similarity index 86% rename from internal/shim/placement/handlers/usages.go rename to internal/shim/placement/handle_usages.go index 2fd5e66a6..32e276c8d 100644 --- a/internal/shim/placement/handlers/usages.go +++ b/internal/shim/placement/handle_usages.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "net/http" @@ -21,7 +21,8 @@ import ( // each group containing resource totals and a consumer_count. Since // microversion 1.38, an optional consumer_type query parameter allows // filtering the results. Available since microversion 1.9. -func HandleListUsages(w http.ResponseWriter, r *http.Request) { - log := logf.FromContext(r.Context()) +func (s *Shim) HandleListUsages(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) } diff --git a/internal/shim/placement/handlers/zz_index.go b/internal/shim/placement/handlers/zz_index.go deleted file mode 100644 index fba5d76b1..000000000 --- a/internal/shim/placement/handlers/zz_index.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package handlers - -import "net/http" - -// RegisterRoutes binds all Placement API handlers to the given mux. The -// route patterns use the Go 1.22+ ServeMux syntax with explicit HTTP methods -// and path wildcards. 
The routes mirror the OpenStack Placement API surface -// as documented at https://docs.openstack.org/api-ref/placement/. -func RegisterRoutes(mux *http.ServeMux) { - // Root - mux.HandleFunc("GET /{$}", HandleGetRoot) - - // Resource providers - mux.HandleFunc("GET /resource_providers", HandleListResourceProviders) - mux.HandleFunc("POST /resource_providers", HandleCreateResourceProvider) - mux.HandleFunc("GET /resource_providers/{uuid}", HandleShowResourceProvider) - mux.HandleFunc("PUT /resource_providers/{uuid}", HandleUpdateResourceProvider) - mux.HandleFunc("DELETE /resource_providers/{uuid}", HandleDeleteResourceProvider) - - // Resource classes - mux.HandleFunc("GET /resource_classes", HandleListResourceClasses) - mux.HandleFunc("POST /resource_classes", HandleCreateResourceClass) - mux.HandleFunc("GET /resource_classes/{name}", HandleShowResourceClass) - mux.HandleFunc("PUT /resource_classes/{name}", HandleUpdateResourceClass) - mux.HandleFunc("DELETE /resource_classes/{name}", HandleDeleteResourceClass) - - // Resource provider inventories - mux.HandleFunc("GET /resource_providers/{uuid}/inventories", HandleListResourceProviderInventories) - mux.HandleFunc("PUT /resource_providers/{uuid}/inventories", HandleUpdateResourceProviderInventories) - mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories", HandleDeleteResourceProviderInventories) - mux.HandleFunc("GET /resource_providers/{uuid}/inventories/{resource_class}", HandleShowResourceProviderInventory) - mux.HandleFunc("PUT /resource_providers/{uuid}/inventories/{resource_class}", HandleUpdateResourceProviderInventory) - mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories/{resource_class}", HandleDeleteResourceProviderInventory) - - // Resource provider aggregates - mux.HandleFunc("GET /resource_providers/{uuid}/aggregates", HandleListResourceProviderAggregates) - mux.HandleFunc("PUT /resource_providers/{uuid}/aggregates", HandleUpdateResourceProviderAggregates) - - // Traits - 
mux.HandleFunc("GET /traits", HandleListTraits) - mux.HandleFunc("GET /traits/{name}", HandleShowTrait) - mux.HandleFunc("PUT /traits/{name}", HandleUpdateTrait) - mux.HandleFunc("DELETE /traits/{name}", HandleDeleteTrait) - - // Resource provider traits - mux.HandleFunc("GET /resource_providers/{uuid}/traits", HandleListResourceProviderTraits) - mux.HandleFunc("PUT /resource_providers/{uuid}/traits", HandleUpdateResourceProviderTraits) - mux.HandleFunc("DELETE /resource_providers/{uuid}/traits", HandleDeleteResourceProviderTraits) - - // Allocations - mux.HandleFunc("POST /allocations", HandleManageAllocations) - mux.HandleFunc("GET /allocations/{consumer_uuid}", HandleListAllocations) - mux.HandleFunc("PUT /allocations/{consumer_uuid}", HandleUpdateAllocations) - mux.HandleFunc("DELETE /allocations/{consumer_uuid}", HandleDeleteAllocations) - - // Resource provider allocations - mux.HandleFunc("GET /resource_providers/{uuid}/allocations", HandleListResourceProviderAllocations) - - // Usages - mux.HandleFunc("GET /usages", HandleListUsages) - - // Resource provider usages - mux.HandleFunc("GET /resource_providers/{uuid}/usages", HandleListResourceProviderUsages) - - // Allocation candidates - mux.HandleFunc("GET /allocation_candidates", HandleListAllocationCandidates) - - // Reshaper - mux.HandleFunc("POST /reshaper", HandlePostReshaper) -} diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go new file mode 100644 index 000000000..e4acf1642 --- /dev/null +++ b/internal/shim/placement/shim.go @@ -0,0 +1,118 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "context" + "net/http" + + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// IndexHypervisorByID is the field index key for looking up Hypervisor +// objects by their OpenStack hypervisor ID (status.hypervisorId). 
+const IndexHypervisorByID = ".status.hypervisorId" + +// Shim is the placement API shim. It holds a controller-runtime client for +// making Kubernetes API calls and exposes HTTP handlers that mirror the +// OpenStack Placement API surface. +type Shim struct { + client.Client +} + +// SetupWithManager registers field indexes on the manager's cache so that +// subsequent list calls are served from the informer cache rather than +// hitting the API server. This must be called before the manager is started. +// +// Calling IndexField internally invokes GetInformer, which creates and +// registers a shared informer for the indexed type (hv1.Hypervisor) with the +// cache. The informer is started later when mgr.Start() is called. This +// means no separate controller or empty Reconcile loop is needed — the +// index registration alone is sufficient to warm the cache. +func (s *Shim) SetupWithManager(mgr ctrl.Manager) error { + return mgr.GetFieldIndexer().IndexField( + context.Background(), + &hv1.Hypervisor{}, + IndexHypervisorByID, + func(obj client.Object) []string { + h, ok := obj.(*hv1.Hypervisor) + if !ok { + return nil + } + if h.Status.HypervisorID == "" { + return nil + } + return []string{h.Status.HypervisorID} + }, + ) +} + +// RegisterRoutes binds all Placement API handlers to the given mux. The +// route patterns use the Go 1.22+ ServeMux syntax with explicit HTTP methods +// and path wildcards. The routes mirror the OpenStack Placement API surface +// as documented at https://docs.openstack.org/api-ref/placement/. 
+func (s *Shim) RegisterRoutes(mux *http.ServeMux) { + // Root + mux.HandleFunc("GET /{$}", s.HandleGetRoot) + + // Resource providers + mux.HandleFunc("GET /resource_providers", s.HandleListResourceProviders) + mux.HandleFunc("POST /resource_providers", s.HandleCreateResourceProvider) + mux.HandleFunc("GET /resource_providers/{uuid}", s.HandleShowResourceProvider) + mux.HandleFunc("PUT /resource_providers/{uuid}", s.HandleUpdateResourceProvider) + mux.HandleFunc("DELETE /resource_providers/{uuid}", s.HandleDeleteResourceProvider) + + // Resource classes + mux.HandleFunc("GET /resource_classes", s.HandleListResourceClasses) + mux.HandleFunc("POST /resource_classes", s.HandleCreateResourceClass) + mux.HandleFunc("GET /resource_classes/{name}", s.HandleShowResourceClass) + mux.HandleFunc("PUT /resource_classes/{name}", s.HandleUpdateResourceClass) + mux.HandleFunc("DELETE /resource_classes/{name}", s.HandleDeleteResourceClass) + + // Resource provider inventories + mux.HandleFunc("GET /resource_providers/{uuid}/inventories", s.HandleListResourceProviderInventories) + mux.HandleFunc("PUT /resource_providers/{uuid}/inventories", s.HandleUpdateResourceProviderInventories) + mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories", s.HandleDeleteResourceProviderInventories) + mux.HandleFunc("GET /resource_providers/{uuid}/inventories/{resource_class}", s.HandleShowResourceProviderInventory) + mux.HandleFunc("PUT /resource_providers/{uuid}/inventories/{resource_class}", s.HandleUpdateResourceProviderInventory) + mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories/{resource_class}", s.HandleDeleteResourceProviderInventory) + + // Resource provider aggregates + mux.HandleFunc("GET /resource_providers/{uuid}/aggregates", s.HandleListResourceProviderAggregates) + mux.HandleFunc("PUT /resource_providers/{uuid}/aggregates", s.HandleUpdateResourceProviderAggregates) + + // Traits + mux.HandleFunc("GET /traits", s.HandleListTraits) + mux.HandleFunc("GET 
/traits/{name}", s.HandleShowTrait) + mux.HandleFunc("PUT /traits/{name}", s.HandleUpdateTrait) + mux.HandleFunc("DELETE /traits/{name}", s.HandleDeleteTrait) + + // Resource provider traits + mux.HandleFunc("GET /resource_providers/{uuid}/traits", s.HandleListResourceProviderTraits) + mux.HandleFunc("PUT /resource_providers/{uuid}/traits", s.HandleUpdateResourceProviderTraits) + mux.HandleFunc("DELETE /resource_providers/{uuid}/traits", s.HandleDeleteResourceProviderTraits) + + // Allocations + mux.HandleFunc("POST /allocations", s.HandleManageAllocations) + mux.HandleFunc("GET /allocations/{consumer_uuid}", s.HandleListAllocations) + mux.HandleFunc("PUT /allocations/{consumer_uuid}", s.HandleUpdateAllocations) + mux.HandleFunc("DELETE /allocations/{consumer_uuid}", s.HandleDeleteAllocations) + + // Resource provider allocations + mux.HandleFunc("GET /resource_providers/{uuid}/allocations", s.HandleListResourceProviderAllocations) + + // Usages + mux.HandleFunc("GET /usages", s.HandleListUsages) + + // Resource provider usages + mux.HandleFunc("GET /resource_providers/{uuid}/usages", s.HandleListResourceProviderUsages) + + // Allocation candidates + mux.HandleFunc("GET /allocation_candidates", s.HandleListAllocationCandidates) + + // Reshaper + mux.HandleFunc("POST /reshaper", s.HandlePostReshaper) +} diff --git a/internal/shim/placement/handlers/validation.go b/internal/shim/placement/validation.go similarity index 98% rename from internal/shim/placement/handlers/validation.go rename to internal/shim/placement/validation.go index 5f8bcf2b9..55a46c406 100644 --- a/internal/shim/placement/handlers/validation.go +++ b/internal/shim/placement/validation.go @@ -1,7 +1,7 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -package handlers +package placement import ( "fmt" From 45778bacd7b214d8b4deca912062b26b48bbf1a7 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Thu, 9 Apr 2026 19:10:53 +0200 Subject: [PATCH 10/17] Connect shim to placement on startup 
--- cmd/shim/main.go | 3 +- .../bundles/cortex-placement-shim/values.yaml | 16 ++ .../cortex-shim/templates/deployment.yaml | 6 +- helm/library/cortex-shim/values.yaml | 4 +- internal/shim/placement/shim.go | 200 ++++++++++++------ 5 files changed, 157 insertions(+), 72 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index ce81d6b44..0a68c7298 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -208,7 +208,8 @@ func main() { mux := http.NewServeMux() if enablePlacementShim { placementShim := &placement.Shim{Client: mgr.GetClient()} - if err := placementShim.SetupWithManager(mgr); err != nil { + setupLog.Info("Adding placement shim to manager") + if err := placementShim.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to set up placement shim") os.Exit(1) } diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml index 6dd793653..2fa998d6b 100644 --- a/helm/bundles/cortex-placement-shim/values.yaml +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -20,8 +20,24 @@ alerts: cortex-shim: namePrefix: cortex-placement + deployment: + container: + extraArgs: ["--placement-shim=true"] conf: monitoring: labels: github_org: cobaltcore-dev github_repo: cortex + # Uncomment and set the following values to enable SSO for the placement + # shim. The shim will use the provided SSO credentials to talk to openstack + # over ingress. 
+ # sso: + # cert: | + # -----BEGIN CERTIFICATE----- + # Your certificate here + # -----END CERTIFICATE----- + # certKey: | + # -----BEGIN PRIVATE KEY----- + # Your private key here + # -----END PRIVATE KEY----- + # selfSigned: "false" diff --git a/helm/library/cortex-shim/templates/deployment.yaml b/helm/library/cortex-shim/templates/deployment.yaml index b38eb3c02..7d658e87c 100644 --- a/helm/library/cortex-shim/templates/deployment.yaml +++ b/helm/library/cortex-shim/templates/deployment.yaml @@ -1,6 +1,3 @@ -# This file is safe from kubebuilder edit --plugins=helm/v1-alpha -# If you want to re-generate, add the --force flag. - {{- if .Values.deployment.enable }} apiVersion: apps/v1 kind: Deployment @@ -32,6 +29,9 @@ spec: {{- range .Values.deployment.container.args }} - {{ . }} {{- end }} + {{- range .Values.deployment.container.extraArgs }} + - {{ . }} + {{- end }} ports: - name: api containerPort: 8080 diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 63574fbe4..3acead93b 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -8,6 +8,7 @@ deployment: - "--metrics-bind-address=:2112" - "--health-probe-bind-address=:8081" - "--metrics-secure=false" + extraArgs: [] resources: limits: cpu: 500m @@ -53,9 +54,6 @@ rbac: prometheus: enable: true -global: - conf: {} - # Use this to unambiguate multiple cortex deployments in the same cluster. namePrefix: cortex conf: {} # No config for now that's needed by all the shims. 
diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index e4acf1642..c15808291 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -5,8 +5,11 @@ package placement import ( "context" + "errors" "net/http" + "github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/sso" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -16,11 +19,77 @@ import ( // objects by their OpenStack hypervisor ID (status.hypervisorId). const IndexHypervisorByID = ".status.hypervisorId" +var ( + // setupLog is a controller-runtime logger used for setup and route + // registration. Individual handlers should use their own loggers derived + // from the request context. + setupLog = ctrl.Log.WithName("placement-shim") +) + +// config holds configuration for the placement shim. +type config struct { + // SSO is an optional reference to a Kubernetes secret containing + // credentials to talk to openstack over ingress via single-sign-on. + SSO *sso.SSOConfig `json:"sso,omitempty"` + // PlacementURL is the URL of the OpenStack Placement API the shim + // should forward requests to. + PlacementURL string `json:"placementURL,omitempty"` +} + // Shim is the placement API shim. It holds a controller-runtime client for // making Kubernetes API calls and exposes HTTP handlers that mirror the // OpenStack Placement API surface. type Shim struct { client.Client + config config + // HTTP client that can talk to openstack placement, if needed, over + // ingress with single-sign-on. + httpClient *http.Client +} + +// Start is called after the manager has started and the cache is running. +// It can be used to perform any initialization that requires the cache to be +// running. 
+func (s *Shim) Start(ctx context.Context) (err error) { + setupLog.Info("Starting placement shim") + s.httpClient = http.DefaultClient + if s.config.SSO != nil { + setupLog.Info("SSO config provided, creating HTTP client for placement API") + s.httpClient, err = sso.NewHTTPClient(*s.config.SSO) + if err != nil { + setupLog.Error(err, "Failed to create HTTP client from SSO config") + return err + } + setupLog.Info("Successfully created HTTP client from SSO config") + } else { + setupLog.Info("No SSO config provided, using default HTTP client for placement API") + } + // Try establish a connection to the placement API to fail fast if the + // configuration is invalid. Directly call the root endpoint for that. + setupLog.Info("Testing connection to placement API", "url", s.config.PlacementURL) + if s.config.PlacementURL == "" { + err := errors.New("placement URL is not configured") + setupLog.Error(err, "Invalid configuration for placement shim") + return err + } + req, err := http.NewRequestWithContext(ctx, "GET", s.config.PlacementURL, nil) + if err != nil { + setupLog.Error(err, "Failed to create HTTP request to placement API") + return err + } + resp, err := s.httpClient.Do(req) + if err != nil { + setupLog.Error(err, "Failed to connect to placement API") + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + err := errors.New("unexpected response from placement API") + setupLog.Error(err, "Failed to call placement API", "status", resp.Status) + return err + } + setupLog.Info("Successfully connected to placement API") + return nil } // SetupWithManager registers field indexes on the manager's cache so that @@ -32,11 +101,19 @@ type Shim struct { // cache. The informer is started later when mgr.Start() is called. This // means no separate controller or empty Reconcile loop is needed — the // index registration alone is sufficient to warm the cache. 
-func (s *Shim) SetupWithManager(mgr ctrl.Manager) error { - return mgr.GetFieldIndexer().IndexField( - context.Background(), - &hv1.Hypervisor{}, - IndexHypervisorByID, +func (s *Shim) SetupWithManager(ctx context.Context, mgr ctrl.Manager) (err error) { + setupLog.Info("Setting up placement shim with manager") + if err := mgr.Add(s); err != nil { // Bind Start(ctx) + setupLog.Error(err, "Failed to bind start routine") + return err + } + s.config, err = conf.GetConfig[config]() + if err != nil { + setupLog.Error(err, "Failed to load placement shim config") + return err + } + setupLog.Info("Indexing Hypervisors by hypervisor ID") + err = mgr.GetFieldIndexer().IndexField(ctx, &hv1.Hypervisor{}, IndexHypervisorByID, func(obj client.Object) []string { h, ok := obj.(*hv1.Hypervisor) if !ok { @@ -48,6 +125,12 @@ func (s *Shim) SetupWithManager(mgr ctrl.Manager) error { return []string{h.Status.HypervisorID} }, ) + if err != nil { + setupLog.Error(err, "Failed to index Hypervisors by hypervisor ID") + return err + } + setupLog.Info("Successfully indexed Hypervisors by hypervisor ID") + return nil } // RegisterRoutes binds all Placement API handlers to the given mux. The @@ -55,64 +138,51 @@ func (s *Shim) SetupWithManager(mgr ctrl.Manager) error { // and path wildcards. The routes mirror the OpenStack Placement API surface // as documented at https://docs.openstack.org/api-ref/placement/. 
func (s *Shim) RegisterRoutes(mux *http.ServeMux) { - // Root - mux.HandleFunc("GET /{$}", s.HandleGetRoot) - - // Resource providers - mux.HandleFunc("GET /resource_providers", s.HandleListResourceProviders) - mux.HandleFunc("POST /resource_providers", s.HandleCreateResourceProvider) - mux.HandleFunc("GET /resource_providers/{uuid}", s.HandleShowResourceProvider) - mux.HandleFunc("PUT /resource_providers/{uuid}", s.HandleUpdateResourceProvider) - mux.HandleFunc("DELETE /resource_providers/{uuid}", s.HandleDeleteResourceProvider) - - // Resource classes - mux.HandleFunc("GET /resource_classes", s.HandleListResourceClasses) - mux.HandleFunc("POST /resource_classes", s.HandleCreateResourceClass) - mux.HandleFunc("GET /resource_classes/{name}", s.HandleShowResourceClass) - mux.HandleFunc("PUT /resource_classes/{name}", s.HandleUpdateResourceClass) - mux.HandleFunc("DELETE /resource_classes/{name}", s.HandleDeleteResourceClass) - - // Resource provider inventories - mux.HandleFunc("GET /resource_providers/{uuid}/inventories", s.HandleListResourceProviderInventories) - mux.HandleFunc("PUT /resource_providers/{uuid}/inventories", s.HandleUpdateResourceProviderInventories) - mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories", s.HandleDeleteResourceProviderInventories) - mux.HandleFunc("GET /resource_providers/{uuid}/inventories/{resource_class}", s.HandleShowResourceProviderInventory) - mux.HandleFunc("PUT /resource_providers/{uuid}/inventories/{resource_class}", s.HandleUpdateResourceProviderInventory) - mux.HandleFunc("DELETE /resource_providers/{uuid}/inventories/{resource_class}", s.HandleDeleteResourceProviderInventory) - - // Resource provider aggregates - mux.HandleFunc("GET /resource_providers/{uuid}/aggregates", s.HandleListResourceProviderAggregates) - mux.HandleFunc("PUT /resource_providers/{uuid}/aggregates", s.HandleUpdateResourceProviderAggregates) - - // Traits - mux.HandleFunc("GET /traits", s.HandleListTraits) - mux.HandleFunc("GET 
/traits/{name}", s.HandleShowTrait) - mux.HandleFunc("PUT /traits/{name}", s.HandleUpdateTrait) - mux.HandleFunc("DELETE /traits/{name}", s.HandleDeleteTrait) - - // Resource provider traits - mux.HandleFunc("GET /resource_providers/{uuid}/traits", s.HandleListResourceProviderTraits) - mux.HandleFunc("PUT /resource_providers/{uuid}/traits", s.HandleUpdateResourceProviderTraits) - mux.HandleFunc("DELETE /resource_providers/{uuid}/traits", s.HandleDeleteResourceProviderTraits) - - // Allocations - mux.HandleFunc("POST /allocations", s.HandleManageAllocations) - mux.HandleFunc("GET /allocations/{consumer_uuid}", s.HandleListAllocations) - mux.HandleFunc("PUT /allocations/{consumer_uuid}", s.HandleUpdateAllocations) - mux.HandleFunc("DELETE /allocations/{consumer_uuid}", s.HandleDeleteAllocations) - - // Resource provider allocations - mux.HandleFunc("GET /resource_providers/{uuid}/allocations", s.HandleListResourceProviderAllocations) - - // Usages - mux.HandleFunc("GET /usages", s.HandleListUsages) - - // Resource provider usages - mux.HandleFunc("GET /resource_providers/{uuid}/usages", s.HandleListResourceProviderUsages) - - // Allocation candidates - mux.HandleFunc("GET /allocation_candidates", s.HandleListAllocationCandidates) - - // Reshaper - mux.HandleFunc("POST /reshaper", s.HandlePostReshaper) + setupLog.Info("Registering placement API routes") + handlers := []struct { + method string + pattern string + handler http.HandlerFunc + }{ + {"GET", "/{$}", s.HandleGetRoot}, + {"GET", "/resource_providers", s.HandleListResourceProviders}, + {"POST", "/resource_providers", s.HandleCreateResourceProvider}, + {"GET", "/resource_providers/{uuid}", s.HandleShowResourceProvider}, + {"PUT", "/resource_providers/{uuid}", s.HandleUpdateResourceProvider}, + {"DELETE", "/resource_providers/{uuid}", s.HandleDeleteResourceProvider}, + {"GET", "/resource_classes", s.HandleListResourceClasses}, + {"POST", "/resource_classes", s.HandleCreateResourceClass}, + {"GET", 
"/resource_classes/{name}", s.HandleShowResourceClass}, + {"PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass}, + {"DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass}, + {"GET", "/resource_providers/{uuid}/inventories", s.HandleListResourceProviderInventories}, + {"PUT", "/resource_providers/{uuid}/inventories", s.HandleUpdateResourceProviderInventories}, + {"DELETE", "/resource_providers/{uuid}/inventories", s.HandleDeleteResourceProviderInventories}, + {"GET", "/resource_providers/{uuid}/inventories/{resource_class}", s.HandleShowResourceProviderInventory}, + {"PUT", "/resource_providers/{uuid}/inventories/{resource_class}", s.HandleUpdateResourceProviderInventory}, + {"DELETE", "/resource_providers/{uuid}/inventories/{resource_class}", s.HandleDeleteResourceProviderInventory}, + {"GET", "/resource_providers/{uuid}/aggregates", s.HandleListResourceProviderAggregates}, + {"PUT", "/resource_providers/{uuid}/aggregates", s.HandleUpdateResourceProviderAggregates}, + {"GET", "/traits", s.HandleListTraits}, + {"GET", "/traits/{name}", s.HandleShowTrait}, + {"PUT", "/traits/{name}", s.HandleUpdateTrait}, + {"DELETE", "/traits/{name}", s.HandleDeleteTrait}, + {"GET", "/resource_providers/{uuid}/traits", s.HandleListResourceProviderTraits}, + {"PUT", "/resource_providers/{uuid}/traits", s.HandleUpdateResourceProviderTraits}, + {"DELETE", "/resource_providers/{uuid}/traits", s.HandleDeleteResourceProviderTraits}, + {"POST", "/allocations", s.HandleManageAllocations}, + {"GET", "/allocations/{consumer_uuid}", s.HandleListAllocations}, + {"PUT", "/allocations/{consumer_uuid}", s.HandleUpdateAllocations}, + {"DELETE", "/allocations/{consumer_uuid}", s.HandleDeleteAllocations}, + {"GET", "/resource_providers/{uuid}/allocations", s.HandleListResourceProviderAllocations}, + {"GET", "/usages", s.HandleListUsages}, + {"GET", "/resource_providers/{uuid}/usages", s.HandleListResourceProviderUsages}, + {"GET", "/allocation_candidates", 
s.HandleListAllocationCandidates}, + {"POST", "/reshaper", s.HandlePostReshaper}, + } + for _, h := range handlers { + setupLog.Info("Registering route", "method", h.method, "pattern", h.pattern) + mux.HandleFunc(h.method+" "+h.pattern, h.handler) + } + setupLog.Info("Successfully registered placement API routes") } From e5cdc1d93a26fb78fec3da81e7491b3f0c0c53ee Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 08:12:45 +0200 Subject: [PATCH 11/17] Implement passthrough for all http handlers --- .../placement/handle_allocation_candidates.go | 1 + internal/shim/placement/handle_allocations.go | 4 ++ internal/shim/placement/handle_reshaper.go | 1 + .../shim/placement/handle_resource_classes.go | 5 ++ .../handle_resource_provider_aggregates.go | 2 + .../handle_resource_provider_allocations.go | 1 + .../handle_resource_provider_inventories.go | 6 ++ .../handle_resource_provider_traits.go | 3 + .../handle_resource_provider_usages.go | 1 + .../placement/handle_resource_providers.go | 5 ++ internal/shim/placement/handle_root.go | 1 + internal/shim/placement/handle_traits.go | 4 ++ internal/shim/placement/handle_usages.go | 1 + internal/shim/placement/shim.go | 68 +++++++++++++++++-- pkg/sso/sso.go | 29 +++++--- 15 files changed, 117 insertions(+), 15 deletions(-) diff --git a/internal/shim/placement/handle_allocation_candidates.go b/internal/shim/placement/handle_allocation_candidates.go index b252d0026..f80b9aa0f 100644 --- a/internal/shim/placement/handle_allocation_candidates.go +++ b/internal/shim/placement/handle_allocation_candidates.go @@ -38,4 +38,5 @@ func (s *Shim) HandleListAllocationCandidates(w http.ResponseWriter, r *http.Req ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_allocations.go b/internal/shim/placement/handle_allocations.go index f406ea5da..ee365d109 100644 --- 
a/internal/shim/placement/handle_allocations.go +++ b/internal/shim/placement/handle_allocations.go @@ -28,6 +28,7 @@ func (s *Shim) HandleManageAllocations(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleListAllocations handles GET /allocations/{consumer_uuid} requests. @@ -50,6 +51,7 @@ func (s *Shim) HandleListAllocations(w http.ResponseWriter, r *http.Request) { log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) + s.forward(w, r) } // HandleUpdateAllocations handles PUT /allocations/{consumer_uuid} requests. @@ -72,6 +74,7 @@ func (s *Shim) HandleUpdateAllocations(w http.ResponseWriter, r *http.Request) { log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) + s.forward(w, r) } // HandleDeleteAllocations handles DELETE /allocations/{consumer_uuid} requests. 
@@ -88,4 +91,5 @@ func (s *Shim) HandleDeleteAllocations(w http.ResponseWriter, r *http.Request) { log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "consumer_uuid", consumerUUID) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_reshaper.go b/internal/shim/placement/handle_reshaper.go index fe0d85069..f08af7f9a 100644 --- a/internal/shim/placement/handle_reshaper.go +++ b/internal/shim/placement/handle_reshaper.go @@ -28,4 +28,5 @@ func (s *Shim) HandlePostReshaper(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_classes.go b/internal/shim/placement/handle_resource_classes.go index 554c43034..407071e26 100644 --- a/internal/shim/placement/handle_resource_classes.go +++ b/internal/shim/placement/handle_resource_classes.go @@ -20,6 +20,7 @@ func (s *Shim) HandleListResourceClasses(w http.ResponseWriter, r *http.Request) ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleCreateResourceClass handles POST /resource_classes requests. @@ -33,6 +34,7 @@ func (s *Shim) HandleCreateResourceClass(w http.ResponseWriter, r *http.Request) ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleShowResourceClass handles GET /resource_classes/{name} requests. @@ -48,6 +50,7 @@ func (s *Shim) HandleShowResourceClass(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } // HandleUpdateResourceClass handles PUT /resource_classes/{name} requests. 
@@ -66,6 +69,7 @@ func (s *Shim) HandleUpdateResourceClass(w http.ResponseWriter, r *http.Request) ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } // HandleDeleteResourceClass handles DELETE /resource_classes/{name} requests. @@ -83,4 +87,5 @@ func (s *Shim) HandleDeleteResourceClass(w http.ResponseWriter, r *http.Request) ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_provider_aggregates.go b/internal/shim/placement/handle_resource_provider_aggregates.go index ce8febe50..c270f6730 100644 --- a/internal/shim/placement/handle_resource_provider_aggregates.go +++ b/internal/shim/placement/handle_resource_provider_aggregates.go @@ -30,6 +30,7 @@ func (s *Shim) HandleListResourceProviderAggregates(w http.ResponseWriter, r *ht ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleUpdateResourceProviderAggregates handles @@ -50,4 +51,5 @@ func (s *Shim) HandleUpdateResourceProviderAggregates(w http.ResponseWriter, r * ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_provider_allocations.go b/internal/shim/placement/handle_resource_provider_allocations.go index b3f6dcd68..e36bbebd9 100644 --- a/internal/shim/placement/handle_resource_provider_allocations.go +++ b/internal/shim/placement/handle_resource_provider_allocations.go @@ -25,4 +25,5 @@ func (s *Shim) HandleListResourceProviderAllocations(w http.ResponseWriter, r *h ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + 
s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_provider_inventories.go b/internal/shim/placement/handle_resource_provider_inventories.go index c79f924b6..20d1c52dc 100644 --- a/internal/shim/placement/handle_resource_provider_inventories.go +++ b/internal/shim/placement/handle_resource_provider_inventories.go @@ -26,6 +26,7 @@ func (s *Shim) HandleListResourceProviderInventories(w http.ResponseWriter, r *h ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleUpdateResourceProviderInventories handles @@ -46,6 +47,7 @@ func (s *Shim) HandleUpdateResourceProviderInventories(w http.ResponseWriter, r ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleDeleteResourceProviderInventories handles @@ -65,6 +67,7 @@ func (s *Shim) HandleDeleteResourceProviderInventories(w http.ResponseWriter, r ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleShowResourceProviderInventory handles @@ -87,6 +90,7 @@ func (s *Shim) HandleShowResourceProviderInventory(w http.ResponseWriter, r *htt log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) + s.forward(w, r) } // HandleUpdateResourceProviderInventory handles @@ -111,6 +115,7 @@ func (s *Shim) HandleUpdateResourceProviderInventory(w http.ResponseWriter, r *h log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) + s.forward(w, r) } // HandleDeleteResourceProviderInventory handles @@ -133,4 +138,5 @@ func (s *Shim) HandleDeleteResourceProviderInventory(w http.ResponseWriter, r *h log := logf.FromContext(ctx) 
log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid, "resource_class", resourceClass) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_provider_traits.go b/internal/shim/placement/handle_resource_provider_traits.go index 5c18bf85b..75250a76e 100644 --- a/internal/shim/placement/handle_resource_provider_traits.go +++ b/internal/shim/placement/handle_resource_provider_traits.go @@ -24,6 +24,7 @@ func (s *Shim) HandleListResourceProviderTraits(w http.ResponseWriter, r *http.R ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleUpdateResourceProviderTraits handles @@ -44,6 +45,7 @@ func (s *Shim) HandleUpdateResourceProviderTraits(w http.ResponseWriter, r *http ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleDeleteResourceProviderTraits handles @@ -63,4 +65,5 @@ func (s *Shim) HandleDeleteResourceProviderTraits(w http.ResponseWriter, r *http ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_provider_usages.go b/internal/shim/placement/handle_resource_provider_usages.go index 78c7cc450..c13d0ae65 100644 --- a/internal/shim/placement/handle_resource_provider_usages.go +++ b/internal/shim/placement/handle_resource_provider_usages.go @@ -25,4 +25,5 @@ func (s *Shim) HandleListResourceProviderUsages(w http.ResponseWriter, r *http.R ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_resource_providers.go b/internal/shim/placement/handle_resource_providers.go index de6213e43..b7a21018f 100644 --- 
a/internal/shim/placement/handle_resource_providers.go +++ b/internal/shim/placement/handle_resource_providers.go @@ -26,6 +26,7 @@ func (s *Shim) HandleListResourceProviders(w http.ResponseWriter, r *http.Reques ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleCreateResourceProvider handles POST /resource_providers requests. @@ -44,6 +45,7 @@ func (s *Shim) HandleCreateResourceProvider(w http.ResponseWriter, r *http.Reque ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleShowResourceProvider handles GET /resource_providers/{uuid} requests. @@ -62,6 +64,7 @@ func (s *Shim) HandleShowResourceProvider(w http.ResponseWriter, r *http.Request ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleUpdateResourceProvider handles PUT /resource_providers/{uuid} requests. @@ -79,6 +82,7 @@ func (s *Shim) HandleUpdateResourceProvider(w http.ResponseWriter, r *http.Reque ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } // HandleDeleteResourceProvider handles DELETE /resource_providers/{uuid} requests. 
@@ -95,4 +99,5 @@ func (s *Shim) HandleDeleteResourceProvider(w http.ResponseWriter, r *http.Reque ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "uuid", uuid) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_root.go b/internal/shim/placement/handle_root.go index d734d4528..10821bf42 100644 --- a/internal/shim/placement/handle_root.go +++ b/internal/shim/placement/handle_root.go @@ -21,4 +21,5 @@ func (s *Shim) HandleGetRoot(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_traits.go b/internal/shim/placement/handle_traits.go index 0d6ba8e32..7cb645552 100644 --- a/internal/shim/placement/handle_traits.go +++ b/internal/shim/placement/handle_traits.go @@ -24,6 +24,7 @@ func (s *Shim) HandleListTraits(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } // HandleShowTrait handles GET /traits/{name} requests. @@ -38,6 +39,7 @@ func (s *Shim) HandleShowTrait(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } // HandleUpdateTrait handles PUT /traits/{name} requests. @@ -54,6 +56,7 @@ func (s *Shim) HandleUpdateTrait(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } // HandleDeleteTrait handles DELETE /traits/{name} requests. 
@@ -70,4 +73,5 @@ func (s *Shim) HandleDeleteTrait(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path, "name", name) + s.forward(w, r) } diff --git a/internal/shim/placement/handle_usages.go b/internal/shim/placement/handle_usages.go index 32e276c8d..2e3308c1e 100644 --- a/internal/shim/placement/handle_usages.go +++ b/internal/shim/placement/handle_usages.go @@ -25,4 +25,5 @@ func (s *Shim) HandleListUsages(w http.ResponseWriter, r *http.Request) { ctx := r.Context() log := logf.FromContext(ctx) log.Info("placement request", "method", r.Method, "path", r.URL.Path) + s.forward(w, r) } diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index c15808291..4ef93c66a 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -6,13 +6,16 @@ package placement import ( "context" "errors" + "io" "net/http" + "time" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/sso" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" ) // IndexHypervisorByID is the field index key for looking up Hypervisor @@ -52,18 +55,29 @@ type Shim struct { // running. func (s *Shim) Start(ctx context.Context) (err error) { setupLog.Info("Starting placement shim") - s.httpClient = http.DefaultClient + // Build the transport with optional SSO TLS credentials. 
+ var transport *http.Transport if s.config.SSO != nil { - setupLog.Info("SSO config provided, creating HTTP client for placement API") - s.httpClient, err = sso.NewHTTPClient(*s.config.SSO) + setupLog.Info("SSO config provided, creating transport for placement API") + transport, err = sso.NewTransport(*s.config.SSO) if err != nil { - setupLog.Error(err, "Failed to create HTTP client from SSO config") + setupLog.Error(err, "Failed to create transport from SSO config") return err } - setupLog.Info("Successfully created HTTP client from SSO config") } else { - setupLog.Info("No SSO config provided, using default HTTP client for placement API") + setupLog.Info("No SSO config provided, using plain transport for placement API") + transport = &http.Transport{} } + // All proxy traffic goes to one placement API host, so raise the + // per-host idle connection limit from the default of 2. + transport.MaxIdleConns = 100 + transport.MaxIdleConnsPerHost = 100 + // Guard against a hung upstream or slow TLS negotiation. + transport.TLSHandshakeTimeout = 10 * time.Second + transport.ResponseHeaderTimeout = 60 * time.Second + transport.ExpectContinueTimeout = 1 * time.Second + transport.IdleConnTimeout = 90 * time.Second + s.httpClient = &http.Client{Transport: transport} // Try establish a connection to the placement API to fail fast if the // configuration is invalid. Directly call the root endpoint for that. setupLog.Info("Testing connection to placement API", "url", s.config.PlacementURL) @@ -92,6 +106,48 @@ func (s *Shim) Start(ctx context.Context) (err error) { return nil } +// forward proxies the incoming HTTP request to the upstream placement API +// and copies the response (status, headers, body) back to the client. +func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { + log := logf.FromContext(r.Context()) + + // Build upstream URL: config.PlacementURL + original path + query string. 
+ upstreamURL := s.config.PlacementURL + r.URL.Path + if r.URL.RawQuery != "" { + upstreamURL += "?" + r.URL.RawQuery + } + + // Create upstream request preserving method, body, and context. + upstreamReq, err := http.NewRequestWithContext(r.Context(), r.Method, upstreamURL, r.Body) + if err != nil { + log.Error(err, "failed to create upstream request", "url", upstreamURL) + http.Error(w, "failed to create upstream request", http.StatusBadGateway) + return + } + + // Copy all incoming headers. + upstreamReq.Header = r.Header.Clone() + + resp, err := s.httpClient.Do(upstreamReq) + if err != nil { + log.Error(err, "failed to reach placement API", "url", upstreamURL) + http.Error(w, "failed to reach placement API", http.StatusBadGateway) + return + } + defer resp.Body.Close() + + // Copy response headers, status code, and body back to the caller. + for k, vs := range resp.Header { + for _, v := range vs { + w.Header().Add(k, v) + } + } + w.WriteHeader(resp.StatusCode) + if _, err := io.Copy(w, resp.Body); err != nil { + log.Error(err, "failed to copy upstream response body") + } +} + // SetupWithManager registers field indexes on the manager's cache so that // subsequent list calls are served from the informer cache rather than // hitting the API server. This must be called before the manager is started. diff --git a/pkg/sso/sso.go b/pkg/sso/sso.go index 069533518..c5535d915 100644 --- a/pkg/sso/sso.go +++ b/pkg/sso/sso.go @@ -70,15 +70,13 @@ func (c Connector) FromSecretRef(ctx context.Context, ref corev1.SecretReference return NewHTTPClient(conf) } -// Create a new HTTP client with the given SSO configuration -// and logging for each request. -func NewHTTPClient(conf SSOConfig) (*http.Client, error) { +// NewTransport returns an *http.Transport configured with TLS client +// certificates from the given SSO config. If no certificate is provided, +// a plain *http.Transport is returned. 
+func NewTransport(conf SSOConfig) (*http.Transport, error) { if conf.Cert == "" { - // Disable SSO if no certificate is provided. - slog.Debug("making http requests without SSO") - return &http.Client{Transport: &requestLogger{T: &http.Transport{}}}, nil + return &http.Transport{}, nil } - // If we have a public key, we also need a private key. if conf.CertKey == "" { return nil, errors.New("missing cert key for SSO") } @@ -91,7 +89,7 @@ func NewHTTPClient(conf SSOConfig) (*http.Client, error) { } caCertPool := x509.NewCertPool() caCertPool.AddCert(cert.Leaf) - return &http.Client{Transport: &requestLogger{T: &http.Transport{ + return &http.Transport{ TLSClientConfig: &tls.Config{ Certificates: []tls.Certificate{cert}, RootCAs: caCertPool, @@ -99,5 +97,18 @@ func NewHTTPClient(conf SSOConfig) (*http.Client, error) { //nolint:gosec InsecureSkipVerify: conf.SelfSigned, }, - }}}, nil + }, nil +} + +// Create a new HTTP client with the given SSO configuration +// and logging for each request. 
+func NewHTTPClient(conf SSOConfig) (*http.Client, error) { + transport, err := NewTransport(conf) + if err != nil { + return nil, err + } + if conf.Cert == "" { + slog.Debug("making http requests without SSO") + } + return &http.Client{Transport: &requestLogger{T: transport}}, nil } From 5822dafea7b0ec8668f0e02a22ff123e110233d8 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 08:34:21 +0200 Subject: [PATCH 12/17] Unit tests and linting --- .../handle_allocation_candidates_test.go | 21 ++ .../shim/placement/handle_allocations_test.go | 78 +++++++ .../shim/placement/handle_reshaper_test.go | 21 ++ .../placement/handle_resource_classes_test.go | 57 +++++ ...andle_resource_provider_aggregates_test.go | 51 +++++ ...ndle_resource_provider_allocations_test.go | 30 +++ ...ndle_resource_provider_inventories_test.go | 139 ++++++++++++ .../handle_resource_provider_traits_test.go | 72 ++++++ .../handle_resource_provider_usages_test.go | 30 +++ .../handle_resource_providers_test.go | 86 +++++++ internal/shim/placement/handle_root_test.go | 21 ++ internal/shim/placement/handle_traits_test.go | 49 ++++ internal/shim/placement/handle_usages_test.go | 21 ++ internal/shim/placement/shim.go | 2 +- internal/shim/placement/shim_test.go | 209 ++++++++++++++++++ internal/shim/placement/validation.go | 2 +- internal/shim/placement/validation_test.go | 89 ++++++++ 17 files changed, 976 insertions(+), 2 deletions(-) create mode 100644 internal/shim/placement/handle_allocation_candidates_test.go create mode 100644 internal/shim/placement/handle_allocations_test.go create mode 100644 internal/shim/placement/handle_reshaper_test.go create mode 100644 internal/shim/placement/handle_resource_classes_test.go create mode 100644 internal/shim/placement/handle_resource_provider_aggregates_test.go create mode 100644 internal/shim/placement/handle_resource_provider_allocations_test.go create mode 100644 internal/shim/placement/handle_resource_provider_inventories_test.go create 
mode 100644 internal/shim/placement/handle_resource_provider_traits_test.go create mode 100644 internal/shim/placement/handle_resource_provider_usages_test.go create mode 100644 internal/shim/placement/handle_resource_providers_test.go create mode 100644 internal/shim/placement/handle_root_test.go create mode 100644 internal/shim/placement/handle_traits_test.go create mode 100644 internal/shim/placement/handle_usages_test.go create mode 100644 internal/shim/placement/shim_test.go create mode 100644 internal/shim/placement/validation_test.go diff --git a/internal/shim/placement/handle_allocation_candidates_test.go b/internal/shim/placement/handle_allocation_candidates_test.go new file mode 100644 index 000000000..de75a96af --- /dev/null +++ b/internal/shim/placement/handle_allocation_candidates_test.go @@ -0,0 +1,21 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListAllocationCandidates(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"allocation_requests":[]}`, &gotPath) + w := serveHandler(t, "GET", "/allocation_candidates", s.HandleListAllocationCandidates, "/allocation_candidates") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/allocation_candidates" { + t.Fatalf("upstream path = %q, want /allocation_candidates", gotPath) + } +} diff --git a/internal/shim/placement/handle_allocations_test.go b/internal/shim/placement/handle_allocations_test.go new file mode 100644 index 000000000..c42cf86e0 --- /dev/null +++ b/internal/shim/placement/handle_allocations_test.go @@ -0,0 +1,78 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleManageAllocations(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusNoContent, "", &gotPath) + w := serveHandler(t, "POST", "/allocations", 
s.HandleManageAllocations, "/allocations") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + if gotPath != "/allocations" { + t.Fatalf("upstream path = %q, want /allocations", gotPath) + } +} + +func TestHandleListAllocations(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/allocations/{consumer_uuid}", + s.HandleListAllocations, "/allocations/"+validUUID) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/allocations/{consumer_uuid}", + s.HandleListAllocations, "/allocations/bad") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleUpdateAllocations(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "PUT", "/allocations/{consumer_uuid}", + s.HandleUpdateAllocations, "/allocations/"+validUUID) + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/allocations/{consumer_uuid}", + s.HandleUpdateAllocations, "/allocations/bad") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleDeleteAllocations(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/allocations/{consumer_uuid}", + s.HandleDeleteAllocations, "/allocations/"+validUUID) + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } 
+ }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "DELETE", "/allocations/{consumer_uuid}", + s.HandleDeleteAllocations, "/allocations/bad") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_reshaper_test.go b/internal/shim/placement/handle_reshaper_test.go new file mode 100644 index 000000000..e00eff2e2 --- /dev/null +++ b/internal/shim/placement/handle_reshaper_test.go @@ -0,0 +1,21 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandlePostReshaper(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusNoContent, "", &gotPath) + w := serveHandler(t, "POST", "/reshaper", s.HandlePostReshaper, "/reshaper") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + if gotPath != "/reshaper" { + t.Fatalf("upstream path = %q, want /reshaper", gotPath) + } +} diff --git a/internal/shim/placement/handle_resource_classes_test.go b/internal/shim/placement/handle_resource_classes_test.go new file mode 100644 index 000000000..80ffdf40e --- /dev/null +++ b/internal/shim/placement/handle_resource_classes_test.go @@ -0,0 +1,57 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceClasses(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"resource_classes":[]}`, &gotPath) + w := serveHandler(t, "GET", "/resource_classes", s.HandleListResourceClasses, "/resource_classes") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/resource_classes" { + t.Fatalf("upstream path = %q, want /resource_classes", gotPath) + } +} + +func TestHandleCreateResourceClass(t *testing.T) { 
+ s := newTestShim(t, http.StatusCreated, "{}", nil) + w := serveHandler(t, "POST", "/resource_classes", s.HandleCreateResourceClass, "/resource_classes") + if w.Code != http.StatusCreated { + t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated) + } +} + +func TestHandleShowResourceClass(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, "{}", &gotPath) + w := serveHandler(t, "GET", "/resource_classes/{name}", s.HandleShowResourceClass, "/resource_classes/VCPU") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/resource_classes/VCPU" { + t.Fatalf("upstream path = %q, want /resource_classes/VCPU", gotPath) + } +} + +func TestHandleUpdateResourceClass(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "PUT", "/resource_classes/{name}", s.HandleUpdateResourceClass, "/resource_classes/CUSTOM_FOO") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } +} + +func TestHandleDeleteResourceClass(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/resource_classes/{name}", s.HandleDeleteResourceClass, "/resource_classes/CUSTOM_BAR") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } +} diff --git a/internal/shim/placement/handle_resource_provider_aggregates_test.go b/internal/shim/placement/handle_resource_provider_aggregates_test.go new file mode 100644 index 000000000..f55b09fed --- /dev/null +++ b/internal/shim/placement/handle_resource_provider_aggregates_test.go @@ -0,0 +1,51 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviderAggregates(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", 
"/resource_providers/{uuid}/aggregates", + s.HandleListResourceProviderAggregates, + "/resource_providers/"+validUUID+"/aggregates") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/aggregates", + s.HandleListResourceProviderAggregates, + "/resource_providers/not-a-uuid/aggregates") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleUpdateResourceProviderAggregates(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/aggregates", + s.HandleUpdateResourceProviderAggregates, + "/resource_providers/"+validUUID+"/aggregates") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/aggregates", + s.HandleUpdateResourceProviderAggregates, + "/resource_providers/not-a-uuid/aggregates") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_resource_provider_allocations_test.go b/internal/shim/placement/handle_resource_provider_allocations_test.go new file mode 100644 index 000000000..98834afab --- /dev/null +++ b/internal/shim/placement/handle_resource_provider_allocations_test.go @@ -0,0 +1,30 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviderAllocations(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := 
serveHandler(t, "GET", "/resource_providers/{uuid}/allocations", + s.HandleListResourceProviderAllocations, + "/resource_providers/"+validUUID+"/allocations") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/allocations", + s.HandleListResourceProviderAllocations, + "/resource_providers/not-a-uuid/allocations") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_resource_provider_inventories_test.go b/internal/shim/placement/handle_resource_provider_inventories_test.go new file mode 100644 index 000000000..054e48e32 --- /dev/null +++ b/internal/shim/placement/handle_resource_provider_inventories_test.go @@ -0,0 +1,139 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviderInventories(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/inventories", + s.HandleListResourceProviderInventories, + "/resource_providers/"+validUUID+"/inventories") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/inventories", + s.HandleListResourceProviderInventories, + "/resource_providers/not-a-uuid/inventories") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleUpdateResourceProviderInventories(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, 
http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/inventories", + s.HandleUpdateResourceProviderInventories, + "/resource_providers/"+validUUID+"/inventories") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/inventories", + s.HandleUpdateResourceProviderInventories, + "/resource_providers/not-a-uuid/inventories") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleDeleteResourceProviderInventories(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/inventories", + s.HandleDeleteResourceProviderInventories, + "/resource_providers/"+validUUID+"/inventories") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/inventories", + s.HandleDeleteResourceProviderInventories, + "/resource_providers/not-a-uuid/inventories") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleShowResourceProviderInventory(t *testing.T) { + t.Run("valid", func(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, "{}", &gotPath) + path := "/resource_providers/" + validUUID + "/inventories/VCPU" + w := serveHandler(t, "GET", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleShowResourceProviderInventory, path) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != path { + 
t.Fatalf("upstream path = %q, want %q", gotPath, path) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleShowResourceProviderInventory, + "/resource_providers/not-a-uuid/inventories/VCPU") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleUpdateResourceProviderInventory(t *testing.T) { + t.Run("valid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleUpdateResourceProviderInventory, + "/resource_providers/"+validUUID+"/inventories/VCPU") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleUpdateResourceProviderInventory, + "/resource_providers/not-a-uuid/inventories/VCPU") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleDeleteResourceProviderInventory(t *testing.T) { + t.Run("valid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleDeleteResourceProviderInventory, + "/resource_providers/"+validUUID+"/inventories/VCPU") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/inventories/{resource_class}", + s.HandleDeleteResourceProviderInventory, + 
"/resource_providers/not-a-uuid/inventories/VCPU") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_resource_provider_traits_test.go b/internal/shim/placement/handle_resource_provider_traits_test.go new file mode 100644 index 000000000..809f0503f --- /dev/null +++ b/internal/shim/placement/handle_resource_provider_traits_test.go @@ -0,0 +1,72 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviderTraits(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits", + s.HandleListResourceProviderTraits, + "/resource_providers/"+validUUID+"/traits") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/traits", + s.HandleListResourceProviderTraits, + "/resource_providers/not-a-uuid/traits") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleUpdateResourceProviderTraits(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/traits", + s.HandleUpdateResourceProviderTraits, + "/resource_providers/"+validUUID+"/traits") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}/traits", + s.HandleUpdateResourceProviderTraits, + "/resource_providers/not-a-uuid/traits") + if 
w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleDeleteResourceProviderTraits(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/traits", + s.HandleDeleteResourceProviderTraits, + "/resource_providers/"+validUUID+"/traits") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}/traits", + s.HandleDeleteResourceProviderTraits, + "/resource_providers/not-a-uuid/traits") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_resource_provider_usages_test.go b/internal/shim/placement/handle_resource_provider_usages_test.go new file mode 100644 index 000000000..76541a993 --- /dev/null +++ b/internal/shim/placement/handle_resource_provider_usages_test.go @@ -0,0 +1,30 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviderUsages(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/usages", + s.HandleListResourceProviderUsages, + "/resource_providers/"+validUUID+"/usages") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}/usages", + s.HandleListResourceProviderUsages, + "/resource_providers/not-a-uuid/usages") + if w.Code != http.StatusBadRequest { + 
t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_resource_providers_test.go b/internal/shim/placement/handle_resource_providers_test.go new file mode 100644 index 000000000..520a32c0b --- /dev/null +++ b/internal/shim/placement/handle_resource_providers_test.go @@ -0,0 +1,86 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListResourceProviders(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"resource_providers":[]}`, &gotPath) + w := serveHandler(t, "GET", "/resource_providers", s.HandleListResourceProviders, "/resource_providers") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/resource_providers" { + t.Fatalf("upstream path = %q, want /resource_providers", gotPath) + } +} + +func TestHandleCreateResourceProvider(t *testing.T) { + s := newTestShim(t, http.StatusCreated, "{}", nil) + w := serveHandler(t, "POST", "/resource_providers", s.HandleCreateResourceProvider, "/resource_providers") + if w.Code != http.StatusCreated { + t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated) + } +} + +func TestHandleShowResourceProvider(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}", s.HandleShowResourceProvider, + "/resource_providers/"+validUUID) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "GET", "/resource_providers/{uuid}", s.HandleShowResourceProvider, + "/resource_providers/not-a-uuid") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func 
TestHandleUpdateResourceProvider(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}", s.HandleUpdateResourceProvider, + "/resource_providers/"+validUUID) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "PUT", "/resource_providers/{uuid}", s.HandleUpdateResourceProvider, + "/resource_providers/not-a-uuid") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestHandleDeleteResourceProvider(t *testing.T) { + t.Run("valid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}", s.HandleDeleteResourceProvider, + "/resource_providers/"+validUUID) + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + }) + t.Run("invalid uuid", func(t *testing.T) { + s := newTestShim(t, http.StatusOK, "{}", nil) + w := serveHandler(t, "DELETE", "/resource_providers/{uuid}", s.HandleDeleteResourceProvider, + "/resource_providers/not-a-uuid") + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} diff --git a/internal/shim/placement/handle_root_test.go b/internal/shim/placement/handle_root_test.go new file mode 100644 index 000000000..e342f6a68 --- /dev/null +++ b/internal/shim/placement/handle_root_test.go @@ -0,0 +1,21 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleGetRoot(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"versions":[]}`, &gotPath) + w := serveHandler(t, "GET", "/{$}", s.HandleGetRoot, "/") + if w.Code != 
http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/" { + t.Fatalf("upstream path = %q, want %q", gotPath, "/") + } +} diff --git a/internal/shim/placement/handle_traits_test.go b/internal/shim/placement/handle_traits_test.go new file mode 100644 index 000000000..09d5a8586 --- /dev/null +++ b/internal/shim/placement/handle_traits_test.go @@ -0,0 +1,49 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListTraits(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"traits":[]}`, &gotPath) + w := serveHandler(t, "GET", "/traits", s.HandleListTraits, "/traits") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/traits" { + t.Fatalf("upstream path = %q, want /traits", gotPath) + } +} + +func TestHandleShowTrait(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusNoContent, "", &gotPath) + w := serveHandler(t, "GET", "/traits/{name}", s.HandleShowTrait, "/traits/HW_CPU_X86_AVX2") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } + if gotPath != "/traits/HW_CPU_X86_AVX2" { + t.Fatalf("upstream path = %q, want /traits/HW_CPU_X86_AVX2", gotPath) + } +} + +func TestHandleUpdateTrait(t *testing.T) { + s := newTestShim(t, http.StatusCreated, "", nil) + w := serveHandler(t, "PUT", "/traits/{name}", s.HandleUpdateTrait, "/traits/CUSTOM_TRAIT") + if w.Code != http.StatusCreated { + t.Fatalf("status = %d, want %d", w.Code, http.StatusCreated) + } +} + +func TestHandleDeleteTrait(t *testing.T) { + s := newTestShim(t, http.StatusNoContent, "", nil) + w := serveHandler(t, "DELETE", "/traits/{name}", s.HandleDeleteTrait, "/traits/CUSTOM_TRAIT") + if w.Code != http.StatusNoContent { + t.Fatalf("status = %d, want %d", w.Code, http.StatusNoContent) + } +} diff --git 
a/internal/shim/placement/handle_usages_test.go b/internal/shim/placement/handle_usages_test.go new file mode 100644 index 000000000..46d91681b --- /dev/null +++ b/internal/shim/placement/handle_usages_test.go @@ -0,0 +1,21 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "testing" +) + +func TestHandleListUsages(t *testing.T) { + var gotPath string + s := newTestShim(t, http.StatusOK, `{"usages":{}}`, &gotPath) + w := serveHandler(t, "GET", "/usages", s.HandleListUsages, "/usages") + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", w.Code, http.StatusOK) + } + if gotPath != "/usages" { + t.Fatalf("upstream path = %q, want /usages", gotPath) + } +} diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index 4ef93c66a..2623a3baa 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -86,7 +86,7 @@ func (s *Shim) Start(ctx context.Context) (err error) { setupLog.Error(err, "Invalid configuration for placement shim") return err } - req, err := http.NewRequestWithContext(ctx, "GET", s.config.PlacementURL, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.config.PlacementURL, http.NoBody) if err != nil { setupLog.Error(err, "Failed to create HTTP request to placement API") return err diff --git a/internal/shim/placement/shim_test.go b/internal/shim/placement/shim_test.go new file mode 100644 index 000000000..fc81d9b1f --- /dev/null +++ b/internal/shim/placement/shim_test.go @@ -0,0 +1,209 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +const validUUID = "d9b3a520-2a3c-4f6b-8b9a-1c2d3e4f5a6b" + +// newTestShim creates a Shim backed by an upstream test server that returns +// the given status and body for every request. It records the last request +// path in *gotPath when non-nil. 
+func newTestShim(t *testing.T, status int, body string, gotPath *string) *Shim { + t.Helper() + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if gotPath != nil { + *gotPath = r.URL.Path + } + w.WriteHeader(status) + if _, err := w.Write([]byte(body)); err != nil { + t.Errorf("failed to write response body: %v", err) + } + })) + t.Cleanup(upstream.Close) + return &Shim{ + config: config{PlacementURL: upstream.URL}, + httpClient: upstream.Client(), + } +} + +// serveHandler registers a single handler on a fresh mux and serves the +// request through it, returning the recorded response. +func serveHandler(t *testing.T, method, pattern string, handler http.HandlerFunc, reqPath string) *httptest.ResponseRecorder { + t.Helper() + mux := http.NewServeMux() + mux.HandleFunc(method+" "+pattern, handler) + req := httptest.NewRequest(method, reqPath, http.NoBody) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + return w +} + +func TestForward(t *testing.T) { + tests := []struct { + name string + path string + query string + method string + body string + reqHeaders map[string]string + upstreamStatus int + upstreamBody string + upstreamHeader map[string]string + }{ + { + name: "GET with query string", + path: "/resource_providers", + query: "name=test", + method: "GET", + upstreamStatus: http.StatusOK, + upstreamBody: `{"resource_providers":[]}`, + upstreamHeader: map[string]string{"Content-Type": "application/json"}, + }, + { + name: "PUT with body and headers", + path: "/resource_providers/abc", + method: "PUT", + body: `{"name":"new"}`, + reqHeaders: map[string]string{"X-Custom": "val"}, + upstreamStatus: http.StatusOK, + upstreamBody: `{"uuid":"abc"}`, + }, + { + name: "upstream error", + path: "/fail", + method: "GET", + upstreamStatus: http.StatusNotFound, + upstreamBody: "not found", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + upstream := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Verify the path and query were forwarded. + if r.URL.Path != tt.path { + t.Errorf("upstream path = %q, want %q", r.URL.Path, tt.path) + } + if r.URL.RawQuery != tt.query { + t.Errorf("upstream query = %q, want %q", r.URL.RawQuery, tt.query) + } + if r.Method != tt.method { + t.Errorf("upstream method = %q, want %q", r.Method, tt.method) + } + // Verify headers were copied. + for k, v := range tt.reqHeaders { + if got := r.Header.Get(k); got != v { + t.Errorf("upstream header %q = %q, want %q", k, got, v) + } + } + // Verify body was copied. + if tt.body != "" { + b, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("failed to read upstream body: %v", err) + } + if string(b) != tt.body { + t.Errorf("upstream body = %q, want %q", string(b), tt.body) + } + } + for k, v := range tt.upstreamHeader { + w.Header().Set(k, v) + } + w.WriteHeader(tt.upstreamStatus) + if _, err := w.Write([]byte(tt.upstreamBody)); err != nil { + t.Fatalf("failed to write upstream body: %v", err) + } + })) + defer upstream.Close() + + s := &Shim{ + config: config{PlacementURL: upstream.URL}, + httpClient: upstream.Client(), + } + target := tt.path + if tt.query != "" { + target += "?" 
+ tt.query + } + var bodyReader io.Reader + if tt.body != "" { + bodyReader = strings.NewReader(tt.body) + } + req := httptest.NewRequest(tt.method, target, bodyReader) + for k, v := range tt.reqHeaders { + req.Header.Set(k, v) + } + w := httptest.NewRecorder() + s.forward(w, req) + + if w.Code != tt.upstreamStatus { + t.Fatalf("status = %d, want %d", w.Code, tt.upstreamStatus) + } + if got := w.Body.String(); got != tt.upstreamBody { + t.Fatalf("body = %q, want %q", got, tt.upstreamBody) + } + for k, v := range tt.upstreamHeader { + if got := w.Header().Get(k); got != v { + t.Errorf("response header %q = %q, want %q", k, got, v) + } + } + }) + } +} + +func TestForwardUpstreamUnreachable(t *testing.T) { + s := &Shim{ + config: config{PlacementURL: "http://127.0.0.1:1"}, + httpClient: &http.Client{}, + } + req := httptest.NewRequest(http.MethodGet, "/", http.NoBody) + w := httptest.NewRecorder() + s.forward(w, req) + if w.Code != http.StatusBadGateway { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadGateway) + } +} + +func TestRegisterRoutes(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer upstream.Close() + s := &Shim{ + config: config{PlacementURL: upstream.URL}, + httpClient: upstream.Client(), + } + mux := http.NewServeMux() + s.RegisterRoutes(mux) + // Verify a sample of routes are registered. Unregistered patterns + // return 404 from the default mux; registered ones reach the upstream. 
+ routes := []struct { + method string + path string + }{ + {"GET", "/"}, + {"GET", "/resource_providers"}, + {"POST", "/resource_providers"}, + {"GET", "/traits"}, + {"GET", "/allocation_candidates"}, + {"POST", "/reshaper"}, + {"POST", "/allocations"}, + {"GET", "/usages"}, + } + for _, rt := range routes { + t.Run(rt.method+" "+rt.path, func(t *testing.T) { + req := httptest.NewRequest(rt.method, rt.path, http.NoBody) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code == http.StatusNotFound { + t.Fatalf("route %s %s returned 404, expected it to be registered", rt.method, rt.path) + } + }) + } +} diff --git a/internal/shim/placement/validation.go b/internal/shim/placement/validation.go index 55a46c406..b025cbfd7 100644 --- a/internal/shim/placement/validation.go +++ b/internal/shim/placement/validation.go @@ -16,7 +16,7 @@ import ( func requiredPathParam(w http.ResponseWriter, r *http.Request, name string) (string, bool) { v := r.PathValue(name) if v == "" { - http.Error(w, fmt.Sprintf("missing path parameter: %s", name), http.StatusBadRequest) + http.Error(w, "missing path parameter: "+name, http.StatusBadRequest) return "", false } return v, true diff --git a/internal/shim/placement/validation_test.go b/internal/shim/placement/validation_test.go new file mode 100644 index 000000000..b0b39c27e --- /dev/null +++ b/internal/shim/placement/validation_test.go @@ -0,0 +1,89 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package placement + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestRequiredPathParam(t *testing.T) { + t.Run("valid param", func(t *testing.T) { + mux := http.NewServeMux() + var gotValue string + var gotOK bool + mux.HandleFunc("GET /test/{name}", func(w http.ResponseWriter, r *http.Request) { + gotValue, gotOK = requiredPathParam(w, r, "name") + }) + req := httptest.NewRequest(http.MethodGet, "/test/VCPU", http.NoBody) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if !gotOK { + 
t.Fatal("expected ok = true") + } + if gotValue != "VCPU" { + t.Fatalf("value = %q, want %q", gotValue, "VCPU") + } + }) + t.Run("wrong param name returns empty", func(t *testing.T) { + mux := http.NewServeMux() + var gotOK bool + mux.HandleFunc("GET /test/{name}", func(w http.ResponseWriter, r *http.Request) { + _, gotOK = requiredPathParam(w, r, "nonexistent") + }) + req := httptest.NewRequest(http.MethodGet, "/test/VCPU", http.NoBody) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if gotOK { + t.Fatal("expected ok = false for wrong param name") + } + if w.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", w.Code, http.StatusBadRequest) + } + }) +} + +func TestRequiredUUIDPathParam(t *testing.T) { + tests := []struct { + name string + paramValue string + wantOK bool + wantCode int + }{ + { + name: "valid uuid", + paramValue: "d9b3a520-2a3c-4f6b-8b9a-1c2d3e4f5a6b", + wantOK: true, + }, + { + name: "invalid uuid", + paramValue: "not-a-uuid", + wantOK: false, + wantCode: http.StatusBadRequest, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mux := http.NewServeMux() + var gotValue string + var gotOK bool + mux.HandleFunc("GET /test/{uuid}", func(w http.ResponseWriter, r *http.Request) { + gotValue, gotOK = requiredUUIDPathParam(w, r, "uuid") + }) + req := httptest.NewRequest(http.MethodGet, "/test/"+tt.paramValue, http.NoBody) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if gotOK != tt.wantOK { + t.Fatalf("ok = %v, want %v", gotOK, tt.wantOK) + } + if tt.wantOK && gotValue != tt.paramValue { + t.Fatalf("value = %q, want %q", gotValue, tt.paramValue) + } + if !tt.wantOK && w.Code != tt.wantCode { + t.Fatalf("status = %d, want %d", w.Code, tt.wantCode) + } + }) + } +} From af9841cfd5074a1dc775563dc11db3b2036d445e Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 08:51:49 +0200 Subject: [PATCH 13/17] Fix SSRF --- internal/shim/placement/shim.go | 27 +++++++++++++++++++-------- 1 file 
changed, 19 insertions(+), 8 deletions(-) diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index 2623a3baa..cd709db9c 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -8,6 +8,7 @@ import ( "errors" "io" "net/http" + "net/url" "time" "github.com/cobaltcore-dev/cortex/pkg/conf" @@ -111,16 +112,26 @@ func (s *Shim) Start(ctx context.Context) (err error) { func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { log := logf.FromContext(r.Context()) - // Build upstream URL: config.PlacementURL + original path + query string. - upstreamURL := s.config.PlacementURL + r.URL.Path - if r.URL.RawQuery != "" { - upstreamURL += "?" + r.URL.RawQuery + // Parse the trusted base URL and resolve the request path against it + // so the upstream target is always anchored to the configured host. + upstream, err := url.Parse(s.config.PlacementURL) + if err != nil { + log.Error(err, "failed to parse placement URL", "url", s.config.PlacementURL) + http.Error(w, "failed to parse placement URL", http.StatusBadGateway) + return + } + upstream.Path, err = url.JoinPath(upstream.Path, r.URL.Path) + if err != nil { + log.Error(err, "failed to join upstream path", "path", r.URL.Path) + http.Error(w, "failed to join upstream path", http.StatusBadGateway) + return } + upstream.RawQuery = r.URL.RawQuery // Create upstream request preserving method, body, and context. - upstreamReq, err := http.NewRequestWithContext(r.Context(), r.Method, upstreamURL, r.Body) + upstreamReq, err := http.NewRequestWithContext(r.Context(), r.Method, upstream.String(), r.Body) if err != nil { - log.Error(err, "failed to create upstream request", "url", upstreamURL) + log.Error(err, "failed to create upstream request", "url", upstream.String()) http.Error(w, "failed to create upstream request", http.StatusBadGateway) return } @@ -128,9 +139,9 @@ func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { // Copy all incoming headers. 
upstreamReq.Header = r.Header.Clone() - resp, err := s.httpClient.Do(upstreamReq) + resp, err := s.httpClient.Do(upstreamReq) //nolint:gosec // G704: intentional reverse proxy; host is fixed by operator config, only path varies if err != nil { - log.Error(err, "failed to reach placement API", "url", upstreamURL) + log.Error(err, "failed to reach placement API", "url", upstream.String()) http.Error(w, "failed to reach placement API", http.StatusBadGateway) return } From 271a91045581e9cb986fd2e677956ca150a5572a Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 10:10:34 +0200 Subject: [PATCH 14/17] Add multicluster client setup --- cmd/shim/main.go | 25 +++- .../bundles/cortex-placement-shim/values.yaml | 5 + internal/shim/placement/shim.go | 119 +++++++++++------- pkg/multicluster/routers.go | 10 ++ 4 files changed, 111 insertions(+), 48 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 0a68c7298..5ebf902a7 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -15,6 +15,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/shim/placement" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "github.com/sapcc/go-bits/httpext" "k8s.io/apimachinery/pkg/runtime" @@ -22,6 +23,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/certwatcher" + "sigs.k8s.io/controller-runtime/pkg/cluster" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -197,7 +199,26 @@ func main() { os.Exit(1) } - // TODO: Initialize multicluster client here. 
+ homeCluster, err := cluster.New(restConfig, func(o *cluster.Options) { o.Scheme = scheme }) + if err != nil { + setupLog.Error(err, "unable to create home cluster") + os.Exit(1) + } + if err := mgr.Add(homeCluster); err != nil { + setupLog.Error(err, "unable to add home cluster") + os.Exit(1) + } + multiclusterClient := &multicluster.Client{ + HomeCluster: homeCluster, + HomeRestConfig: restConfig, + HomeScheme: scheme, + ResourceRouters: multicluster.DefaultResourceRouters, + } + multiclusterClientConfig := conf.GetConfigOrDie[multicluster.ClientConfig]() + if err := multiclusterClient.InitFromConf(ctx, mgr, multiclusterClientConfig); err != nil { + setupLog.Error(err, "unable to initialize multicluster client") + os.Exit(1) + } // Our custom monitoring registry can add prometheus labels to all metrics. // This is useful to distinguish metrics from different deployments. @@ -207,7 +228,7 @@ func main() { // API endpoint. mux := http.NewServeMux() if enablePlacementShim { - placementShim := &placement.Shim{Client: mgr.GetClient()} + placementShim := &placement.Shim{Client: multiclusterClient} setupLog.Info("Adding placement shim to manager") if err := placementShim.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to set up placement shim") diff --git a/helm/bundles/cortex-placement-shim/values.yaml b/helm/bundles/cortex-placement-shim/values.yaml index 2fa998d6b..2facf6848 100644 --- a/helm/bundles/cortex-placement-shim/values.yaml +++ b/helm/bundles/cortex-placement-shim/values.yaml @@ -24,6 +24,11 @@ cortex-shim: container: extraArgs: ["--placement-shim=true"] conf: + apiservers: + home: + gvks: + - kvm.cloud.sap/v1/Hypervisor + - kvm.cloud.sap/v1/HypervisorList monitoring: labels: github_org: cobaltcore-dev diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index cd709db9c..c7518aacb 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -12,11 +12,14 @@ import ( "time" 
"github.com/cobaltcore-dev/cortex/pkg/conf" + "github.com/cobaltcore-dev/cortex/pkg/multicluster" "github.com/cobaltcore-dev/cortex/pkg/sso" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" ) // IndexHypervisorByID is the field index key for looking up Hypervisor @@ -40,6 +43,15 @@ type config struct { PlacementURL string `json:"placementURL,omitempty"` } +// validate checks the config for required fields and returns an error if the +// config is invalid. +func (c *config) validate() error { + if c.PlacementURL == "" { + return errors.New("placement URL is required") + } + return nil +} + // Shim is the placement API shim. It holds a controller-runtime client for // making Kubernetes API calls and exposes HTTP handlers that mirror the // OpenStack Placement API surface. @@ -82,11 +94,6 @@ func (s *Shim) Start(ctx context.Context) (err error) { // Try establish a connection to the placement API to fail fast if the // configuration is invalid. Directly call the root endpoint for that. setupLog.Info("Testing connection to placement API", "url", s.config.PlacementURL) - if s.config.PlacementURL == "" { - err := errors.New("placement URL is not configured") - setupLog.Error(err, "Invalid configuration for placement shim") - return err - } req, err := http.NewRequestWithContext(ctx, http.MethodGet, s.config.PlacementURL, http.NoBody) if err != nil { setupLog.Error(err, "Failed to create HTTP request to placement API") @@ -107,6 +114,67 @@ func (s *Shim) Start(ctx context.Context) (err error) { return nil } +// Reconcile is not used by the shim, but must be implemented to satisfy the +// controller-runtime Reconciler interface. 
+func (s *Shim) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	return ctrl.Result{}, nil
+}
+
+// handleRemoteHypervisor returns the event handler invoked for hypervisor
+// events observed in remote clusters. It is currently a no-op handler.
+func (s *Shim) handleRemoteHypervisor() handler.EventHandler {
+	handler := handler.Funcs{}
+	// For now, the shim doesn't need to do anything on hypervisor events.
+	return handler
+}
+
+// predicateRemoteHypervisor filters events from remote clusters so that only
+// events for hypervisors relevant to the shim are processed.
+func (s *Shim) predicateRemoteHypervisor() predicate.Predicate {
+	// For now, the shim doesn't need to process any hypervisor events.
+	return predicate.NewPredicateFuncs(func(object client.Object) bool {
+		return false
+	})
+}
+
+// SetupWithManager loads and validates the shim configuration and registers
+// a controller on the manager that watches Hypervisor resources across all
+// remote clusters through the multicluster client. This must be called
+// before the manager is started.
+//
+// The shim does not currently react to individual hypervisor events (see
+// handleRemoteHypervisor and predicateRemoteHypervisor); the watch mainly
+// ensures the informer caches for hv1.Hypervisor are created and started
+// when mgr.Start() is called.
+func (s *Shim) SetupWithManager(ctx context.Context, mgr ctrl.Manager) (err error) {
+	setupLog.Info("Setting up placement shim with manager")
+	s.config, err = conf.GetConfig[config]()
+	if err != nil {
+		setupLog.Error(err, "Failed to load placement shim config")
+		return err
+	}
+	// Validate we don't have any weird values in the config.
+	if err := s.config.validate(); err != nil {
+		return err
+	}
+	// Check that the provided client is a multicluster client, since we need
+	// that to watch for hypervisors across clusters.
+ mcl, ok := s.Client.(*multicluster.Client) + if !ok { + return errors.New("provided client must be a multicluster client") + } + bldr := multicluster.BuildController(mcl, mgr) + // The hypervisor crd may be distributed across multiple remote clusters. + bldr, err = bldr.WatchesMulticluster(&hv1.Hypervisor{}, + s.handleRemoteHypervisor(), + s.predicateRemoteHypervisor(), + ) + if err != nil { + return err + } + return bldr.Named("placement-shim").Complete(s) +} + // forward proxies the incoming HTTP request to the upstream placement API // and copies the response (status, headers, body) back to the client. func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { @@ -159,47 +227,6 @@ func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { } } -// SetupWithManager registers field indexes on the manager's cache so that -// subsequent list calls are served from the informer cache rather than -// hitting the API server. This must be called before the manager is started. -// -// Calling IndexField internally invokes GetInformer, which creates and -// registers a shared informer for the indexed type (hv1.Hypervisor) with the -// cache. The informer is started later when mgr.Start() is called. This -// means no separate controller or empty Reconcile loop is needed — the -// index registration alone is sufficient to warm the cache. 
-func (s *Shim) SetupWithManager(ctx context.Context, mgr ctrl.Manager) (err error) { - setupLog.Info("Setting up placement shim with manager") - if err := mgr.Add(s); err != nil { // Bind Start(ctx) - setupLog.Error(err, "Failed to bind start routine") - return err - } - s.config, err = conf.GetConfig[config]() - if err != nil { - setupLog.Error(err, "Failed to load placement shim config") - return err - } - setupLog.Info("Indexing Hypervisors by hypervisor ID") - err = mgr.GetFieldIndexer().IndexField(ctx, &hv1.Hypervisor{}, IndexHypervisorByID, - func(obj client.Object) []string { - h, ok := obj.(*hv1.Hypervisor) - if !ok { - return nil - } - if h.Status.HypervisorID == "" { - return nil - } - return []string{h.Status.HypervisorID} - }, - ) - if err != nil { - setupLog.Error(err, "Failed to index Hypervisors by hypervisor ID") - return err - } - setupLog.Info("Successfully indexed Hypervisors by hypervisor ID") - return nil -} - // RegisterRoutes binds all Placement API handlers to the given mux. The // route patterns use the Go 1.22+ ServeMux syntax with explicit HTTP methods // and path wildcards. The routes mirror the OpenStack Placement API surface diff --git a/pkg/multicluster/routers.go b/pkg/multicluster/routers.go index 5eb693f2d..8c41e822a 100644 --- a/pkg/multicluster/routers.go +++ b/pkg/multicluster/routers.go @@ -9,8 +9,18 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime/schema" ) +// DefaultResourceRouters defines all mappings of GroupVersionKinds to RRs +// for the multicluster client that cortex supports by default. This is used to +// route resources to the correct cluster in a multicluster setup. 
+var DefaultResourceRouters = map[schema.GroupVersionKind]ResourceRouter{ + {Group: "kvm.cloud.sap", Version: "v1", Kind: "Hypervisor"}: HypervisorResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "Reservation"}: ReservationsResourceRouter{}, + {Group: "cortex.cloud", Version: "v1alpha1", Kind: "History"}: HistoryResourceRouter{}, +} + // ResourceRouter determines which remote cluster a resource should be written to // by matching the resource content against the cluster's labels. type ResourceRouter interface { From 2b91b6fe0bd2bdc013c7bbe31938ca8b492a5193 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 10:27:03 +0200 Subject: [PATCH 15/17] Remove unused IndexHypervisorByID [skip ci] --- internal/shim/placement/shim.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index c7518aacb..623f5ed1d 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -22,10 +22,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" ) -// IndexHypervisorByID is the field index key for looking up Hypervisor -// objects by their OpenStack hypervisor ID (status.hypervisorId). -const IndexHypervisorByID = ".status.hypervisorId" - var ( // setupLog is a controller-runtime logger used for setup and route // registration. Individual handlers should use their own loggers derived From 84c98ed1e62d57e06fcd3b0f936d2ff8322c9850 Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 12:51:44 +0200 Subject: [PATCH 16/17] PR Feedback --- cmd/shim/main.go | 9 +++++++-- internal/shim/placement/shim.go | 32 +++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index 5ebf902a7..afaafb86c 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -227,8 +227,9 @@ func main() { // API endpoint. 
mux := http.NewServeMux() + var placementShim *placement.Shim if enablePlacementShim { - placementShim := &placement.Shim{Client: multiclusterClient} + placementShim = &placement.Shim{Client: multiclusterClient} setupLog.Info("Adding placement shim to manager") if err := placementShim.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to set up placement shim") @@ -259,7 +260,11 @@ func main() { setupLog.Error(err, "unable to set up health check") os.Exit(1) } - if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + readyzCheck := healthz.Ping + if placementShim != nil { + readyzCheck = placementShim.ReadyzCheck() + } + if err := mgr.AddReadyzCheck("readyz", readyzCheck); err != nil { setupLog.Error(err, "unable to set up ready check") os.Exit(1) } diff --git a/internal/shim/placement/shim.go b/internal/shim/placement/shim.go index 623f5ed1d..87123ad22 100644 --- a/internal/shim/placement/shim.go +++ b/internal/shim/placement/shim.go @@ -7,8 +7,10 @@ import ( "context" "errors" "io" + "net" "net/http" "net/url" + "sync/atomic" "time" "github.com/cobaltcore-dev/cortex/pkg/conf" @@ -57,6 +59,10 @@ type Shim struct { // HTTP client that can talk to openstack placement, if needed, over // ingress with single-sign-on. httpClient *http.Client + // ready is set to true once Start() has completed successfully. It is + // used by the readiness check to prevent traffic from reaching the shim + // before the HTTP client and upstream connection are established. + ready atomic.Bool } // Start is called after the manager has started and the cache is running. @@ -82,11 +88,15 @@ func (s *Shim) Start(ctx context.Context) (err error) { transport.MaxIdleConns = 100 transport.MaxIdleConnsPerHost = 100 // Guard against a hung upstream or slow TLS negotiation. 
+ transport.DialContext = (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext transport.TLSHandshakeTimeout = 10 * time.Second transport.ResponseHeaderTimeout = 60 * time.Second transport.ExpectContinueTimeout = 1 * time.Second transport.IdleConnTimeout = 90 * time.Second - s.httpClient = &http.Client{Transport: transport} + s.httpClient = &http.Client{Transport: transport, Timeout: 60 * time.Second} // Try establish a connection to the placement API to fail fast if the // configuration is invalid. Directly call the root endpoint for that. setupLog.Info("Testing connection to placement API", "url", s.config.PlacementURL) @@ -107,9 +117,23 @@ func (s *Shim) Start(ctx context.Context) (err error) { return err } setupLog.Info("Successfully connected to placement API") + s.ready.Store(true) return nil } +// ReadyzCheck returns a healthz.Checker that reports healthy only after +// Start() has completed successfully. Wire this into the manager's readiness +// endpoint so that Kubernetes does not route traffic to the pod before the +// shim's HTTP client and upstream connection are established. +func (s *Shim) ReadyzCheck() func(*http.Request) error { + return func(_ *http.Request) error { + if !s.ready.Load() { + return errors.New("placement shim not yet initialized") + } + return nil + } +} + // Reconcile is not used by the shim, but must be implemented to satisfy the // controller-runtime Reconciler interface. 
func (s *Shim) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -176,6 +200,12 @@ func (s *Shim) SetupWithManager(ctx context.Context, mgr ctrl.Manager) (err erro func (s *Shim) forward(w http.ResponseWriter, r *http.Request) { log := logf.FromContext(r.Context()) + if s.httpClient == nil { + log.Info("placement shim not yet initialized, rejecting request") + http.Error(w, "service not ready", http.StatusServiceUnavailable) + return + } + // Parse the trusted base URL and resolve the request path against it // so the upstream target is always anchored to the configured host. upstream, err := url.Parse(s.config.PlacementURL) From 5bedfb5ba06785adaa6bd312239645100732004c Mon Sep 17 00:00:00 2001 From: Philipp Matthes Date: Fri, 10 Apr 2026 13:23:11 +0200 Subject: [PATCH 17/17] Add configurable api-bind-address flag to shim --- cmd/shim/main.go | 13 +++++++++++-- helm/library/cortex-shim/values.yaml | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/cmd/shim/main.go b/cmd/shim/main.go index afaafb86c..6fb951757 100644 --- a/cmd/shim/main.go +++ b/cmd/shim/main.go @@ -52,6 +52,7 @@ func main() { restConfig := ctrl.GetConfigOrDie() var metricsAddr string + var apiBindAddr string var metricsCertPath, metricsCertName, metricsCertKey string var webhookCertPath, webhookCertName, webhookCertKey string // The shim does not require leader election, but this flag is provided to @@ -64,6 +65,7 @@ func main() { var tlsOpts []func(*tls.Config) flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") + flag.StringVar(&apiBindAddr, "api-bind-address", ":8080", "The address the shim API server binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. "+ @@ -94,6 +96,13 @@ func main() { os.Exit(1) } + // Check that the metrics and API bind addresses don't overlap. + if metricsAddr != "0" && metricsAddr == apiBindAddr { + err := errors.New("metrics-bind-address and api-bind-address must not be the same") + setupLog.Error(err, "invalid configuration", "metrics-bind-address", metricsAddr, "api-bind-address", apiBindAddr) + os.Exit(1) + } + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) // if the enable-http2 flag is false (the default), http/2 should be disabled @@ -272,8 +281,8 @@ func main() { errchan := make(chan error) go func() { errchan <- func() error { - setupLog.Info("starting api server", "address", ":8080") - return httpext.ListenAndServeContext(ctx, ":8080", mux) + setupLog.Info("starting api server", "address", apiBindAddr) + return httpext.ListenAndServeContext(ctx, apiBindAddr, mux) }() }() go func() { diff --git a/helm/library/cortex-shim/values.yaml b/helm/library/cortex-shim/values.yaml index 3acead93b..91eaba11f 100644 --- a/helm/library/cortex-shim/values.yaml +++ b/helm/library/cortex-shim/values.yaml @@ -5,6 +5,7 @@ deployment: image: repository: ghcr.io/cobaltcore-dev/cortex-shim args: + - "--api-bind-address=:8080" - "--metrics-bind-address=:2112" - "--health-probe-bind-address=:8081" - "--metrics-secure=false"