From 70b8c4cbee91502ab9a50321183f19def87fa060 Mon Sep 17 00:00:00 2001 From: Richa Shalom Gadagotti Date: Fri, 29 May 2026 22:00:25 +0000 Subject: [PATCH 1/2] [DPD] DPD CRD changes with version bump to v3.2 --- helm_chart/HyperPodHelmChart/Chart.yaml | 2 +- .../charts/inference-operator/Chart.yaml | 4 +- ...s.amazon.com_inferenceendpointconfigs.yaml | 811 ++++++++++++++++++ .../charts/inference-operator/values.yaml | 2 +- 4 files changed, 815 insertions(+), 4 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/Chart.yaml b/helm_chart/HyperPodHelmChart/Chart.yaml index dc5876cd..ee51d40e 100644 --- a/helm_chart/HyperPodHelmChart/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/Chart.yaml @@ -81,7 +81,7 @@ dependencies: repository: "file://charts/team-role-and-bindings" condition: team-role-and-bindings.enabled - name: hyperpod-inference-operator - version: "2.1.1" + version: "2.2.0" repository: "file://charts/inference-operator" condition: inferenceOperators.enabled - name: hyperpod-patching diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml index f4a92b05..aeeba18c 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml @@ -15,11 +15,11 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 2.1.1 +version: 2.2.0 # This is the version number of the application being deployed. Keep this aligned # with operator image MAJOR.MINOR version. -appVersion: "3.1" +appVersion: "3.2" dependencies: - name: aws-mountpoint-s3-csi-driver diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml index c5c6bd38..8b86aeb2 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml @@ -4741,6 +4741,817 @@ spec: type: object x-kubernetes-map-type: atomic type: object + pdSpec: + description: |- + Configuration for disaggregated prefill and decode (DPD). + Presence of pdSpec enables DPD mode, creating separate prefill and decode Deployments. + properties: + autoScalingSpec: + description: Autoscaling configuration for prefill and decode + roles independently. + properties: + decodingAutoScaling: + description: Autoscaling configuration for decode pods. + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch + query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of + empty response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use + for autoscaling. Takes priority over CloudWatchTrigger + if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch + query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch + trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of + empty response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger + reported active before scaling the resource back to + 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts + after the initial creation of the ScaledObject. Default + 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale + to. Default 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale + down to. Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger + on. Default 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use + for autoscaling. Takes priority over PrometheusTrigger + if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before + scaling down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before + scaling up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + prefillAutoScaling: + description: Autoscaling configuration for prefill pods. + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch + query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of + empty response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use + for autoscaling. Takes priority over CloudWatchTrigger + if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch + query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch + trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of + empty response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger + reported active before scaling the resource back to + 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts + after the initial creation of the ScaledObject. Default + 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale + to. Default 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale + down to. Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger + on. Default 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use + for autoscaling. Takes priority over PrometheusTrigger + if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before + scaling down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before + scaling up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + type: object + decodingSpec: + description: Configuration for decode pods. + properties: + args: + description: |- + Additional vLLM args for this role (e.g., --tensor-parallel-size, --gpu-memory-utilization, --max-num-seqs). + These are appended after the shared workerConfig.args and override any matching flags. + items: + type: string + type: array + nodeSelector: + additionalProperties: + type: string + description: Node selector for scheduling pods of this role + onto specific nodes. + type: object + replicas: + default: 1 + description: Number of replicas for this role. + format: int32 + minimum: 1 + type: integer + resources: + description: |- + Resource requests and limits for pods of this role. + Must include GPU resource requests for DPD to function. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + prefillSpec: + description: Configuration for prefill pods. + properties: + args: + description: |- + Additional vLLM args for this role (e.g., --tensor-parallel-size, --gpu-memory-utilization, --max-num-seqs). + These are appended after the shared workerConfig.args and override any matching flags. + items: + type: string + type: array + nodeSelector: + additionalProperties: + type: string + description: Node selector for scheduling pods of this role + onto specific nodes. + type: object + replicas: + default: 1 + description: Number of replicas for this role. + format: int32 + minimum: 1 + type: integer + resources: + description: |- + Resource requests and limits for pods of this role. + Must include GPU resource requests for DPD to function. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object + routingThreshold: + default: 4096 + description: |- + Token count threshold for conditional routing. + Requests with estimated input tokens >= threshold go through DPD path (remote prefill). + Requests below threshold go directly to decoder for local chunked prefill. + Default 4096. Set to 0 to always disaggregate. + format: int32 + minimum: 0 + type: integer + topologySpec: + description: Topology constraints for prefill and decode pod scheduling. + properties: + availabilityZone: + description: |- + Force pods into a specific availability zone. If empty, any AZ is allowed + (subject to sameAZ constraint). + type: string + placementGroup: + description: Placement group name for lowest network latency + between prefill and decode pods. + type: string + sameAZ: + default: true + description: |- + Enforce same availability zone placement for prefill and decode pods. + Required for optimal EFA latency. Default: true. + type: boolean + type: object + required: + - decodingSpec + - prefillSpec + type: object replicas: default: 1 description: The desired number of inference server replicas. Default diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml index 075a6df0..b193292b 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml @@ -24,7 +24,7 @@ image: ap-southeast-4: 311141544681.dkr.ecr.ap-southeast-4.amazonaws.com ap-southeast-3: 158128612970.dkr.ecr.ap-southeast-3.amazonaws.com eu-south-2: 025050981094.dkr.ecr.eu-south-2.amazonaws.com - tag: v3.1 + tag: v3.2 pullPolicy: Always repository: initContainer: From 580c833a3daff5be0131a64a2b35b99376973996 Mon Sep 17 00:00:00 2001 From: Richa Shalom Gadagotti Date: Fri, 5 Jun 2026 17:27:16 +0000 Subject: [PATCH 2/2] Pin operator to amd64 nodes via nodeAffinity and version bump to 2.2.1 --- .../charts/inference-operator/Chart.yaml | 2 +- .../config/manager/manager.yaml | 32 +++++++------------ .../charts/inference-operator/values.yaml | 21 ++++++++++++ 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml index aeeba18c..7d1225bc 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 2.2.0 +version: 2.2.1 # This is the version number of the application being deployed. Keep this aligned # with operator image MAJOR.MINOR version. diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml index 407f067b..f13bc4b4 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml @@ -19,26 +19,18 @@ spec: labels: control-plane: {{ .Values.namePrefix }}-controller-manager spec: - # TODO(user): Uncomment the following code to configure the nodeAffinity expression - # according to the platforms which are supported by your solution. - # It is considered best practice to support multiple architectures. You can - # build your manager image using the makefile target docker-buildx. - # affinity: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/arch - # operator: In - # values: - # - amd64 - # - arm64 - # - ppc64le - # - s390x - # - key: kubernetes.io/os - # operator: In - # values: - # - linux + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} securityContext: runAsNonRoot: true # TODO(user): For common cases that do not require escalating privileges diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml index b193292b..53fac781 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml @@ -38,6 +38,27 @@ tlsCertificateS3Bucket: enableWebhooks: true enableCustomServiceAccounts: false +# Architecture-aware scheduling for the operator deployment. +# Pins to amd64 Linux nodes since operator images are amd64-only. +# Override via EKS AddOn configurationValues if needed. +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: kubernetes.io/os + operator: In + values: + - linux + +nodeSelector: {} + +tolerations: [] + s3: enabled: true # IAM role ARN used for S3 CSI driver k8s service account