From 70b8c4cbee91502ab9a50321183f19def87fa060 Mon Sep 17 00:00:00 2001
From: Richa Shalom Gadagotti <rgadag@amazon.com>
Date: Fri, 29 May 2026 22:00:25 +0000
Subject: [PATCH 1/2] [DPD] DPD CRD changes with version bump to v3.2

---
 helm_chart/HyperPodHelmChart/Chart.yaml       |   2 +-
 .../charts/inference-operator/Chart.yaml      |   4 +-
 ...s.amazon.com_inferenceendpointconfigs.yaml | 811 ++++++++++++++++++
 .../charts/inference-operator/values.yaml     |   2 +-
 4 files changed, 815 insertions(+), 4 deletions(-)

diff --git a/helm_chart/HyperPodHelmChart/Chart.yaml b/helm_chart/HyperPodHelmChart/Chart.yaml
index dc5876cd..ee51d40e 100644
--- a/helm_chart/HyperPodHelmChart/Chart.yaml
+++ b/helm_chart/HyperPodHelmChart/Chart.yaml
@@ -81,7 +81,7 @@ dependencies:
     repository: "file://charts/team-role-and-bindings"
     condition: team-role-and-bindings.enabled
   - name: hyperpod-inference-operator
-    version: "2.1.1"
+    version: "2.2.0"
     repository: "file://charts/inference-operator"
     condition: inferenceOperators.enabled 
   - name: hyperpod-patching
diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
index f4a92b05..aeeba18c 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
@@ -15,11 +15,11 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 2.1.1
+version: 2.2.0
 
 # This is the version number of the application being deployed. Keep this aligned 
 # with operator image MAJOR.MINOR version.
-appVersion: "3.1"
+appVersion: "3.2"
 
 dependencies:
 - name: aws-mountpoint-s3-csi-driver
diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml
index c5c6bd38..8b86aeb2 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml
@@ -4741,6 +4741,817 @@ spec:
                     type: object
                     x-kubernetes-map-type: atomic
                 type: object
+              pdSpec:
+                description: |-
+                  Configuration for disaggregated prefill and decode (DPD).
+                  Presence of pdSpec enables DPD mode, creating separate prefill and decode Deployments.
+                properties:
+                  autoScalingSpec:
+                    description: Autoscaling configuration for prefill and decode
+                      roles independently.
+                    properties:
+                      decodingAutoScaling:
+                        description: Autoscaling configuration for decode pods.
+                        properties:
+                          cloudWatchTrigger:
+                            description: CloudWatch metric trigger to use for autoscaling
+                            properties:
+                              activationTargetValue:
+                                default: 0
+                                description: Activation Value for CloudWatch metric
+                                  to scale from 0 to 1. Only applicable if minReplicaCount
+                                  = 0
+                                type: number
+                              dimensions:
+                                description: Dimensions for Cloudwatch metrics
+                                items:
+                                  properties:
+                                    name:
+                                      description: CloudWatch Metric dimension name
+                                      type: string
+                                    value:
+                                      description: CloudWatch Metric dimension value
+                                      type: string
+                                  required:
+                                  - name
+                                  - value
+                                  type: object
+                                type: array
+                              metricCollectionPeriod:
+                                default: 300
+                                description: Defines the Period for CloudWatch query
+                                format: int32
+                                type: integer
+                              metricCollectionStartTime:
+                                default: 300
+                                description: Defines the StartTime for CloudWatch
+                                  query
+                                format: int32
+                                type: integer
+                              metricName:
+                                description: Metric name to query for Cloudwatch trigger
+                                type: string
+                              metricStat:
+                                default: Average
+                                description: Statistics metric to be used by Trigger.
+                                  Used to define Stat for CloudWatch query. Default
+                                  is Average.
+                                type: string
+                              metricType:
+                                default: Average
+                                description: 'The type of metric to be used by HPA.
+                                  Enum: Average - Uses average value of metric
+                                  per pod, Value - Uses absolute metric value'
+                                enum:
+                                - Value
+                                - Average
+                                type: string
+                              minValue:
+                                default: 0
+                                description: Minimum metric value used in case of
+                                  empty response from CloudWatch. Default is 0.
+                                type: number
+                              name:
+                                description: Name for the CloudWatch trigger
+                                type: string
+                              namespace:
+                                description: AWS CloudWatch namespace for metric
+                                type: string
+                              targetValue:
+                                description: TargetValue for CloudWatch metric
+                                type: number
+                              useCachedMetrics:
+                                default: true
+                                description: Enable caching of metric values during
+                                  polling interval. Default is true
+                                type: boolean
+                            type: object
+                          cloudWatchTriggerList:
+                            description: Multiple CloudWatch metric triggers to use
+                              for autoscaling. Takes priority over CloudWatchTrigger
+                              if both are provided.
+                            items:
+                              properties:
+                                activationTargetValue:
+                                  default: 0
+                                  description: Activation Value for CloudWatch metric
+                                    to scale from 0 to 1. Only applicable if minReplicaCount
+                                    = 0
+                                  type: number
+                                dimensions:
+                                  description: Dimensions for Cloudwatch metrics
+                                  items:
+                                    properties:
+                                      name:
+                                        description: CloudWatch Metric dimension name
+                                        type: string
+                                      value:
+                                        description: CloudWatch Metric dimension value
+                                        type: string
+                                    required:
+                                    - name
+                                    - value
+                                    type: object
+                                  type: array
+                                metricCollectionPeriod:
+                                  default: 300
+                                  description: Defines the Period for CloudWatch query
+                                  format: int32
+                                  type: integer
+                                metricCollectionStartTime:
+                                  default: 300
+                                  description: Defines the StartTime for CloudWatch
+                                    query
+                                  format: int32
+                                  type: integer
+                                metricName:
+                                  description: Metric name to query for Cloudwatch
+                                    trigger
+                                  type: string
+                                metricStat:
+                                  default: Average
+                                  description: Statistics metric to be used by Trigger.
+                                    Used to define Stat for CloudWatch query. Default
+                                    is Average.
+                                  type: string
+                                metricType:
+                                  default: Average
+                                  description: 'The type of metric to be used by HPA.
+                                    Enum: Average - Uses average value of metric
+                                    per pod, Value - Uses absolute metric value'
+                                  enum:
+                                  - Value
+                                  - Average
+                                  type: string
+                                minValue:
+                                  default: 0
+                                  description: Minimum metric value used in case of
+                                    empty response from CloudWatch. Default is 0.
+                                  type: number
+                                name:
+                                  description: Name for the CloudWatch trigger
+                                  type: string
+                                namespace:
+                                  description: AWS CloudWatch namespace for metric
+                                  type: string
+                                targetValue:
+                                  description: TargetValue for CloudWatch metric
+                                  type: number
+                                useCachedMetrics:
+                                  default: true
+                                  description: Enable caching of metric values during
+                                    polling interval. Default is true
+                                  type: boolean
+                              type: object
+                            maxItems: 100
+                            type: array
+                          cooldownPeriod:
+                            default: 300
+                            description: The period to wait after the last trigger
+                              reported active before scaling the resource back to
+                              0. Default 300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          initialCooldownPeriod:
+                            default: 300
+                            description: The delay before the cooldownPeriod starts
+                              after the initial creation of the ScaledObject. Default
+                              300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          maxReplicaCount:
+                            default: 5
+                            description: The maximum number of model pods to scale
+                              to. Default 5.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          minReplicaCount:
+                            default: 1
+                            description: The minimum number of model pods to scale
+                              down to. Default 1.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          pollingInterval:
+                            default: 30
+                            description: This is the interval to check each trigger
+                              on. Default 30 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          prometheusTrigger:
+                            description: Prometheus metric trigger to use for autoscaling
+                            properties:
+                              activationTargetValue:
+                                default: 0
+                                description: Activation Value for Prometheus metric
+                                  to scale from 0 to 1. Only applicable if minReplicaCount
+                                  = 0
+                                type: number
+                              customHeaders:
+                                description: Custom headers to include while querying
+                                  the prometheus endpoint.
+                                type: string
+                              metricType:
+                                default: Average
+                                description: 'The type of metric to be used by HPA.
+                                  Enum: Average - Uses average value of metric
+                                  per pod, Value - Uses absolute metric value'
+                                enum:
+                                - Value
+                                - Average
+                                type: string
+                              name:
+                                description: Name for the Prometheus trigger
+                                type: string
+                              namespace:
+                                description: Namespace for namespaced queries
+                                type: string
+                              query:
+                                description: PromQLQuery for the metric.
+                                type: string
+                              serverAddress:
+                                description: Server address for AMP workspace
+                                pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$
+                                type: string
+                              targetValue:
+                                description: Target metric value for scaling
+                                type: number
+                              useCachedMetrics:
+                                default: true
+                                description: Enable caching of metric values during
+                                  polling interval. Default is true
+                                type: boolean
+                            type: object
+                          prometheusTriggerList:
+                            description: Multiple Prometheus metric triggers to use
+                              for autoscaling. Takes priority over PrometheusTrigger
+                              if both are provided.
+                            items:
+                              properties:
+                                activationTargetValue:
+                                  default: 0
+                                  description: Activation Value for Prometheus metric
+                                    to scale from 0 to 1. Only applicable if minReplicaCount
+                                    = 0
+                                  type: number
+                                customHeaders:
+                                  description: Custom headers to include while querying
+                                    the prometheus endpoint.
+                                  type: string
+                                metricType:
+                                  default: Average
+                                  description: 'The type of metric to be used by HPA.
+                                    Enum: Average - Uses average value of metric
+                                    per pod, Value - Uses absolute metric value'
+                                  enum:
+                                  - Value
+                                  - Average
+                                  type: string
+                                name:
+                                  description: Name for the Prometheus trigger
+                                  type: string
+                                namespace:
+                                  description: Namespace for namespaced queries
+                                  type: string
+                                query:
+                                  description: PromQLQuery for the metric.
+                                  type: string
+                                serverAddress:
+                                  description: Server address for AMP workspace
+                                  pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$
+                                  type: string
+                                targetValue:
+                                  description: Target metric value for scaling
+                                  type: number
+                                useCachedMetrics:
+                                  default: true
+                                  description: Enable caching of metric values during
+                                    polling interval. Default is true
+                                  type: boolean
+                              type: object
+                            maxItems: 100
+                            type: array
+                          scaleDownStabilizationTime:
+                            default: 300
+                            description: The time window to stabilize for HPA before
+                              scaling down. Default 300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          scaleUpStabilizationTime:
+                            default: 0
+                            description: The time window to stabilize for HPA before
+                              scaling up. Default 0 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                        type: object
+                      prefillAutoScaling:
+                        description: Autoscaling configuration for prefill pods.
+                        properties:
+                          cloudWatchTrigger:
+                            description: CloudWatch metric trigger to use for autoscaling
+                            properties:
+                              activationTargetValue:
+                                default: 0
+                                description: Activation Value for CloudWatch metric
+                                  to scale from 0 to 1. Only applicable if minReplicaCount
+                                  = 0
+                                type: number
+                              dimensions:
+                                description: Dimensions for Cloudwatch metrics
+                                items:
+                                  properties:
+                                    name:
+                                      description: CloudWatch Metric dimension name
+                                      type: string
+                                    value:
+                                      description: CloudWatch Metric dimension value
+                                      type: string
+                                  required:
+                                  - name
+                                  - value
+                                  type: object
+                                type: array
+                              metricCollectionPeriod:
+                                default: 300
+                                description: Defines the Period for CloudWatch query
+                                format: int32
+                                type: integer
+                              metricCollectionStartTime:
+                                default: 300
+                                description: Defines the StartTime for CloudWatch
+                                  query
+                                format: int32
+                                type: integer
+                              metricName:
+                                description: Metric name to query for Cloudwatch trigger
+                                type: string
+                              metricStat:
+                                default: Average
+                                description: Statistics metric to be used by Trigger.
+                                  Used to define Stat for CloudWatch query. Default
+                                  is Average.
+                                type: string
+                              metricType:
+                                default: Average
+                                description: 'The type of metric to be used by HPA.
+                                  Enum: Average - Uses average value of metric
+                                  per pod, Value - Uses absolute metric value'
+                                enum:
+                                - Value
+                                - Average
+                                type: string
+                              minValue:
+                                default: 0
+                                description: Minimum metric value used in case of
+                                  empty response from CloudWatch. Default is 0.
+                                type: number
+                              name:
+                                description: Name for the CloudWatch trigger
+                                type: string
+                              namespace:
+                                description: AWS CloudWatch namespace for metric
+                                type: string
+                              targetValue:
+                                description: TargetValue for CloudWatch metric
+                                type: number
+                              useCachedMetrics:
+                                default: true
+                                description: Enable caching of metric values during
+                                  polling interval. Default is true
+                                type: boolean
+                            type: object
+                          cloudWatchTriggerList:
+                            description: Multiple CloudWatch metric triggers to use
+                              for autoscaling. Takes priority over CloudWatchTrigger
+                              if both are provided.
+                            items:
+                              properties:
+                                activationTargetValue:
+                                  default: 0
+                                  description: Activation Value for CloudWatch metric
+                                    to scale from 0 to 1. Only applicable if minReplicaCount
+                                    = 0
+                                  type: number
+                                dimensions:
+                                  description: Dimensions for Cloudwatch metrics
+                                  items:
+                                    properties:
+                                      name:
+                                        description: CloudWatch Metric dimension name
+                                        type: string
+                                      value:
+                                        description: CloudWatch Metric dimension value
+                                        type: string
+                                    required:
+                                    - name
+                                    - value
+                                    type: object
+                                  type: array
+                                metricCollectionPeriod:
+                                  default: 300
+                                  description: Defines the Period for CloudWatch query
+                                  format: int32
+                                  type: integer
+                                metricCollectionStartTime:
+                                  default: 300
+                                  description: Defines the StartTime for CloudWatch
+                                    query
+                                  format: int32
+                                  type: integer
+                                metricName:
+                                  description: Metric name to query for Cloudwatch
+                                    trigger
+                                  type: string
+                                metricStat:
+                                  default: Average
+                                  description: Statistics metric to be used by Trigger.
+                                    Used to define Stat for CloudWatch query. Default
+                                    is Average.
+                                  type: string
+                                metricType:
+                                  default: Average
+                                  description: 'The type of metric to be used by HPA.
+                                    Enum: Average - Uses average value of metric
+                                    per pod, Value - Uses absolute metric value'
+                                  enum:
+                                  - Value
+                                  - Average
+                                  type: string
+                                minValue:
+                                  default: 0
+                                  description: Minimum metric value used in case of
+                                    empty response from CloudWatch. Default is 0.
+                                  type: number
+                                name:
+                                  description: Name for the CloudWatch trigger
+                                  type: string
+                                namespace:
+                                  description: AWS CloudWatch namespace for metric
+                                  type: string
+                                targetValue:
+                                  description: TargetValue for CloudWatch metric
+                                  type: number
+                                useCachedMetrics:
+                                  default: true
+                                  description: Enable caching of metric values during
+                                    polling interval. Default is true
+                                  type: boolean
+                              type: object
+                            maxItems: 100
+                            type: array
+                          cooldownPeriod:
+                            default: 300
+                            description: The period to wait after the last trigger
+                              reported active before scaling the resource back to
+                              0. Default 300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          initialCooldownPeriod:
+                            default: 300
+                            description: The delay before the cooldownPeriod starts
+                              after the initial creation of the ScaledObject. Default
+                              300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          maxReplicaCount:
+                            default: 5
+                            description: The maximum number of model pods to scale
+                              to. Default 5.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          minReplicaCount:
+                            default: 1
+                            description: The minimum number of model pods to scale
+                              down to. Default 1.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          pollingInterval:
+                            default: 30
+                            description: This is the interval to check each trigger
+                              on. Default 30 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          prometheusTrigger:
+                            description: Prometheus metric trigger to use for autoscaling
+                            properties:
+                              activationTargetValue:
+                                default: 0
+                                description: Activation Value for Prometheus metric
+                                  to scale from 0 to 1. Only applicable if minReplicaCount
+                                  = 0
+                                type: number
+                              customHeaders:
+                                description: Custom headers to include while querying
+                                  the prometheus endpoint.
+                                type: string
+                              metricType:
+                                default: Average
+                                description: 'The type of metric to be used by HPA.
+                                  Enum: Average - Uses average value of metric
+                                  per pod, Value - Uses absolute metric value'
+                                enum:
+                                - Value
+                                - Average
+                                type: string
+                              name:
+                                description: Name for the Prometheus trigger
+                                type: string
+                              namespace:
+                                description: Namespace for namespaced queries
+                                type: string
+                              query:
+                                description: PromQLQuery for the metric.
+                                type: string
+                              serverAddress:
+                                description: Server address for AMP workspace
+                                pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$
+                                type: string
+                              targetValue:
+                                description: Target metric value for scaling
+                                type: number
+                              useCachedMetrics:
+                                default: true
+                                description: Enable caching of metric values during
+                                  polling interval. Default is true
+                                type: boolean
+                            type: object
+                          prometheusTriggerList:
+                            description: Multiple Prometheus metric triggers to use
+                              for autoscaling. Takes priority over PrometheusTrigger
+                              if both are provided.
+                            items:
+                              properties:
+                                activationTargetValue:
+                                  default: 0
+                                  description: Activation Value for Prometheus metric
+                                    to scale from 0 to 1. Only applicable if minReplicaCount
+                                    = 0
+                                  type: number
+                                customHeaders:
+                                  description: Custom headers to include while querying
+                                    the prometheus endpoint.
+                                  type: string
+                                metricType:
+                                  default: Average
+                                  description: 'The type of metric to be used by HPA.
+                                    Enum: Average - Uses average value of metric
+                                    per pod, Value - Uses absolute metric value'
+                                  enum:
+                                  - Value
+                                  - Average
+                                  type: string
+                                name:
+                                  description: Name for the Prometheus trigger
+                                  type: string
+                                namespace:
+                                  description: Namespace for namespaced queries
+                                  type: string
+                                query:
+                                  description: PromQLQuery for the metric.
+                                  type: string
+                                serverAddress:
+                                  description: Server address for AMP workspace
+                                  pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+$|^$
+                                  type: string
+                                targetValue:
+                                  description: Target metric value for scaling
+                                  type: number
+                                useCachedMetrics:
+                                  default: true
+                                  description: Enable caching of metric values during
+                                    polling interval. Default is true
+                                  type: boolean
+                              type: object
+                            maxItems: 100
+                            type: array
+                          scaleDownStabilizationTime:
+                            default: 300
+                            description: The time window to stabilize for HPA before
+                              scaling down. Default 300 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          scaleUpStabilizationTime:
+                            default: 0
+                            description: The time window to stabilize for HPA before
+                              scaling up. Default 0 seconds.
+                            format: int32
+                            minimum: 0
+                            type: integer
+                        type: object
+                    type: object
+                  decodingSpec:
+                    description: Configuration for decode pods.
+                    properties:
+                      args:
+                        description: |-
+                          Additional vLLM args for this role (e.g., --tensor-parallel-size, --gpu-memory-utilization, --max-num-seqs).
+                          These are appended after the shared workerConfig.args and override any matching flags.
+                        items:
+                          type: string
+                        type: array
+                      nodeSelector:
+                        additionalProperties:
+                          type: string
+                        description: Node selector for scheduling pods of this role
+                          onto specific nodes.
+                        type: object
+                      replicas:
+                        default: 1
+                        description: Number of replicas for this role.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      resources:
+                        description: |-
+                          Resource requests and limits for pods of this role.
+                          Must include GPU resource requests for DPD to function.
+                        properties:
+                          claims:
+                            description: |-
+                              Claims lists the names of resources, defined in spec.resourceClaims,
+                              that are used by this container.
+
+                              This field depends on the
+                              DynamicResourceAllocation feature gate.
+
+                              This field is immutable. It can only be set for containers.
+                            items:
+                              description: ResourceClaim references one entry in PodSpec.ResourceClaims.
+                              properties:
+                                name:
+                                  description: |-
+                                    Name must match the name of one entry in pod.spec.resourceClaims of
+                                    the Pod where this field is used. It makes that resource available
+                                    inside a container.
+                                  type: string
+                                request:
+                                  description: |-
+                                    Request is the name chosen for a request in the referenced claim.
+                                    If empty, everything from the claim is made available, otherwise
+                                    only the result of this request.
+                                  type: string
+                              required:
+                              - name
+                              type: object
+                            type: array
+                            x-kubernetes-list-map-keys:
+                            - name
+                            x-kubernetes-list-type: map
+                          limits:
+                            additionalProperties:
+                              anyOf:
+                              - type: integer
+                              - type: string
+                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                              x-kubernetes-int-or-string: true
+                            description: |-
+                              Limits describes the maximum amount of compute resources allowed.
+                              More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                            type: object
+                          requests:
+                            additionalProperties:
+                              anyOf:
+                              - type: integer
+                              - type: string
+                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                              x-kubernetes-int-or-string: true
+                            description: |-
+                              Requests describes the minimum amount of compute resources required.
+                              If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+                              otherwise to an implementation-defined value. Requests cannot exceed Limits.
+                              More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                            type: object
+                        type: object
+                    type: object
+                  prefillSpec:
+                    description: Configuration for prefill pods.
+                    properties:
+                      args:
+                        description: |-
+                          Additional vLLM args for this role (e.g., --tensor-parallel-size, --gpu-memory-utilization, --max-num-seqs).
+                          These are appended after the shared workerConfig.args and override any matching flags.
+                        items:
+                          type: string
+                        type: array
+                      nodeSelector:
+                        additionalProperties:
+                          type: string
+                        description: Node selector for scheduling pods of this role
+                          onto specific nodes.
+                        type: object
+                      replicas:
+                        default: 1
+                        description: Number of replicas for this role.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      resources:
+                        description: |-
+                          Resource requests and limits for pods of this role.
+                          Must include GPU resource requests for DPD to function.
+                        properties:
+                          claims:
+                            description: |-
+                              Claims lists the names of resources, defined in spec.resourceClaims,
+                              that are used by this container.
+
+                              This field depends on the
+                              DynamicResourceAllocation feature gate.
+
+                              This field is immutable. It can only be set for containers.
+                            items:
+                              description: ResourceClaim references one entry in PodSpec.ResourceClaims.
+                              properties:
+                                name:
+                                  description: |-
+                                    Name must match the name of one entry in pod.spec.resourceClaims of
+                                    the Pod where this field is used. It makes that resource available
+                                    inside a container.
+                                  type: string
+                                request:
+                                  description: |-
+                                    Request is the name chosen for a request in the referenced claim.
+                                    If empty, everything from the claim is made available, otherwise
+                                    only the result of this request.
+                                  type: string
+                              required:
+                              - name
+                              type: object
+                            type: array
+                            x-kubernetes-list-map-keys:
+                            - name
+                            x-kubernetes-list-type: map
+                          limits:
+                            additionalProperties:
+                              anyOf:
+                              - type: integer
+                              - type: string
+                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                              x-kubernetes-int-or-string: true
+                            description: |-
+                              Limits describes the maximum amount of compute resources allowed.
+                              More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                            type: object
+                          requests:
+                            additionalProperties:
+                              anyOf:
+                              - type: integer
+                              - type: string
+                              pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                              x-kubernetes-int-or-string: true
+                            description: |-
+                              Requests describes the minimum amount of compute resources required.
+                              If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
+                              otherwise to an implementation-defined value. Requests cannot exceed Limits.
+                              More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+                            type: object
+                        type: object
+                    type: object
+                  routingThreshold:
+                    default: 4096
+                    description: |-
+                      Token count threshold for conditional routing.
+                      Requests with estimated input tokens >= threshold go through DPD path (remote prefill).
+                      Requests below threshold go directly to decoder for local chunked prefill.
+                      Default 4096. Set to 0 to always disaggregate.
+                    format: int32
+                    minimum: 0
+                    type: integer
+                  topologySpec:
+                    description: Topology constraints for prefill and decode pod scheduling.
+                    properties:
+                      availabilityZone:
+                        description: |-
+                          Force pods into a specific availability zone. If empty, any AZ is allowed
+                          (subject to sameAZ constraint).
+                        type: string
+                      placementGroup:
+                        description: Placement group name for lowest network latency
+                          between prefill and decode pods.
+                        type: string
+                      sameAZ:
+                        default: true
+                        description: |-
+                          Enforce same availability zone placement for prefill and decode pods.
+                          Required for optimal EFA latency. Default: true.
+                        type: boolean
+                    type: object
+                required:
+                - decodingSpec
+                - prefillSpec
+                type: object
               replicas:
                 default: 1
                 description: The desired number of inference server replicas. Default
diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
index 075a6df0..b193292b 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
@@ -24,7 +24,7 @@ image:
     ap-southeast-4: 311141544681.dkr.ecr.ap-southeast-4.amazonaws.com
     ap-southeast-3: 158128612970.dkr.ecr.ap-southeast-3.amazonaws.com
     eu-south-2: 025050981094.dkr.ecr.eu-south-2.amazonaws.com
-  tag: v3.1
+  tag: v3.2  # keep MAJOR.MINOR aligned with appVersion in this chart's Chart.yaml
   pullPolicy: Always
   repository:
   initContainer:

From 580c833a3daff5be0131a64a2b35b99376973996 Mon Sep 17 00:00:00 2001
From: Richa Shalom Gadagotti <rgadag@amazon.com>
Date: Fri, 5 Jun 2026 17:27:16 +0000
Subject: [PATCH 2/2] Pin operator to amd64 nodes via nodeAffinity and version
 bump to 2.2.1

---
 .../charts/inference-operator/Chart.yaml      |  2 +-
 .../config/manager/manager.yaml               | 32 +++++++------------
 .../charts/inference-operator/values.yaml     | 21 ++++++++++++
 3 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
index aeeba18c..7d1225bc 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 2.2.0
+version: 2.2.1  # must match the dependency version in the parent Chart.yaml (last seen "2.2.0" — TODO confirm bump)
 
 # This is the version number of the application being deployed. Keep this aligned 
 # with operator image MAJOR.MINOR version.
diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml
index 407f067b..f13bc4b4 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml
@@ -19,26 +19,18 @@ spec:
       labels:
         control-plane: {{ .Values.namePrefix }}-controller-manager
     spec:
-      # TODO(user): Uncomment the following code to configure the nodeAffinity expression
-      # according to the platforms which are supported by your solution.
-      # It is considered best practice to support multiple architectures. You can
-      # build your manager image using the makefile target docker-buildx.
-      # affinity:
-      #   nodeAffinity:
-      #     requiredDuringSchedulingIgnoredDuringExecution:
-      #       nodeSelectorTerms:
-      #         - matchExpressions:
-      #           - key: kubernetes.io/arch
-      #             operator: In
-      #             values:
-      #               - amd64
-      #               - arm64
-      #               - ppc64le
-      #               - s390x
-      #           - key: kubernetes.io/os
-      #             operator: In
-      #             values:
-      #               - linux
+      {{- if .Values.affinity }}
+      affinity:
+        {{- toYaml .Values.affinity | nindent 8 }}
+      {{- end }}
+      {{- if .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml .Values.nodeSelector | nindent 8 }}
+      {{- end }}
+      {{- if .Values.tolerations }}
+      tolerations:
+        {{- toYaml .Values.tolerations | nindent 8 }}
+      {{- end }}
       securityContext:
         runAsNonRoot: true
         # TODO(user): For common cases that do not require escalating privileges
diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
index b193292b..53fac781 100644
--- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml
@@ -38,6 +38,27 @@ tlsCertificateS3Bucket:
 enableWebhooks: true
 enableCustomServiceAccounts: false
 
+# Scheduling constraints for the operator Deployment (rendered by config/manager/manager.yaml).
+# The default affinity pins the pod to amd64 Linux nodes since operator images are amd64-only.
+# Override via EKS AddOn configurationValues if needed.
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: kubernetes.io/arch
+              operator: In
+              values:
+                - amd64
+            - key: kubernetes.io/os
+              operator: In
+              values:
+                - linux
+
+nodeSelector: {}
+
+tolerations: []
+
 s3:
   enabled: true
   # IAM role ARN used for S3 CSI driver k8s service account