From e3b0c71b46c6111a03e2a407ee9ea0c601fef170 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 15 May 2026 03:45:15 +0000
Subject: [PATCH] =?UTF-8?q?Add=202-GPU=20runners:=20arm64=20l4=C3=972=20ni?=
 =?UTF-8?q?ghtly=20+=20Windows=20amd64=20special=20runners?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arm64 l4×2 runners are restricted to nightly-only use per the runner
team (ARM64 L4 capacity concerns). Add them as nightly-standard entries
in ci-nightly.yml so they run the standard test suite against wheels
from the latest successful main CI run.

Windows amd64 2-GPU runners (t4×2 TCC, h100×2 MCDM) are added as
special runners in the regular PR CI matrix, mirroring the existing
Linux amd64 2-GPU special runners.

Also update the Windows test job name to show GPU count (x2) for
multi-GPU entries, matching the Linux job name format.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci-nightly.yml         | 30 +++++++++++++++++++++---
 .github/workflows/test-wheel-windows.yml |  2 +-
 ci/test-matrix.yml                       |  6 +++++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-nightly.yml b/.github/workflows/ci-nightly.yml
index 180351d45ee..0aa4bfc3d48 100644
--- a/.github/workflows/ci-nightly.yml
+++ b/.github/workflows/ci-nightly.yml
@@ -3,10 +3,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Nightly CI pipeline that tests optional dependencies (PyTorch, numba-cuda)
-# against the latest cuda-python wheels built on main.
+# against the latest cuda-python wheels built on main, and runs the standard
+# test suite on runners reserved for nightly-only use (e.g. arm64 l4×2).
 #
 # This workflow does NOT build wheels — it downloads them from the latest
-# successful CI run on main and runs integration tests with optional deps.
+# successful CI run on main, then runs optional-deps integration and standard tests.
 
 name: "CI: Nightly optional-deps"
 
@@ -191,6 +192,26 @@ jobs:
       test-mode: nightly-numba-cuda
       matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
 
+  # ── Standard tests on nightly-only runners ──
+
+  test-standard-linux-aarch64:  # standard suite for matrix rows tagged MODE 'nightly-standard'
+    name: "Nightly standard (linux-aarch64)"
+    if: ${{ github.repository_owner == 'nvidia' }}  # job is skipped when the repo owner differs
+    needs: find-wheels  # provides the CUDA_BUILD_VER, RUN_ID and HEAD_SHA outputs used below
+    permissions:
+      contents: read
+      actions: read  # NOTE(review): presumably for fetching artifacts of run-id; confirm
+    secrets: inherit
+    uses: ./.github/workflows/test-wheel-linux.yml  # reusable workflow; inputs passed via `with`
+    with:
+      build-type: nightly
+      host-platform: linux-aarch64
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      test-mode: standard  # not one of the nightly-* optional-deps test modes
+      matrix_filter: 'map(select(.MODE == "nightly-standard"))'  # keep only nightly-standard rows
+
   # ── Status check ──
 
   checks:
@@ -205,6 +226,7 @@ jobs:
       - test-numba-cuda-linux-64
       - test-numba-cuda-linux-aarch64
       - test-numba-cuda-windows
+      - test-standard-linux-aarch64
     steps:
       - name: Exit
         run: |
@@ -227,7 +249,9 @@ jobs:
                  needs.test-numba-cuda-linux-aarch64.result == 'cancelled' ||
                  needs.test-numba-cuda-linux-aarch64.result == 'failure' ||
                  needs.test-numba-cuda-windows.result == 'cancelled' ||
-                 needs.test-numba-cuda-windows.result == 'failure' }}; then
+                 needs.test-numba-cuda-windows.result == 'failure' ||
+                 needs.test-standard-linux-aarch64.result == 'cancelled' ||
+                 needs.test-standard-linux-aarch64.result == 'failure' }}; then
             exit 1
           fi
           exit 0
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 328c0910677..04b77b27b2c 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -87,7 +87,7 @@ jobs:
           echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"
 
   test:
-    name: Python ${{ matrix.PY_VER }}, CUDA ${{ matrix.CUDA_VER }} (${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}), GPU ${{ matrix.GPU }} (${{ matrix.DRIVER_MODE }})${{ matrix.TORCH_VER && format(', {0}', matrix.TORCH_VER) || '' }}${{ matrix.MODE == 'nightly-numba-cuda' && ', latest' || '' }}
+    name: Python ${{ matrix.PY_VER }}, CUDA ${{ matrix.CUDA_VER }} (${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}), GPU ${{ matrix.GPU }}${{ matrix.GPU_COUNT != '1' && format(' (x{0})', matrix.GPU_COUNT) || '' }} (${{ matrix.DRIVER_MODE }})${{ matrix.TORCH_VER && format(', {0}', matrix.TORCH_VER) || '' }}${{ matrix.MODE == 'nightly-numba-cuda' && ', latest' || '' }}
     # The build stage could fail but we want the CI to keep moving.
     needs: compute-matrix
     strategy:
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 884a4865523..35f02847ed7 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -77,6 +77,9 @@ linux:
     - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    # nightly-standard (arm64 l4×2 — nightly-only per runner team request)
+    - { MODE: 'nightly-standard', ARCH: 'arm64', PY_VER: '3.14',  CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '2', DRIVER: 'latest' }
+    - { MODE: 'nightly-standard', ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '2', DRIVER: 'latest' }
 
 windows:
   pull-request:
@@ -99,6 +102,9 @@ windows:
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
+    # special runners
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.2.1', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'TCC' }
+    - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.2.1', LOCAL_CTK: '0', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest', DRIVER_MODE: 'MCDM' }
   nightly:
     # nightly-pytorch
     - { MODE: 'nightly-pytorch',    ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.6.3', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.11.0',   TORCH_CUDA: 'cu126' }