From 956fd5186bde02b2fb2f2c7882a80d0ffe6c0ac7 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Thu, 14 May 2026 01:24:25 -0400
Subject: [PATCH 1/6] Use FIPS-safe hashes for program cache keys

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 .../core/utils/_program_cache/_file_stream.py |   6 +-
 .../cuda/core/utils/_program_cache/_keys.py   |   4 +-
 cuda_core/tests/test_program_cache.py         |   2 +-
 cuda_core/tests/test_program_cache_fips.py    | 126 ++++++++++++++++++
 4 files changed, 132 insertions(+), 6 deletions(-)
 create mode 100644 cuda_core/tests/test_program_cache_fips.py

diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
index eccf494c99b..857979bc9be 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
@@ -422,11 +422,11 @@ def _path_for_key(self, key: object) -> Path:
         k = _as_key_bytes(key)
         # Hash the key to a fixed-length identifier so arbitrary-length user
         # keys never exceed per-component filename limits (typically 255 on
-        # ext4 / NTFS). With a 256-bit blake2b digest, the cache relies on
+        # ext4 / NTFS). With a 256-bit SHA-256 digest, the cache relies on
         # cryptographic collision resistance for key uniqueness -- two
         # distinct keys hashing to the same path is astronomically unlikely
-        # (~2^-128 with the 32-byte digest in use here).
-        digest = hashlib.blake2b(k, digest_size=32).hexdigest()
+        # (~2^-128 for practical collision work).
+        digest = hashlib.sha256(k).hexdigest()
         return self._entries / digest[:2] / digest[2:]
 
     # -- mapping API ---------------------------------------------------------
diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py
index fbb5ef3f890..bb0fae9278a 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_keys.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py
@@ -35,7 +35,7 @@
 )
 
 # Bump when the key schema changes in a way that invalidates existing caches.
-_KEY_SCHEMA_VERSION = 1
+_KEY_SCHEMA_VERSION = 2
 
 _VALID_CODE_TYPES = frozenset({"c++", "ptx", "nvvm"})
 _VALID_TARGET_TYPES = frozenset({"ptx", "cubin", "ltoir"})
@@ -768,7 +768,7 @@ def make_program_cache_key(
     option_bytes = backend.option_fingerprint(options, target_type)
     name_tags = backend.encode_name_expressions(name_expressions)
 
-    hasher = hashlib.blake2b(digest_size=32)
+    hasher = hashlib.sha256()
 
     def _update(label: str, payload: bytes) -> None:
         hasher.update(label.encode("ascii"))
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 5a0dbcba2a2..3923312a3e9 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path):
 
 def test_filestream_cache_handles_long_keys(tmp_path):
     """Arbitrary-length keys must not overflow per-component filename limits.
-    The filename is a fixed-length 256-bit blake2b digest; key uniqueness
+    The filename is a fixed-length 256-bit digest; key uniqueness
     relies on the digest's collision resistance."""
     from cuda.core.utils import FileStreamProgramCache
 
diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py
new file mode 100644
index 00000000000..5bc5f5682cd
--- /dev/null
+++ b/cuda_core/tests/test_program_cache_fips.py
@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Source-level regression tests for FIPS-safe program-cache hashing.
+
+These tests load the leaf program-cache modules directly from source with
+small stubs, so they can run without importing the full built ``cuda.core``
+package. Run with ``--noconftest`` when the compiled extensions are not
+available:
+
+    pytest cuda_core/tests/test_program_cache_fips.py --noconftest
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+from pathlib import Path
+
+
+def _load_program_cache_modules(monkeypatch):
+    cuda_pkg = types.ModuleType("cuda")
+    cuda_pkg.__path__ = []
+    core_pkg = types.ModuleType("cuda.core")
+    core_pkg.__path__ = []
+    utils_pkg = types.ModuleType("cuda.core.utils")
+    utils_pkg.__path__ = []
+    cache_pkg = types.ModuleType("cuda.core.utils._program_cache")
+    cache_pkg.__path__ = []
+    utils_internal_pkg = types.ModuleType("cuda.core._utils")
+    utils_internal_pkg.__path__ = []
+
+    module_mod = types.ModuleType("cuda.core._module")
+
+    class ObjectCode:
+        pass
+
+    module_mod.ObjectCode = ObjectCode
+
+    program_mod = types.ModuleType("cuda.core._program")
+
+    class ProgramOptions:
+        def __init__(self, **kwargs):
+            self.arch = kwargs.pop("arch", "sm_80")
+            self.name = kwargs.pop("name", "default_program")
+            for key, value in kwargs.items():
+                setattr(self, key, value)
+
+        def as_bytes(self, backend, target_type):
+            return [
+                f"backend={backend}".encode(),
+                f"target_type={target_type}".encode(),
+                f"arch={self.arch}".encode(),
+                f"name={self.name}".encode(),
+            ]
+
+    program_mod.ProgramOptions = ProgramOptions
+
+    cuda_utils_mod = types.ModuleType("cuda.core._utils.cuda_utils")
+    cuda_utils_mod.driver = types.SimpleNamespace()
+    cuda_utils_mod.handle_return = lambda result: result
+    cuda_utils_mod.nvrtc = types.SimpleNamespace(nvrtcVersion=lambda: (13, 0))
+
+    modules = {
+        "cuda": cuda_pkg,
+        "cuda.core": core_pkg,
+        "cuda.core.utils": utils_pkg,
+        "cuda.core.utils._program_cache": cache_pkg,
+        "cuda.core._utils": utils_internal_pkg,
+        "cuda.core._module": module_mod,
+        "cuda.core._program": program_mod,
+        "cuda.core._utils.cuda_utils": cuda_utils_mod,
+    }
+    for name, module in modules.items():
+        monkeypatch.setitem(sys.modules, name, module)
+
+    base = Path(__file__).parent.parent / "cuda" / "core" / "utils" / "_program_cache"
+
+    def _load(name, filename):
+        spec = importlib.util.spec_from_file_location(name, base / filename)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        monkeypatch.setitem(sys.modules, name, module)
+        return module
+
+    _load("cuda.core.utils._program_cache._abc", "_abc.py")
+    keys_mod = _load("cuda.core.utils._program_cache._keys", "_keys.py")
+    file_stream_mod = _load("cuda.core.utils._program_cache._file_stream", "_file_stream.py")
+    return keys_mod, file_stream_mod, ProgramOptions
+
+
+def test_make_program_cache_key_avoids_fips_blocked_blake2b(monkeypatch):
+    import hashlib
+
+    keys_mod, _file_stream_mod, ProgramOptions = _load_program_cache_modules(monkeypatch)
+
+    def _blake2b_disabled(*args, **kwargs):
+        raise ValueError("disabled for FIPS")
+
+    monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled)
+
+    key = keys_mod.make_program_cache_key(
+        code="extern \"C\" __global__ void k() {}",
+        code_type="c++",
+        options=ProgramOptions(arch="sm_80"),
+        target_type="cubin",
+    )
+
+    assert isinstance(key, bytes)
+    assert len(key) == 32
+
+
+def test_filestream_cache_path_hash_avoids_fips_blocked_blake2b(tmp_path, monkeypatch):
+    import hashlib
+
+    _keys_mod, file_stream_mod, _ProgramOptions = _load_program_cache_modules(monkeypatch)
+
+    def _blake2b_disabled(*args, **kwargs):
+        raise ValueError("disabled for FIPS")
+
+    monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled)
+
+    with file_stream_mod.FileStreamProgramCache(tmp_path / "fc") as cache:
+        cache[b"my-key"] = b"payload"
+        assert cache[b"my-key"] == b"payload"

From d8727ee4a965afd4aed3f998a1fcc03f2fae1f50 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Thu, 14 May 2026 01:31:04 -0400
Subject: [PATCH 2/6] Fix FIPS cache test linting

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 cuda_core/tests/test_program_cache_fips.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py
index 5bc5f5682cd..800039675bf 100644
--- a/cuda_core/tests/test_program_cache_fips.py
+++ b/cuda_core/tests/test_program_cache_fips.py
@@ -101,7 +101,7 @@ def _blake2b_disabled(*args, **kwargs):
     monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled)
 
     key = keys_mod.make_program_cache_key(
-        code="extern \"C\" __global__ void k() {}",
+        code='extern "C" __global__ void k() {}',
         code_type="c++",
         options=ProgramOptions(arch="sm_80"),
         target_type="cubin",

From 44164f75f84db74c1aa4ee4b6aa3777f62e7a9a0 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Thu, 14 May 2026 12:22:18 -0400
Subject: [PATCH 3/6] Use benchmarked FIPS-safe cache hashing

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 .../core/utils/_program_cache/_file_stream.py |  16 ++-
 .../cuda/core/utils/_program_cache/_keys.py   |   5 +-
 cuda_core/tests/test_program_cache.py         |   2 +-
 cuda_core/tests/test_program_cache_fips.py    | 126 ------------------
 4 files changed, 16 insertions(+), 133 deletions(-)
 delete mode 100644 cuda_core/tests/test_program_cache_fips.py

diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
index 857979bc9be..3bba1d53d6b 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
@@ -422,11 +422,17 @@ def _path_for_key(self, key: object) -> Path:
         k = _as_key_bytes(key)
         # Hash the key to a fixed-length identifier so arbitrary-length user
         # keys never exceed per-component filename limits (typically 255 on
-        # ext4 / NTFS). With a 256-bit SHA-256 digest, the cache relies on
-        # cryptographic collision resistance for key uniqueness -- two
-        # distinct keys hashing to the same path is astronomically unlikely
-        # (~2^-128 for practical collision work).
-        digest = hashlib.sha256(k).hexdigest()
+        # ext4 / NTFS).
+        #
+        # FIPS: must use a FIPS-approved hash algorithm. FIPS-enforcing
+        # systems can disable non-approved hashlib algorithms (for example
+        # blake2b) at the OpenSSL level. See #2043.
+        #
+        # With a 384-bit SHA-384 digest, the cache relies on collision
+        # resistance for key uniqueness -- two distinct keys hashing to the
+        # same path is astronomically unlikely (~2^-192 for practical
+        # collision work).
+        digest = hashlib.sha384(k, usedforsecurity=False).hexdigest()
         return self._entries / digest[:2] / digest[2:]
 
     # -- mapping API ---------------------------------------------------------
diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py
index bb0fae9278a..34ef1af5242 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_keys.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py
@@ -768,7 +768,10 @@ def make_program_cache_key(
     option_bytes = backend.option_fingerprint(options, target_type)
     name_tags = backend.encode_name_expressions(name_expressions)
 
-    hasher = hashlib.sha256()
+    # IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family).
+    # FIPS-enforcing systems can disable non-approved hashlib algorithms
+    # (for example blake2b) at the OpenSSL level. See #2043.
+    hasher = hashlib.sha384(usedforsecurity=False)
 
     def _update(label: str, payload: bytes) -> None:
         hasher.update(label.encode("ascii"))
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 3923312a3e9..44f8a7f5552 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -126,7 +126,7 @@ def _make_key(**overrides):
 def test_make_program_cache_key_returns_bytes():
     key = _make_key()
     assert isinstance(key, bytes)
-    assert len(key) == 32
+    assert len(key) == 48
 
 
 def test_make_program_cache_key_propagates_as_bytes_typeerror(monkeypatch):
diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py
deleted file mode 100644
index 800039675bf..00000000000
--- a/cuda_core/tests/test_program_cache_fips.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Source-level regression tests for FIPS-safe program-cache hashing.
-
-These tests load the leaf program-cache modules directly from source with
-small stubs, so they can run without importing the full built ``cuda.core``
-package. Run with ``--noconftest`` when the compiled extensions are not
-available:
-
-    pytest cuda_core/tests/test_program_cache_fips.py --noconftest
-"""
-
-from __future__ import annotations
-
-import importlib.util
-import sys
-import types
-from pathlib import Path
-
-
-def _load_program_cache_modules(monkeypatch):
-    cuda_pkg = types.ModuleType("cuda")
-    cuda_pkg.__path__ = []
-    core_pkg = types.ModuleType("cuda.core")
-    core_pkg.__path__ = []
-    utils_pkg = types.ModuleType("cuda.core.utils")
-    utils_pkg.__path__ = []
-    cache_pkg = types.ModuleType("cuda.core.utils._program_cache")
-    cache_pkg.__path__ = []
-    utils_internal_pkg = types.ModuleType("cuda.core._utils")
-    utils_internal_pkg.__path__ = []
-
-    module_mod = types.ModuleType("cuda.core._module")
-
-    class ObjectCode:
-        pass
-
-    module_mod.ObjectCode = ObjectCode
-
-    program_mod = types.ModuleType("cuda.core._program")
-
-    class ProgramOptions:
-        def __init__(self, **kwargs):
-            self.arch = kwargs.pop("arch", "sm_80")
-            self.name = kwargs.pop("name", "default_program")
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-        def as_bytes(self, backend, target_type):
-            return [
-                f"backend={backend}".encode(),
-                f"target_type={target_type}".encode(),
-                f"arch={self.arch}".encode(),
-                f"name={self.name}".encode(),
-            ]
-
-    program_mod.ProgramOptions = ProgramOptions
-
-    cuda_utils_mod = types.ModuleType("cuda.core._utils.cuda_utils")
-    cuda_utils_mod.driver = types.SimpleNamespace()
-    cuda_utils_mod.handle_return = lambda result: result
-    cuda_utils_mod.nvrtc = types.SimpleNamespace(nvrtcVersion=lambda: (13, 0))
-
-    modules = {
-        "cuda": cuda_pkg,
-        "cuda.core": core_pkg,
-        "cuda.core.utils": utils_pkg,
-        "cuda.core.utils._program_cache": cache_pkg,
-        "cuda.core._utils": utils_internal_pkg,
-        "cuda.core._module": module_mod,
-        "cuda.core._program": program_mod,
-        "cuda.core._utils.cuda_utils": cuda_utils_mod,
-    }
-    for name, module in modules.items():
-        monkeypatch.setitem(sys.modules, name, module)
-
-    base = Path(__file__).parent.parent / "cuda" / "core" / "utils" / "_program_cache"
-
-    def _load(name, filename):
-        spec = importlib.util.spec_from_file_location(name, base / filename)
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        monkeypatch.setitem(sys.modules, name, module)
-        return module
-
-    _load("cuda.core.utils._program_cache._abc", "_abc.py")
-    keys_mod = _load("cuda.core.utils._program_cache._keys", "_keys.py")
-    file_stream_mod = _load("cuda.core.utils._program_cache._file_stream", "_file_stream.py")
-    return keys_mod, file_stream_mod, ProgramOptions
-
-
-def test_make_program_cache_key_avoids_fips_blocked_blake2b(monkeypatch):
-    import hashlib
-
-    keys_mod, _file_stream_mod, ProgramOptions = _load_program_cache_modules(monkeypatch)
-
-    def _blake2b_disabled(*args, **kwargs):
-        raise ValueError("disabled for FIPS")
-
-    monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled)
-
-    key = keys_mod.make_program_cache_key(
-        code='extern "C" __global__ void k() {}',
-        code_type="c++",
-        options=ProgramOptions(arch="sm_80"),
-        target_type="cubin",
-    )
-
-    assert isinstance(key, bytes)
-    assert len(key) == 32
-
-
-def test_filestream_cache_path_hash_avoids_fips_blocked_blake2b(tmp_path, monkeypatch):
-    import hashlib
-
-    _keys_mod, file_stream_mod, _ProgramOptions = _load_program_cache_modules(monkeypatch)
-
-    def _blake2b_disabled(*args, **kwargs):
-        raise ValueError("disabled for FIPS")
-
-    monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled)
-
-    with file_stream_mod.FileStreamProgramCache(tmp_path / "fc") as cache:
-        cache[b"my-key"] = b"payload"
-        assert cache[b"my-key"] == b"payload"

From c08fc21754a1fab2ccc6d3028002d52b097687af Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Thu, 14 May 2026 12:55:00 -0400
Subject: [PATCH 4/6] Clarify SHA-384 cache digest comment

Signed-off-by: Aryan <aryansputta@gmail.com>
---
 cuda_core/tests/test_program_cache.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index 44f8a7f5552..bca3ee1a1c3 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path):
 
 def test_filestream_cache_handles_long_keys(tmp_path):
     """Arbitrary-length keys must not overflow per-component filename limits.
-    The filename is a fixed-length 256-bit digest; key uniqueness
+    The filename is a fixed-length 384-bit digest; key uniqueness
     relies on the digest's collision resistance."""
     from cuda.core.utils import FileStreamProgramCache
 

From 6f98260c0655e51a37427c25e37286eb95a9b3e5 Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Fri, 15 May 2026 17:09:52 -0400
Subject: [PATCH 5/6] Switch program cache hashing back to sha256

---
 .../core/utils/_program_cache/_file_stream.py |   8 +-
 .../cuda/core/utils/_program_cache/_keys.py   |   2 +-
 cuda_core/tests/test_program_cache.py         |   4 +-
 scripts/bench_program_cache_hashes.py         | 266 ++++++++++++++++++
 4 files changed, 273 insertions(+), 7 deletions(-)
 create mode 100644 scripts/bench_program_cache_hashes.py

diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
index 3bba1d53d6b..4459eeded6b 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
@@ -428,11 +428,11 @@ def _path_for_key(self, key: object) -> Path:
         # systems can disable non-approved hashlib algorithms (for example
         # blake2b) at the OpenSSL level. See #2043.
         #
-        # With a 384-bit SHA-384 digest, the cache relies on collision
+        # With a 256-bit SHA-256 digest, the cache relies on collision
         # resistance for key uniqueness -- two distinct keys hashing to the
-        # same path is astronomically unlikely (~2^-192 for practical
-        # collision work).
-        digest = hashlib.sha384(k, usedforsecurity=False).hexdigest()
+        # same path is astronomically unlikely (finding a collision takes
+        # on the order of 2^128 hash evaluations).
+        digest = hashlib.sha256(k, usedforsecurity=False).hexdigest()
         return self._entries / digest[:2] / digest[2:]
 
     # -- mapping API ---------------------------------------------------------
diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py
index 34ef1af5242..039853204cc 100644
--- a/cuda_core/cuda/core/utils/_program_cache/_keys.py
+++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py
@@ -771,7 +771,7 @@ def make_program_cache_key(
     # IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family).
     # FIPS-enforcing systems can disable non-approved hashlib algorithms
     # (for example blake2b) at the OpenSSL level. See #2043.
-    hasher = hashlib.sha384(usedforsecurity=False)
+    hasher = hashlib.sha256(usedforsecurity=False)
 
     def _update(label: str, payload: bytes) -> None:
         hasher.update(label.encode("ascii"))
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
index bca3ee1a1c3..3923312a3e9 100644
--- a/cuda_core/tests/test_program_cache.py
+++ b/cuda_core/tests/test_program_cache.py
@@ -126,7 +126,7 @@ def _make_key(**overrides):
 def test_make_program_cache_key_returns_bytes():
     key = _make_key()
     assert isinstance(key, bytes)
-    assert len(key) == 48
+    assert len(key) == 32
 
 
 def test_make_program_cache_key_propagates_as_bytes_typeerror(monkeypatch):
@@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path):
 
 def test_filestream_cache_handles_long_keys(tmp_path):
     """Arbitrary-length keys must not overflow per-component filename limits.
-    The filename is a fixed-length 384-bit digest; key uniqueness
+    The filename is a fixed-length 256-bit digest; key uniqueness
     relies on the digest's collision resistance."""
     from cuda.core.utils import FileStreamProgramCache
 
diff --git a/scripts/bench_program_cache_hashes.py b/scripts/bench_program_cache_hashes.py
new file mode 100644
index 00000000000..147a847d90e
--- /dev/null
+++ b/scripts/bench_program_cache_hashes.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Benchmark FIPS-available hashlib candidates for cuda.core program-cache use.
+
+This mirrors the two relevant call sites:
+
+* ``FileStreamProgramCache._path_for_key()``: hash a cache key to a stable
+  filename component via ``hexdigest()``.
+* ``make_program_cache_key()``: incrementally build the digest from labeled
+  payload chunks and return ``digest()``.
+
+This is a review/support tool, not a production dependency. The benchmark is
+intentionally stdlib-only so reviewers can run it directly.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import inspect
+import statistics
+import sys
+import time
+from dataclasses import dataclass
+from typing import Callable
+
+_DEFAULT_ALGORITHMS = (
+    "sha1",
+    "sha224",
+    "sha256",
+    "sha384",
+    "sha512",
+    "sha512_224",
+    "sha512_256",
+    "sha3_224",
+    "sha3_256",
+    "sha3_384",
+    "sha3_512",
+    "shake_128",
+    "shake_256",
+)
+
+_SHAKE_DIGEST_BYTES = 32
+
+
+@dataclass(frozen=True)
+class HashCase:
+    name: str
+    runner: Callable[[Callable[..., object]], None]
+
+
+def _supports_usedforsecurity(constructor: Callable[..., object]) -> bool:
+    try:
+        signature = inspect.signature(constructor)
+    except (TypeError, ValueError):
+        return False
+    return "usedforsecurity" in signature.parameters
+
+
+def _make_constructor(name: str) -> Callable[..., object]:
+    constructor = getattr(hashlib, name, None)
+    if constructor is not None:
+        if _supports_usedforsecurity(constructor):
+            return lambda data=b"": constructor(data, usedforsecurity=False)
+        return constructor
+
+    def _constructor(data=b""):
+        try:
+            return hashlib.new(name, data, usedforsecurity=False)
+        except TypeError:
+            return hashlib.new(name, data)
+
+    return _constructor
+
+
+def _file_stream_case(name: str, key: bytes) -> HashCase:
+    def _runner(constructor: Callable[..., object]) -> None:
+        _hex_digest(constructor(key))
+
+    return HashCase(name, _runner)
+
+
+def _program_cache_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase:
+    def _runner(constructor: Callable[..., object]) -> None:
+        hasher = constructor()
+        for label, payload in payloads:
+            hasher.update(label.encode("ascii"))
+            hasher.update(len(payload).to_bytes(8, "big"))
+            hasher.update(payload)
+        _digest_bytes(hasher)
+
+    return HashCase(name, _runner)
+
+
+def _end_to_end_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase:
+    def _runner(constructor: Callable[..., object]) -> None:
+        hasher = constructor()
+        for label, payload in payloads:
+            hasher.update(label.encode("ascii"))
+            hasher.update(len(payload).to_bytes(8, "big"))
+            hasher.update(payload)
+        key = _digest_bytes(hasher)
+        _hex_digest(constructor(key))
+
+    return HashCase(name, _runner)
+
+
+def _digest_bytes(hasher: object) -> bytes:
+    try:
+        return hasher.digest()
+    except TypeError:
+        return hasher.digest(_SHAKE_DIGEST_BYTES)
+
+
+def _hex_digest(hasher: object) -> str:
+    try:
+        return hasher.hexdigest()
+    except TypeError:
+        return hasher.hexdigest(_SHAKE_DIGEST_BYTES)
+
+
+def _sample_cases() -> tuple[HashCase, ...]:
+    file_stream_key = bytes.fromhex("ab" * 32)
+    long_file_stream_key = (b"cuda-core-cache-key-" * 128)[:4096]
+
+    source = b"""
+extern "C" __global__ void saxpy(float a, const float* x, float* y) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    y[i] = a * x[i] + y[i];
+}
+""".strip()
+    ptx = b"""
+.version 8.0
+.target sm_90
+.address_size 64
+.visible .entry saxpy() { ret; }
+""".strip()
+    option_bytes = (
+        b"name='saxpy'",
+        b"arch='sm_90'",
+        b"max_register_count=None",
+        b"time=False",
+        b"link_time_optimization=False",
+        b"debug=False",
+        b"lineinfo=False",
+        b"ftz=None",
+        b"prec_div=None",
+        b"prec_sqrt=None",
+        b"fma=None",
+        b"split_compile=None",
+        b"ptxas_options=None",
+        b"no_cache=False",
+    )
+    names = (b"saxpy", b"_Z5saxpyv")
+    extra_digest = bytes.fromhex("cd" * 32)
+
+    cpp_payloads = (
+        ("schema", b"2"),
+        ("nvrtc", b"13.2"),
+        ("code_type", b"c++"),
+        ("target_type", b"cubin"),
+        ("code", source),
+        ("option_count", str(len(option_bytes)).encode("ascii")),
+        *tuple(("option", item) for item in option_bytes),
+        ("names_count", str(len(names)).encode("ascii")),
+        *tuple(("name", item) for item in names),
+        ("options_name", b"saxpy"),
+        ("extra_digest", extra_digest),
+    )
+    ptx_payloads = (
+        ("schema", b"2"),
+        ("linker", b"nvJitLink-13.2"),
+        ("code_type", b"ptx"),
+        ("target_type", b"cubin"),
+        ("code", ptx),
+        ("option_count", str(len(option_bytes)).encode("ascii")),
+        *tuple(("option", item) for item in option_bytes),
+        ("names_count", b"0"),
+        ("extra_digest", extra_digest),
+    )
+
+    return (
+        _file_stream_case("file_stream_key_32b", file_stream_key),
+        _file_stream_case("file_stream_key_4k", long_file_stream_key),
+        _program_cache_case("program_cache_cpp", cpp_payloads),
+        _program_cache_case("program_cache_ptx", ptx_payloads),
+        _end_to_end_case("end_to_end_cpp", cpp_payloads),
+        _end_to_end_case("end_to_end_ptx", ptx_payloads),
+    )
+
+
+def _benchmark_case(
+    case: HashCase,
+    constructor: Callable[..., object],
+    *,
+    loops: int,
+    repeat: int,
+) -> tuple[float, float]:
+    samples_ns: list[float] = []
+    for _ in range(repeat):
+        start = time.perf_counter_ns()
+        for _ in range(loops):
+            case.runner(constructor)
+        elapsed = time.perf_counter_ns() - start
+        samples_ns.append(elapsed / loops)
+    return statistics.mean(samples_ns), min(samples_ns)
+
+
+def _format_ns(value: float) -> str:
+    return f"{value:,.1f}"
+
+
+def _write_line(text: str = "") -> None:
+    sys.stdout.write(text + "\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--loops",
+        type=int,
+        default=200_000,
+        help="Iterations per repeat for each algorithm/case pair.",
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=7,
+        help="Independent timing repeats for each algorithm/case pair.",
+    )
+    parser.add_argument(
+        "--algorithms",
+        nargs="+",
+        default=list(_DEFAULT_ALGORITHMS),
+        help="hashlib algorithm names to benchmark.",
+    )
+    args = parser.parse_args()
+
+    cases = _sample_cases()
+    widths = {
+        "algorithm": max(len("Algorithm"), max(len(name) for name in args.algorithms)),
+        "case": max(len(case.name) for case in cases),
+    }
+
+    _write_line(
+        f"{'Algorithm':<{widths['algorithm']}}  "
+        f"{'Case':<{widths['case']}}  {'mean ns/op':>12}  {'best ns/op':>12}"
+    )
+    _write_line("-" * (widths["algorithm"] + widths["case"] + 28))
+
+    for algorithm in args.algorithms:
+        constructor = _make_constructor(algorithm)
+        for case in cases:
+            mean_ns, best_ns = _benchmark_case(case, constructor, loops=args.loops, repeat=args.repeat)
+            _write_line(
+                f"{algorithm:<{widths['algorithm']}}  "
+                f"{case.name:<{widths['case']}}  "
+                f"{_format_ns(mean_ns):>12}  {_format_ns(best_ns):>12}"
+            )
+
+
+if __name__ == "__main__":
+    main()

From 70fa564c1b6015f90c2a791bc7940b86d50fd30d Mon Sep 17 00:00:00 2001
From: Aryan <aryansputta@gmail.com>
Date: Fri, 15 May 2026 22:12:03 -0400
Subject: [PATCH 6/6] Remove benchmark helper script

---
 scripts/bench_program_cache_hashes.py | 266 --------------------------
 1 file changed, 266 deletions(-)
 delete mode 100644 scripts/bench_program_cache_hashes.py

diff --git a/scripts/bench_program_cache_hashes.py b/scripts/bench_program_cache_hashes.py
deleted file mode 100644
index 147a847d90e..00000000000
--- a/scripts/bench_program_cache_hashes.py
+++ /dev/null
@@ -1,266 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""Benchmark FIPS-available hashlib candidates for cuda.core program-cache use.
-
-This mirrors the two relevant call sites:
-
-* ``FileStreamProgramCache._path_for_key()``: hash a cache key to a stable
-  filename component via ``hexdigest()``.
-* ``make_program_cache_key()``: incrementally build the digest from labeled
-  payload chunks and return ``digest()``.
-
-This is a review/support tool, not a production dependency. The benchmark is
-intentionally stdlib-only so reviewers can run it directly.
-"""
-
-from __future__ import annotations
-
-import argparse
-import hashlib
-import inspect
-import statistics
-import sys
-import time
-from dataclasses import dataclass
-from typing import Callable
-
-_DEFAULT_ALGORITHMS = (
-    "sha1",
-    "sha224",
-    "sha256",
-    "sha384",
-    "sha512",
-    "sha512_224",
-    "sha512_256",
-    "sha3_224",
-    "sha3_256",
-    "sha3_384",
-    "sha3_512",
-    "shake_128",
-    "shake_256",
-)
-
-_SHAKE_DIGEST_BYTES = 32
-
-
-@dataclass(frozen=True)
-class HashCase:
-    name: str
-    runner: Callable[[Callable[..., object]], None]
-
-
-def _supports_usedforsecurity(constructor: Callable[..., object]) -> bool:
-    try:
-        signature = inspect.signature(constructor)
-    except (TypeError, ValueError):
-        return False
-    return "usedforsecurity" in signature.parameters
-
-
-def _make_constructor(name: str) -> Callable[..., object]:
-    constructor = getattr(hashlib, name, None)
-    if constructor is not None:
-        if _supports_usedforsecurity(constructor):
-            return lambda data=b"": constructor(data, usedforsecurity=False)
-        return constructor
-
-    def _constructor(data=b""):
-        try:
-            return hashlib.new(name, data, usedforsecurity=False)
-        except TypeError:
-            return hashlib.new(name, data)
-
-    return _constructor
-
-
-def _file_stream_case(name: str, key: bytes) -> HashCase:
-    def _runner(constructor: Callable[..., object]) -> None:
-        _hex_digest(constructor(key))
-
-    return HashCase(name, _runner)
-
-
-def _program_cache_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase:
-    def _runner(constructor: Callable[..., object]) -> None:
-        hasher = constructor()
-        for label, payload in payloads:
-            hasher.update(label.encode("ascii"))
-            hasher.update(len(payload).to_bytes(8, "big"))
-            hasher.update(payload)
-        _digest_bytes(hasher)
-
-    return HashCase(name, _runner)
-
-
-def _end_to_end_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase:
-    def _runner(constructor: Callable[..., object]) -> None:
-        hasher = constructor()
-        for label, payload in payloads:
-            hasher.update(label.encode("ascii"))
-            hasher.update(len(payload).to_bytes(8, "big"))
-            hasher.update(payload)
-        key = _digest_bytes(hasher)
-        _hex_digest(constructor(key))
-
-    return HashCase(name, _runner)
-
-
-def _digest_bytes(hasher: object) -> bytes:
-    try:
-        return hasher.digest()
-    except TypeError:
-        return hasher.digest(_SHAKE_DIGEST_BYTES)
-
-
-def _hex_digest(hasher: object) -> str:
-    try:
-        return hasher.hexdigest()
-    except TypeError:
-        return hasher.hexdigest(_SHAKE_DIGEST_BYTES)
-
-
-def _sample_cases() -> tuple[HashCase, ...]:
-    file_stream_key = bytes.fromhex("ab" * 32)
-    long_file_stream_key = (b"cuda-core-cache-key-" * 128)[:4096]
-
-    source = b"""
-extern "C" __global__ void saxpy(float a, const float* x, float* y) {
-    const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    y[i] = a * x[i] + y[i];
-}
-""".strip()
-    ptx = b"""
-.version 8.0
-.target sm_90
-.address_size 64
-.visible .entry saxpy() { ret; }
-""".strip()
-    option_bytes = (
-        b"name='saxpy'",
-        b"arch='sm_90'",
-        b"max_register_count=None",
-        b"time=False",
-        b"link_time_optimization=False",
-        b"debug=False",
-        b"lineinfo=False",
-        b"ftz=None",
-        b"prec_div=None",
-        b"prec_sqrt=None",
-        b"fma=None",
-        b"split_compile=None",
-        b"ptxas_options=None",
-        b"no_cache=False",
-    )
-    names = (b"saxpy", b"_Z5saxpyv")
-    extra_digest = bytes.fromhex("cd" * 32)
-
-    cpp_payloads = (
-        ("schema", b"2"),
-        ("nvrtc", b"13.2"),
-        ("code_type", b"c++"),
-        ("target_type", b"cubin"),
-        ("code", source),
-        ("option_count", str(len(option_bytes)).encode("ascii")),
-        *tuple(("option", item) for item in option_bytes),
-        ("names_count", str(len(names)).encode("ascii")),
-        *tuple(("name", item) for item in names),
-        ("options_name", b"saxpy"),
-        ("extra_digest", extra_digest),
-    )
-    ptx_payloads = (
-        ("schema", b"2"),
-        ("linker", b"nvJitLink-13.2"),
-        ("code_type", b"ptx"),
-        ("target_type", b"cubin"),
-        ("code", ptx),
-        ("option_count", str(len(option_bytes)).encode("ascii")),
-        *tuple(("option", item) for item in option_bytes),
-        ("names_count", b"0"),
-        ("extra_digest", extra_digest),
-    )
-
-    return (
-        _file_stream_case("file_stream_key_32b", file_stream_key),
-        _file_stream_case("file_stream_key_4k", long_file_stream_key),
-        _program_cache_case("program_cache_cpp", cpp_payloads),
-        _program_cache_case("program_cache_ptx", ptx_payloads),
-        _end_to_end_case("end_to_end_cpp", cpp_payloads),
-        _end_to_end_case("end_to_end_ptx", ptx_payloads),
-    )
-
-
-def _benchmark_case(
-    case: HashCase,
-    constructor: Callable[..., object],
-    *,
-    loops: int,
-    repeat: int,
-) -> tuple[float, float]:
-    samples_ns: list[float] = []
-    for _ in range(repeat):
-        start = time.perf_counter_ns()
-        for _ in range(loops):
-            case.runner(constructor)
-        elapsed = time.perf_counter_ns() - start
-        samples_ns.append(elapsed / loops)
-    return statistics.mean(samples_ns), min(samples_ns)
-
-
-def _format_ns(value: float) -> str:
-    return f"{value:,.1f}"
-
-
-def _write_line(text: str = "") -> None:
-    sys.stdout.write(text + "\n")
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--loops",
-        type=int,
-        default=200_000,
-        help="Iterations per repeat for each algorithm/case pair.",
-    )
-    parser.add_argument(
-        "--repeat",
-        type=int,
-        default=7,
-        help="Independent timing repeats for each algorithm/case pair.",
-    )
-    parser.add_argument(
-        "--algorithms",
-        nargs="+",
-        default=list(_DEFAULT_ALGORITHMS),
-        help="hashlib algorithm names to benchmark.",
-    )
-    args = parser.parse_args()
-
-    cases = _sample_cases()
-    widths = {
-        "algorithm": max(len("Algorithm"), max(len(name) for name in args.algorithms)),
-        "case": max(len(case.name) for case in cases),
-    }
-
-    _write_line(
-        f"{'Algorithm':<{widths['algorithm']}}  "
-        f"{'Case':<{widths['case']}}  {'mean ns/op':>12}  {'best ns/op':>12}"
-    )
-    _write_line("-" * (widths["algorithm"] + widths["case"] + 28))
-
-    for algorithm in args.algorithms:
-        constructor = _make_constructor(algorithm)
-        for case in cases:
-            mean_ns, best_ns = _benchmark_case(case, constructor, loops=args.loops, repeat=args.repeat)
-            _write_line(
-                f"{algorithm:<{widths['algorithm']}}  "
-                f"{case.name:<{widths['case']}}  "
-                f"{_format_ns(mean_ns):>12}  {_format_ns(best_ns):>12}"
-            )
-
-
-if __name__ == "__main__":
-    main()