From 956fd5186bde02b2fb2f2c7882a80d0ffe6c0ac7 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 14 May 2026 01:24:25 -0400 Subject: [PATCH 1/6] Use FIPS-safe hashes for program cache keys Signed-off-by: Aryan --- .../core/utils/_program_cache/_file_stream.py | 6 +- .../cuda/core/utils/_program_cache/_keys.py | 4 +- cuda_core/tests/test_program_cache.py | 2 +- cuda_core/tests/test_program_cache_fips.py | 126 ++++++++++++++++++ 4 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 cuda_core/tests/test_program_cache_fips.py diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py index eccf494c99b..857979bc9be 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py +++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py @@ -422,11 +422,11 @@ def _path_for_key(self, key: object) -> Path: k = _as_key_bytes(key) # Hash the key to a fixed-length identifier so arbitrary-length user # keys never exceed per-component filename limits (typically 255 on - # ext4 / NTFS). With a 256-bit blake2b digest, the cache relies on + # ext4 / NTFS). With a 256-bit SHA-256 digest, the cache relies on # cryptographic collision resistance for key uniqueness -- two # distinct keys hashing to the same path is astronomically unlikely - # (~2^-128 with the 32-byte digest in use here). - digest = hashlib.blake2b(k, digest_size=32).hexdigest() + # (~2^-128 for practical collision work). + digest = hashlib.sha256(k).hexdigest() return self._entries / digest[:2] / digest[2:] # -- mapping API --------------------------------------------------------- diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py index fbb5ef3f890..bb0fae9278a 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_keys.py +++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py @@ -35,7 +35,7 @@ ) # Bump when the key schema changes in a way that invalidates existing caches. -_KEY_SCHEMA_VERSION = 1 +_KEY_SCHEMA_VERSION = 2 _VALID_CODE_TYPES = frozenset({"c++", "ptx", "nvvm"}) _VALID_TARGET_TYPES = frozenset({"ptx", "cubin", "ltoir"}) @@ -768,7 +768,7 @@ def make_program_cache_key( option_bytes = backend.option_fingerprint(options, target_type) name_tags = backend.encode_name_expressions(name_expressions) - hasher = hashlib.blake2b(digest_size=32) + hasher = hashlib.sha256() def _update(label: str, payload: bytes) -> None: hasher.update(label.encode("ascii")) diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index 5a0dbcba2a2..3923312a3e9 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path): def test_filestream_cache_handles_long_keys(tmp_path): """Arbitrary-length keys must not overflow per-component filename limits. - The filename is a fixed-length 256-bit blake2b digest; key uniqueness + The filename is a fixed-length 256-bit digest; key uniqueness relies on the digest's collision resistance.""" from cuda.core.utils import FileStreamProgramCache diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py new file mode 100644 index 00000000000..5bc5f5682cd --- /dev/null +++ b/cuda_core/tests/test_program_cache_fips.py @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Source-level regression tests for FIPS-safe program-cache hashing. + +These tests load the leaf program-cache modules directly from source with +small stubs, so they can run without importing the full built ``cuda.core`` +package. Run with ``--noconftest`` when the compiled extensions are not +available: + + pytest cuda_core/tests/test_program_cache_fips.py --noconftest +""" + +from __future__ import annotations + +import importlib.util +import sys +import types +from pathlib import Path + + +def _load_program_cache_modules(monkeypatch): + cuda_pkg = types.ModuleType("cuda") + cuda_pkg.__path__ = [] + core_pkg = types.ModuleType("cuda.core") + core_pkg.__path__ = [] + utils_pkg = types.ModuleType("cuda.core.utils") + utils_pkg.__path__ = [] + cache_pkg = types.ModuleType("cuda.core.utils._program_cache") + cache_pkg.__path__ = [] + utils_internal_pkg = types.ModuleType("cuda.core._utils") + utils_internal_pkg.__path__ = [] + + module_mod = types.ModuleType("cuda.core._module") + + class ObjectCode: + pass + + module_mod.ObjectCode = ObjectCode + + program_mod = types.ModuleType("cuda.core._program") + + class ProgramOptions: + def __init__(self, **kwargs): + self.arch = kwargs.pop("arch", "sm_80") + self.name = kwargs.pop("name", "default_program") + for key, value in kwargs.items(): + setattr(self, key, value) + + def as_bytes(self, backend, target_type): + return [ + f"backend={backend}".encode(), + f"target_type={target_type}".encode(), + f"arch={self.arch}".encode(), + f"name={self.name}".encode(), + ] + + program_mod.ProgramOptions = ProgramOptions + + cuda_utils_mod = types.ModuleType("cuda.core._utils.cuda_utils") + cuda_utils_mod.driver = types.SimpleNamespace() + cuda_utils_mod.handle_return = lambda result: result + cuda_utils_mod.nvrtc = types.SimpleNamespace(nvrtcVersion=lambda: (13, 0)) + + modules = { + "cuda": cuda_pkg, + "cuda.core": core_pkg, + "cuda.core.utils": utils_pkg, + "cuda.core.utils._program_cache": cache_pkg, + "cuda.core._utils": utils_internal_pkg, + "cuda.core._module": module_mod, + "cuda.core._program": program_mod, + "cuda.core._utils.cuda_utils": cuda_utils_mod, + } + for name, module in modules.items(): + monkeypatch.setitem(sys.modules, name, module) + + base = Path(__file__).parent.parent / "cuda" / "core" / "utils" / "_program_cache" + + def _load(name, filename): + spec = importlib.util.spec_from_file_location(name, base / filename) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + monkeypatch.setitem(sys.modules, name, module) + return module + + _load("cuda.core.utils._program_cache._abc", "_abc.py") + keys_mod = _load("cuda.core.utils._program_cache._keys", "_keys.py") + file_stream_mod = _load("cuda.core.utils._program_cache._file_stream", "_file_stream.py") + return keys_mod, file_stream_mod, ProgramOptions + + +def test_make_program_cache_key_avoids_fips_blocked_blake2b(monkeypatch): + import hashlib + + keys_mod, _file_stream_mod, ProgramOptions = _load_program_cache_modules(monkeypatch) + + def _blake2b_disabled(*args, **kwargs): + raise ValueError("disabled for FIPS") + + monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled) + + key = keys_mod.make_program_cache_key( + code="extern \"C\" __global__ void k() {}", + code_type="c++", + options=ProgramOptions(arch="sm_80"), + target_type="cubin", + ) + + assert isinstance(key, bytes) + assert len(key) == 32 + + +def test_filestream_cache_path_hash_avoids_fips_blocked_blake2b(tmp_path, monkeypatch): + import hashlib + + _keys_mod, file_stream_mod, _ProgramOptions = _load_program_cache_modules(monkeypatch) + + def _blake2b_disabled(*args, **kwargs): + raise ValueError("disabled for FIPS") + + monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled) + + with file_stream_mod.FileStreamProgramCache(tmp_path / "fc") as cache: + cache[b"my-key"] = b"payload" + assert cache[b"my-key"] == b"payload" From d8727ee4a965afd4aed3f998a1fcc03f2fae1f50 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 14 May 2026 01:31:04 -0400 Subject: [PATCH 2/6] Fix FIPS cache test linting Signed-off-by: Aryan --- cuda_core/tests/test_program_cache_fips.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py index 5bc5f5682cd..800039675bf 100644 --- a/cuda_core/tests/test_program_cache_fips.py +++ b/cuda_core/tests/test_program_cache_fips.py @@ -101,7 +101,7 @@ def _blake2b_disabled(*args, **kwargs): monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled) key = keys_mod.make_program_cache_key( - code="extern \"C\" __global__ void k() {}", + code='extern "C" __global__ void k() {}', code_type="c++", options=ProgramOptions(arch="sm_80"), target_type="cubin", From 44164f75f84db74c1aa4ee4b6aa3777f62e7a9a0 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 14 May 2026 12:22:18 -0400 Subject: [PATCH 3/6] Use benchmarked FIPS-safe cache hashing Signed-off-by: Aryan --- .../core/utils/_program_cache/_file_stream.py | 16 ++- .../cuda/core/utils/_program_cache/_keys.py | 5 +- cuda_core/tests/test_program_cache.py | 2 +- cuda_core/tests/test_program_cache_fips.py | 126 ------------------ 4 files changed, 16 insertions(+), 133 deletions(-) delete mode 100644 cuda_core/tests/test_program_cache_fips.py diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py index 857979bc9be..3bba1d53d6b 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py +++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py @@ -422,11 +422,17 @@ def _path_for_key(self, key: object) -> Path: k = _as_key_bytes(key) # Hash the key to a fixed-length identifier so arbitrary-length user # keys never exceed per-component filename limits (typically 255 on - # ext4 / NTFS). With a 256-bit SHA-256 digest, the cache relies on - # cryptographic collision resistance for key uniqueness -- two - # distinct keys hashing to the same path is astronomically unlikely - # (~2^-128 for practical collision work). - digest = hashlib.sha256(k).hexdigest() + # ext4 / NTFS). + # + # FIPS: must use a FIPS-approved hash algorithm. FIPS-enforcing + # systems can disable non-approved hashlib algorithms (for example + # blake2b) at the OpenSSL level. See #2043. + # + # With a 384-bit SHA-384 digest, the cache relies on collision + # resistance for key uniqueness -- two distinct keys hashing to the + # same path is astronomically unlikely (~2^-192 for practical + # collision work). + digest = hashlib.sha384(k, usedforsecurity=False).hexdigest() return self._entries / digest[:2] / digest[2:] # -- mapping API --------------------------------------------------------- diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py index bb0fae9278a..34ef1af5242 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_keys.py +++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py @@ -768,7 +768,10 @@ def make_program_cache_key( option_bytes = backend.option_fingerprint(options, target_type) name_tags = backend.encode_name_expressions(name_expressions) - hasher = hashlib.sha256() + # IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family). + # FIPS-enforcing systems can disable non-approved hashlib algorithms + # (for example blake2b) at the OpenSSL level. See #2043. + hasher = hashlib.sha384(usedforsecurity=False) def _update(label: str, payload: bytes) -> None: hasher.update(label.encode("ascii")) diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index 3923312a3e9..44f8a7f5552 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -126,7 +126,7 @@ def _make_key(**overrides): def test_make_program_cache_key_returns_bytes(): key = _make_key() assert isinstance(key, bytes) - assert len(key) == 32 + assert len(key) == 48 def test_make_program_cache_key_propagates_as_bytes_typeerror(monkeypatch): diff --git a/cuda_core/tests/test_program_cache_fips.py b/cuda_core/tests/test_program_cache_fips.py deleted file mode 100644 index 800039675bf..00000000000 --- a/cuda_core/tests/test_program_cache_fips.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Source-level regression tests for FIPS-safe program-cache hashing. - -These tests load the leaf program-cache modules directly from source with -small stubs, so they can run without importing the full built ``cuda.core`` -package. Run with ``--noconftest`` when the compiled extensions are not -available: - - pytest cuda_core/tests/test_program_cache_fips.py --noconftest -""" - -from __future__ import annotations - -import importlib.util -import sys -import types -from pathlib import Path - - -def _load_program_cache_modules(monkeypatch): - cuda_pkg = types.ModuleType("cuda") - cuda_pkg.__path__ = [] - core_pkg = types.ModuleType("cuda.core") - core_pkg.__path__ = [] - utils_pkg = types.ModuleType("cuda.core.utils") - utils_pkg.__path__ = [] - cache_pkg = types.ModuleType("cuda.core.utils._program_cache") - cache_pkg.__path__ = [] - utils_internal_pkg = types.ModuleType("cuda.core._utils") - utils_internal_pkg.__path__ = [] - - module_mod = types.ModuleType("cuda.core._module") - - class ObjectCode: - pass - - module_mod.ObjectCode = ObjectCode - - program_mod = types.ModuleType("cuda.core._program") - - class ProgramOptions: - def __init__(self, **kwargs): - self.arch = kwargs.pop("arch", "sm_80") - self.name = kwargs.pop("name", "default_program") - for key, value in kwargs.items(): - setattr(self, key, value) - - def as_bytes(self, backend, target_type): - return [ - f"backend={backend}".encode(), - f"target_type={target_type}".encode(), - f"arch={self.arch}".encode(), - f"name={self.name}".encode(), - ] - - program_mod.ProgramOptions = ProgramOptions - - cuda_utils_mod = types.ModuleType("cuda.core._utils.cuda_utils") - cuda_utils_mod.driver = types.SimpleNamespace() - cuda_utils_mod.handle_return = lambda result: result - cuda_utils_mod.nvrtc = types.SimpleNamespace(nvrtcVersion=lambda: (13, 0)) - - modules = { - "cuda": cuda_pkg, - "cuda.core": core_pkg, - "cuda.core.utils": utils_pkg, - "cuda.core.utils._program_cache": cache_pkg, - "cuda.core._utils": utils_internal_pkg, - "cuda.core._module": module_mod, - "cuda.core._program": program_mod, - "cuda.core._utils.cuda_utils": cuda_utils_mod, - } - for name, module in modules.items(): - monkeypatch.setitem(sys.modules, name, module) - - base = Path(__file__).parent.parent / "cuda" / "core" / "utils" / "_program_cache" - - def _load(name, filename): - spec = importlib.util.spec_from_file_location(name, base / filename) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - monkeypatch.setitem(sys.modules, name, module) - return module - - _load("cuda.core.utils._program_cache._abc", "_abc.py") - keys_mod = _load("cuda.core.utils._program_cache._keys", "_keys.py") - file_stream_mod = _load("cuda.core.utils._program_cache._file_stream", "_file_stream.py") - return keys_mod, file_stream_mod, ProgramOptions - - -def test_make_program_cache_key_avoids_fips_blocked_blake2b(monkeypatch): - import hashlib - - keys_mod, _file_stream_mod, ProgramOptions = _load_program_cache_modules(monkeypatch) - - def _blake2b_disabled(*args, **kwargs): - raise ValueError("disabled for FIPS") - - monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled) - - key = keys_mod.make_program_cache_key( - code='extern "C" __global__ void k() {}', - code_type="c++", - options=ProgramOptions(arch="sm_80"), - target_type="cubin", - ) - - assert isinstance(key, bytes) - assert len(key) == 32 - - -def test_filestream_cache_path_hash_avoids_fips_blocked_blake2b(tmp_path, monkeypatch): - import hashlib - - _keys_mod, file_stream_mod, _ProgramOptions = _load_program_cache_modules(monkeypatch) - - def _blake2b_disabled(*args, **kwargs): - raise ValueError("disabled for FIPS") - - monkeypatch.setattr(hashlib, "blake2b", _blake2b_disabled) - - with file_stream_mod.FileStreamProgramCache(tmp_path / "fc") as cache: - cache[b"my-key"] = b"payload" - assert cache[b"my-key"] == b"payload" From c08fc21754a1fab2ccc6d3028002d52b097687af Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 14 May 2026 12:55:00 -0400 Subject: [PATCH 4/6] Clarify SHA-384 cache digest comment Signed-off-by: Aryan --- cuda_core/tests/test_program_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index 44f8a7f5552..bca3ee1a1c3 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path): def test_filestream_cache_handles_long_keys(tmp_path): """Arbitrary-length keys must not overflow per-component filename limits. - The filename is a fixed-length 256-bit digest; key uniqueness + The filename is a fixed-length 384-bit digest; key uniqueness relies on the digest's collision resistance.""" from cuda.core.utils import FileStreamProgramCache From 6f98260c0655e51a37427c25e37286eb95a9b3e5 Mon Sep 17 00:00:00 2001 From: Aryan Date: Fri, 15 May 2026 17:09:52 -0400 Subject: [PATCH 5/6] Switch program cache hashing back to sha256 --- .../core/utils/_program_cache/_file_stream.py | 8 +- .../cuda/core/utils/_program_cache/_keys.py | 2 +- cuda_core/tests/test_program_cache.py | 4 +- scripts/bench_program_cache_hashes.py | 266 ++++++++++++++++++ 4 files changed, 273 insertions(+), 7 deletions(-) create mode 100644 scripts/bench_program_cache_hashes.py diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py index 3bba1d53d6b..4459eeded6b 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py +++ b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py @@ -428,11 +428,11 @@ def _path_for_key(self, key: object) -> Path: # systems can disable non-approved hashlib algorithms (for example # blake2b) at the OpenSSL level. See #2043. # - # With a 384-bit SHA-384 digest, the cache relies on collision + # With a 256-bit SHA-256 digest, the cache relies on collision # resistance for key uniqueness -- two distinct keys hashing to the - # same path is astronomically unlikely (~2^-192 for practical - # collision work). - digest = hashlib.sha384(k, usedforsecurity=False).hexdigest() + # same path is astronomically unlikely (~2^128 practical collision + # work). + digest = hashlib.sha256(k, usedforsecurity=False).hexdigest() return self._entries / digest[:2] / digest[2:] # -- mapping API --------------------------------------------------------- diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py index 34ef1af5242..039853204cc 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_keys.py +++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py @@ -771,7 +771,7 @@ def make_program_cache_key( # IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family). # FIPS-enforcing systems can disable non-approved hashlib algorithms # (for example blake2b) at the OpenSSL level. See #2043. - hasher = hashlib.sha384(usedforsecurity=False) + hasher = hashlib.sha256(usedforsecurity=False) def _update(label: str, payload: bytes) -> None: hasher.update(label.encode("ascii")) diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py index bca3ee1a1c3..3923312a3e9 100644 --- a/cuda_core/tests/test_program_cache.py +++ b/cuda_core/tests/test_program_cache.py @@ -126,7 +126,7 @@ def _make_key(**overrides): def test_make_program_cache_key_returns_bytes(): key = _make_key() assert isinstance(key, bytes) - assert len(key) == 48 + assert len(key) == 32 def test_make_program_cache_key_propagates_as_bytes_typeerror(monkeypatch): @@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path): def test_filestream_cache_handles_long_keys(tmp_path): """Arbitrary-length keys must not overflow per-component filename limits. - The filename is a fixed-length 384-bit digest; key uniqueness + The filename is a fixed-length 256-bit digest; key uniqueness relies on the digest's collision resistance.""" from cuda.core.utils import FileStreamProgramCache diff --git a/scripts/bench_program_cache_hashes.py b/scripts/bench_program_cache_hashes.py new file mode 100644 index 00000000000..147a847d90e --- /dev/null +++ b/scripts/bench_program_cache_hashes.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Benchmark FIPS-available hashlib candidates for cuda.core program-cache use. + +This mirrors the two relevant call sites: + +* ``FileStreamProgramCache._path_for_key()``: hash a cache key to a stable + filename component via ``hexdigest()``. +* ``make_program_cache_key()``: incrementally build the digest from labeled + payload chunks and return ``digest()``. + +This is a review/support tool, not a production dependency. The benchmark is +intentionally stdlib-only so reviewers can run it directly. +""" + +from __future__ import annotations + +import argparse +import hashlib +import inspect +import statistics +import sys +import time +from dataclasses import dataclass +from typing import Callable + +_DEFAULT_ALGORITHMS = ( + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "sha512_224", + "sha512_256", + "sha3_224", + "sha3_256", + "sha3_384", + "sha3_512", + "shake_128", + "shake_256", +) + +_SHAKE_DIGEST_BYTES = 32 + + +@dataclass(frozen=True) +class HashCase: + name: str + runner: Callable[[Callable[..., object]], None] + + +def _supports_usedforsecurity(constructor: Callable[..., object]) -> bool: + try: + signature = inspect.signature(constructor) + except (TypeError, ValueError): + return False + return "usedforsecurity" in signature.parameters + + +def _make_constructor(name: str) -> Callable[..., object]: + constructor = getattr(hashlib, name, None) + if constructor is not None: + if _supports_usedforsecurity(constructor): + return lambda data=b"": constructor(data, usedforsecurity=False) + return constructor + + def _constructor(data=b""): + try: + return hashlib.new(name, data, usedforsecurity=False) + except TypeError: + return hashlib.new(name, data) + + return _constructor + + +def _file_stream_case(name: str, key: bytes) -> HashCase: + def _runner(constructor: Callable[..., object]) -> None: + _hex_digest(constructor(key)) + + return HashCase(name, _runner) + + +def _program_cache_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase: + def _runner(constructor: Callable[..., object]) -> None: + hasher = constructor() + for label, payload in payloads: + hasher.update(label.encode("ascii")) + hasher.update(len(payload).to_bytes(8, "big")) + hasher.update(payload) + _digest_bytes(hasher) + + return HashCase(name, _runner) + + +def _end_to_end_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase: + def _runner(constructor: Callable[..., object]) -> None: + hasher = constructor() + for label, payload in payloads: + hasher.update(label.encode("ascii")) + hasher.update(len(payload).to_bytes(8, "big")) + hasher.update(payload) + key = _digest_bytes(hasher) + _hex_digest(constructor(key)) + + return HashCase(name, _runner) + + +def _digest_bytes(hasher: object) -> bytes: + try: + return hasher.digest() + except TypeError: + return hasher.digest(_SHAKE_DIGEST_BYTES) + + +def _hex_digest(hasher: object) -> str: + try: + return hasher.hexdigest() + except TypeError: + return hasher.hexdigest(_SHAKE_DIGEST_BYTES) + + +def _sample_cases() -> tuple[HashCase, ...]: + file_stream_key = bytes.fromhex("ab" * 32) + long_file_stream_key = (b"cuda-core-cache-key-" * 128)[:4096] + + source = b""" +extern "C" __global__ void saxpy(float a, const float* x, float* y) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + y[i] = a * x[i] + y[i]; +} +""".strip() + ptx = b""" +.version 8.0 +.target sm_90 +.address_size 64 +.visible .entry saxpy() { ret; } +""".strip() + option_bytes = ( + b"name='saxpy'", + b"arch='sm_90'", + b"max_register_count=None", + b"time=False", + b"link_time_optimization=False", + b"debug=False", + b"lineinfo=False", + b"ftz=None", + b"prec_div=None", + b"prec_sqrt=None", + b"fma=None", + b"split_compile=None", + b"ptxas_options=None", + b"no_cache=False", + ) + names = (b"saxpy", b"_Z5saxpyv") + extra_digest = bytes.fromhex("cd" * 32) + + cpp_payloads = ( + ("schema", b"2"), + ("nvrtc", b"13.2"), + ("code_type", b"c++"), + ("target_type", b"cubin"), + ("code", source), + ("option_count", str(len(option_bytes)).encode("ascii")), + *tuple(("option", item) for item in option_bytes), + ("names_count", str(len(names)).encode("ascii")), + *tuple(("name", item) for item in names), + ("options_name", b"saxpy"), + ("extra_digest", extra_digest), + ) + ptx_payloads = ( + ("schema", b"2"), + ("linker", b"nvJitLink-13.2"), + ("code_type", b"ptx"), + ("target_type", b"cubin"), + ("code", ptx), + ("option_count", str(len(option_bytes)).encode("ascii")), + *tuple(("option", item) for item in option_bytes), + ("names_count", b"0"), + ("extra_digest", extra_digest), + ) + + return ( + _file_stream_case("file_stream_key_32b", file_stream_key), + _file_stream_case("file_stream_key_4k", long_file_stream_key), + _program_cache_case("program_cache_cpp", cpp_payloads), + _program_cache_case("program_cache_ptx", ptx_payloads), + _end_to_end_case("end_to_end_cpp", cpp_payloads), + _end_to_end_case("end_to_end_ptx", ptx_payloads), + ) + + +def _benchmark_case( + case: HashCase, + constructor: Callable[..., object], + *, + loops: int, + repeat: int, +) -> tuple[float, float]: + samples_ns: list[float] = [] + for _ in range(repeat): + start = time.perf_counter_ns() + for _ in range(loops): + case.runner(constructor) + elapsed = time.perf_counter_ns() - start + samples_ns.append(elapsed / loops) + return statistics.mean(samples_ns), min(samples_ns) + + +def _format_ns(value: float) -> str: + return f"{value:,.1f}" + + +def _write_line(text: str = "") -> None: + sys.stdout.write(text + "\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--loops", + type=int, + default=200_000, + help="Iterations per repeat for each algorithm/case pair.", + ) + parser.add_argument( + "--repeat", + type=int, + default=7, + help="Independent timing repeats for each algorithm/case pair.", + ) + parser.add_argument( + "--algorithms", + nargs="+", + default=list(_DEFAULT_ALGORITHMS), + help="hashlib algorithm names to benchmark.", + ) + args = parser.parse_args() + + cases = _sample_cases() + widths = { + "algorithm": max(len("Algorithm"), max(len(name) for name in args.algorithms)), + "case": max(len(case.name) for case in cases), + } + + _write_line( + f"{'Algorithm':<{widths['algorithm']}} " + f"{'Case':<{widths['case']}} {'mean ns/op':>12} {'best ns/op':>12}" + ) + _write_line("-" * (widths["algorithm"] + widths["case"] + 28)) + + for algorithm in args.algorithms: + constructor = _make_constructor(algorithm) + for case in cases: + mean_ns, best_ns = _benchmark_case(case, constructor, loops=args.loops, repeat=args.repeat) + _write_line( + f"{algorithm:<{widths['algorithm']}} " + f"{case.name:<{widths['case']}} " + f"{_format_ns(mean_ns):>12} {_format_ns(best_ns):>12}" + ) + + +if __name__ == "__main__": + main() From 70fa564c1b6015f90c2a791bc7940b86d50fd30d Mon Sep 17 00:00:00 2001 From: Aryan Date: Fri, 15 May 2026 22:12:03 -0400 Subject: [PATCH 6/6] Remove benchmark helper script --- scripts/bench_program_cache_hashes.py | 266 -------------------------- 1 file changed, 266 deletions(-) delete mode 100644 scripts/bench_program_cache_hashes.py diff --git a/scripts/bench_program_cache_hashes.py b/scripts/bench_program_cache_hashes.py deleted file mode 100644 index 147a847d90e..00000000000 --- a/scripts/bench_program_cache_hashes.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -"""Benchmark FIPS-available hashlib candidates for cuda.core program-cache use. - -This mirrors the two relevant call sites: - -* ``FileStreamProgramCache._path_for_key()``: hash a cache key to a stable - filename component via ``hexdigest()``. -* ``make_program_cache_key()``: incrementally build the digest from labeled - payload chunks and return ``digest()``. - -This is a review/support tool, not a production dependency. The benchmark is -intentionally stdlib-only so reviewers can run it directly. -""" - -from __future__ import annotations - -import argparse -import hashlib -import inspect -import statistics -import sys -import time -from dataclasses import dataclass -from typing import Callable - -_DEFAULT_ALGORITHMS = ( - "sha1", - "sha224", - "sha256", - "sha384", - "sha512", - "sha512_224", - "sha512_256", - "sha3_224", - "sha3_256", - "sha3_384", - "sha3_512", - "shake_128", - "shake_256", -) - -_SHAKE_DIGEST_BYTES = 32 - - -@dataclass(frozen=True) -class HashCase: - name: str - runner: Callable[[Callable[..., object]], None] - - -def _supports_usedforsecurity(constructor: Callable[..., object]) -> bool: - try: - signature = inspect.signature(constructor) - except (TypeError, ValueError): - return False - return "usedforsecurity" in signature.parameters - - -def _make_constructor(name: str) -> Callable[..., object]: - constructor = getattr(hashlib, name, None) - if constructor is not None: - if _supports_usedforsecurity(constructor): - return lambda data=b"": constructor(data, usedforsecurity=False) - return constructor - - def _constructor(data=b""): - try: - return hashlib.new(name, data, usedforsecurity=False) - except TypeError: - return hashlib.new(name, data) - - return _constructor - - -def _file_stream_case(name: str, key: bytes) -> HashCase: - def _runner(constructor: Callable[..., object]) -> None: - _hex_digest(constructor(key)) - - return HashCase(name, _runner) - - -def _program_cache_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase: - def _runner(constructor: Callable[..., object]) -> None: - hasher = constructor() - for label, payload in payloads: - hasher.update(label.encode("ascii")) - hasher.update(len(payload).to_bytes(8, "big")) - hasher.update(payload) - _digest_bytes(hasher) - - return HashCase(name, _runner) - - -def _end_to_end_case(name: str, payloads: tuple[tuple[str, bytes], ...]) -> HashCase: - def _runner(constructor: Callable[..., object]) -> None: - hasher = constructor() - for label, payload in payloads: - hasher.update(label.encode("ascii")) - hasher.update(len(payload).to_bytes(8, "big")) - hasher.update(payload) - key = _digest_bytes(hasher) - _hex_digest(constructor(key)) - - return HashCase(name, _runner) - - -def _digest_bytes(hasher: object) -> bytes: - try: - return hasher.digest() - except TypeError: - return hasher.digest(_SHAKE_DIGEST_BYTES) - - -def _hex_digest(hasher: object) -> str: - try: - return hasher.hexdigest() - except TypeError: - return hasher.hexdigest(_SHAKE_DIGEST_BYTES) - - -def _sample_cases() -> tuple[HashCase, ...]: - file_stream_key = bytes.fromhex("ab" * 32) - long_file_stream_key = (b"cuda-core-cache-key-" * 128)[:4096] - - source = b""" -extern "C" __global__ void saxpy(float a, const float* x, float* y) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - y[i] = a * x[i] + y[i]; -} -""".strip() - ptx = b""" -.version 8.0 -.target sm_90 -.address_size 64 -.visible .entry saxpy() { ret; } -""".strip() - option_bytes = ( - b"name='saxpy'", - b"arch='sm_90'", - b"max_register_count=None", - b"time=False", - b"link_time_optimization=False", - b"debug=False", - b"lineinfo=False", - b"ftz=None", - b"prec_div=None", - b"prec_sqrt=None", - b"fma=None", - b"split_compile=None", - b"ptxas_options=None", - b"no_cache=False", - ) - names = (b"saxpy", b"_Z5saxpyv") - extra_digest = bytes.fromhex("cd" * 32) - - cpp_payloads = ( - ("schema", b"2"), - ("nvrtc", b"13.2"), - ("code_type", b"c++"), - ("target_type", b"cubin"), - ("code", source), - ("option_count", str(len(option_bytes)).encode("ascii")), - *tuple(("option", item) for item in option_bytes), - ("names_count", str(len(names)).encode("ascii")), - *tuple(("name", item) for item in names), - ("options_name", b"saxpy"), - ("extra_digest", extra_digest), - ) - ptx_payloads = ( - ("schema", b"2"), - ("linker", b"nvJitLink-13.2"), - ("code_type", b"ptx"), - ("target_type", b"cubin"), - ("code", ptx), - ("option_count", str(len(option_bytes)).encode("ascii")), - *tuple(("option", item) for item in option_bytes), - ("names_count", b"0"), - ("extra_digest", extra_digest), - ) - - return ( - _file_stream_case("file_stream_key_32b", file_stream_key), - _file_stream_case("file_stream_key_4k", long_file_stream_key), - _program_cache_case("program_cache_cpp", cpp_payloads), - _program_cache_case("program_cache_ptx", ptx_payloads), - _end_to_end_case("end_to_end_cpp", cpp_payloads), - _end_to_end_case("end_to_end_ptx", ptx_payloads), - ) - - -def _benchmark_case( - case: HashCase, - constructor: Callable[..., object], - *, - loops: int, - repeat: int, -) -> tuple[float, float]: - samples_ns: list[float] = [] - for _ in range(repeat): - start = time.perf_counter_ns() - for _ in range(loops): - case.runner(constructor) - elapsed = time.perf_counter_ns() - start - samples_ns.append(elapsed / loops) - return statistics.mean(samples_ns), min(samples_ns) - - -def _format_ns(value: float) -> str: - return f"{value:,.1f}" - - -def _write_line(text: str = "") -> None: - sys.stdout.write(text + "\n") - - -def main() -> None: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--loops", - type=int, - default=200_000, - help="Iterations per repeat for each algorithm/case pair.", - ) - parser.add_argument( - "--repeat", - type=int, - default=7, - help="Independent timing repeats for each algorithm/case pair.", - ) - parser.add_argument( - "--algorithms", - nargs="+", - default=list(_DEFAULT_ALGORITHMS), - help="hashlib algorithm names to benchmark.", - ) - args = parser.parse_args() - - cases = _sample_cases() - widths = { - "algorithm": max(len("Algorithm"), max(len(name) for name in args.algorithms)), - "case": max(len(case.name) for case in cases), - } - - _write_line( - f"{'Algorithm':<{widths['algorithm']}} " - f"{'Case':<{widths['case']}} {'mean ns/op':>12} {'best ns/op':>12}" - ) - _write_line("-" * (widths["algorithm"] + widths["case"] + 28)) - - for algorithm in args.algorithms: - constructor = _make_constructor(algorithm) - for case in cases: - mean_ns, best_ns = _benchmark_case(case, constructor, loops=args.loops, repeat=args.repeat) - _write_line( - f"{algorithm:<{widths['algorithm']}} " - f"{case.name:<{widths['case']}} " - f"{_format_ns(mean_ns):>12} {_format_ns(best_ns):>12}" - ) - - -if __name__ == "__main__": - main()