diff --git a/bench/00_misc/fltflt_arithmetic.cu b/bench/00_misc/fltflt_arithmetic.cu index 25ad61395..2b5503074 100644 --- a/bench/00_misc/fltflt_arithmetic.cu +++ b/bench/00_misc/fltflt_arithmetic.cu @@ -70,6 +70,72 @@ static void add_gops_per_sec_summary(nvbench::state &state, double ops_per_op = s.set_float64("value", total_ops / seconds / 1e9); } +// Bump a value to the next bit pattern by adding 1 to its integer +// representation. Used to vary a loop input across iterations without +// charging the benchmark for an unrelated fp add on every iteration. +// +// Only `double` and `fltflt` overloads are provided: on most GPUs an +// int64 add is significantly faster than an fp64 add, and the fltflt +// alternative would dispatch through fltflt_add (~20 fp32 ops). For +// float, fp32 add and int32 add run at the same rate, so call sites +// should keep `x = x + small`. +// +// For fltflt, only the hi component is bumped, leaving the pair +// non-canonical -- benches only care that the value differs from the +// previous iteration. The +1 would be UB at the largest-positive bit +// pattern (a NaN); call sites here never reach that. +__device__ __forceinline__ void bump_ulp(double &x) { + x = __longlong_as_double(__double_as_longlong(x) + 1LL); +} +__device__ __forceinline__ void bump_ulp(fltflt &x) { + x.hi = __int_as_float(__float_as_int(x.hi) + 1); +} + +// Compute-bound kernel used solely to spin GPU clocks up to steady +// state. Self-contained (doesn't depend on any of the iterative_* +// kernels below) so it can be called from warmup_gpu_once() before +// they're defined. +__global__ void clock_warmup_kernel(float *out, int N) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float acc = static_cast(idx) * 0.001f; + #pragma unroll 1 + for (int i = 0; i < N; i++) { + acc = acc * 1.0000001f + 1.0e-6f; + } + if (idx == 0) out[0] = acc; +} + +// Idempotent process-level GPU warmup. Only the first call in a +// process actually runs warmup launches; subsequent calls are no-ops. +// Brings GPU clocks to steady state before the first nvbench timing +// window so the *first* benchmark to execute (whichever one that may +// be) does not get charged for clock ramp-up. +// +// inner_iters is sized so each launch runs ~50 ms on Blackwell, and +// four launches give ~200 ms total -- comfortably past the GPU's clock +// ramp window. (FMA-bound work retires near peak fp32 throughput, so +// undersized warmups would otherwise be over in a few ms.) +static void warmup_gpu_once() +{ + static bool warmed = false; + if (warmed) return; + warmed = true; + + constexpr int block_size = 256; + constexpr int grid_size = 1024; + constexpr int inner_iters = 2'000'000; + + float *tmp = nullptr; + MATX_CUDA_CHECK(cudaMalloc(&tmp, sizeof(float))); + for (int w = 0; w < 4; w++) { + clock_warmup_kernel<<>>(tmp, inner_iters); + MATX_CUDA_CHECK_LAST_ERROR(); + } + MATX_CUDA_CHECK(cudaDeviceSynchronize()); + MATX_CUDA_CHECK(cudaFree(tmp)); +} + template __global__ void iterative_add_kernel(T* __restrict__ result, int64_t size, int32_t iterations) { @@ -277,10 +343,15 @@ __global__ void iterative_fma_kernel(T* __restrict__ result, int64_t size, int32 } //============================================================================== -// Addition Benchmark +// Addition Throughput Benchmark +// +// Many independent accumulators (ILP_FACTOR=8) and outer-loop unrolling +// expose maximum instruction-level parallelism. Latency-hiding fully covers +// per-call dependency chains, so this measures *throughput*: ops/sec when +// the warp scheduler always has independent work in flight. //============================================================================== template -void fltflt_bench_add(nvbench::state &state, nvbench::type_list) +void fltflt_bench_add_throughput(nvbench::state &state, nvbench::type_list) { const index_t size = static_cast(state.get_int64("Array Size")); const int32_t iterations = static_cast(state.get_int64("Iterations")); @@ -296,6 +367,7 @@ void fltflt_bench_add(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); // Benchmark execution @@ -306,7 +378,7 @@ void fltflt_bench_add(nvbench::state &state, nvbench::type_list) add_gops_per_sec_summary(state); } -NVBENCH_BENCH_TYPES(fltflt_bench_add, NVBENCH_TYPE_AXES(precision_types)) +NVBENCH_BENCH_TYPES(fltflt_bench_add_throughput, NVBENCH_TYPE_AXES(precision_types)) .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1)) .add_int64_axis("Iterations", {250}); @@ -328,6 +400,7 @@ void fltflt_bench_sub(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -359,6 +432,7 @@ void fltflt_bench_mul(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -390,6 +464,7 @@ void fltflt_bench_div(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -421,6 +496,7 @@ void fltflt_bench_sqrt(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -492,6 +568,7 @@ void fltflt_bench_sqrt_fast(nvbench::state &state, nvbench::type_list((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -574,6 +651,7 @@ void fltflt_bench_norm3d(nvbench::state &state, nvbench::type_list((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -606,6 +684,7 @@ void fltflt_bench_abs(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -637,6 +716,7 @@ void fltflt_bench_fma(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -704,6 +784,7 @@ void fltflt_bench_madd(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -778,6 +859,7 @@ void fltflt_bench_round(nvbench::state &state, nvbench::type_list constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -829,10 +911,13 @@ __global__ void iterative_fmod_kernel(T* __restrict__ result, int64_t size, int3 asm volatile("" : "+d"(val[ilp])); } } - if constexpr (std::is_same_v) { - init_val = init_val + 2048.0f; + if constexpr (std::is_same_v) { + // fp32 add is full-rate, no benefit from a bit-twiddle here. + init_val += 2048.0f; } else { - init_val += static_cast(2048.0f); + // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on + // every iteration just to defeat hoisting of the fmod call. + bump_ulp(init_val); } } @@ -860,6 +945,7 @@ void fltflt_bench_fmod(nvbench::state &state, nvbench::type_list) constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -910,7 +996,14 @@ __global__ void iterative_trunc_kernel(T* __restrict__ result, int64_t size, int asm volatile("" : "+d"(val[ilp])); } } - init_val = init_val + static_cast(2048.0f); + if constexpr (std::is_same_v) { + // fp32 add is full-rate, no benefit from a bit-twiddle here. + init_val += 2048.0f; + } else { + // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on + // every iteration just to defeat hoisting of the trunc call. + bump_ulp(init_val); + } } T result_val = val[0]; @@ -937,6 +1030,7 @@ void fltflt_bench_trunc(nvbench::state &state, nvbench::type_list constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -987,7 +1081,14 @@ __global__ void iterative_floor_kernel(T* __restrict__ result, int64_t size, int asm volatile("" : "+d"(val[ilp])); } } - init_val = init_val + static_cast(2048.0f); + if constexpr (std::is_same_v) { + // fp32 add is full-rate, no benefit from a bit-twiddle here. + init_val += 2048.0f; + } else { + // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on + // every iteration just to defeat hoisting of the floor call. + bump_ulp(init_val); + } } T result_val = val[0]; @@ -1014,6 +1115,7 @@ void fltflt_bench_floor(nvbench::state &state, nvbench::type_list constexpr int block_size = 256; int grid_size = static_cast((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -1050,7 +1152,15 @@ __global__ void iterative_cast2dbl_kernel(double* __restrict__ result, int64_t s acc[ilp] = static_cast(src_val); asm volatile("" : "+d"(acc[ilp])); } - src_val = src_val + static_cast(0.0001); + if constexpr (std::is_same_v) { + // fp32 add is full-rate, no benefit from a bit-twiddle here. + src_val = src_val + static_cast(0.0001); + } else { + // Vary src_val via bit-pattern bump -- keeps the cast2dbl cost + // un-contaminated by an unrelated fp64 add or fltflt_add per + // iteration. + bump_ulp(src_val); + } } double result_val = acc[0]; @@ -1077,6 +1187,7 @@ void fltflt_bench_cast2dbl(nvbench::state &state, nvbench::type_list((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -1113,12 +1224,14 @@ __global__ void iterative_cast2fltflt_kernel(fltflt* __restrict__ result, int64_ acc[ilp] = static_cast(src_val); asm volatile("" : "+f"(acc[ilp].hi), "+f"(acc[ilp].lo)); } - // For double, increment the bit pattern to get the next representable value - // so the loop anti-aliasing doesn't introduce a double-precision add. - if constexpr (cuda::std::is_same_v) { - src_val = __longlong_as_double(__double_as_longlong(src_val) + 1LL); - } else { + if constexpr (std::is_same_v) { + // fp32 add is full-rate, no benefit from a bit-twiddle here. src_val = src_val + static_cast(0.0001); + } else { + // Vary src_val via bit-pattern bump -- keeps the cast2fltflt + // cost un-contaminated by an unrelated fp64 add or fltflt_add + // per iteration. + bump_ulp(src_val); } } @@ -1146,6 +1259,7 @@ void fltflt_bench_cast2fltflt(nvbench::state &state, nvbench::type_list((size + block_size - 1) / block_size); + warmup_gpu_once(); exec.sync(); state.exec([&](nvbench::launch &launch) { @@ -1158,3 +1272,66 @@ void fltflt_bench_cast2fltflt(nvbench::state &state, nvbench::type_listthroughput transition: at Blocks=1 only one warp runs on one SM, +// fully exposing the chain, while at Blocks=1024 the scheduler has many +// warps in flight and latency is partially hidden. +//============================================================================== +template +__global__ void chain_add_kernel(int N, PrecisionType *__restrict__ out) +{ + // Construct via float so the same expression compiles for float, double, + // and fltflt (each has a constructor accepting a float). + PrecisionType acc{1.0f}; +#pragma unroll 1 + for (int i = 0; i < N; i++) { + // step varies per iteration to defeat loop-invariant hoisting and force + // a true data dependency on the running accumulator. + const PrecisionType step{static_cast(i + 1)}; + acc = acc + step; // dispatches to PrecisionType's operator+ + } + out[blockIdx.x * blockDim.x + threadIdx.x] = acc; +} + +template +void fltflt_bench_add_latency(nvbench::state &state, nvbench::type_list) +{ + const int chain_len = static_cast(state.get_int64("Chain Length")); + const int blocks = static_cast(state.get_int64("Blocks")); + constexpr int threads = 32; // exactly one warp per block + + cudaExecutor exec{0}; + const size_t total_threads = static_cast(blocks) * threads; + auto result = make_tensor({static_cast(total_threads)}); + + state.add_element_count(static_cast(chain_len) * total_threads, "ops"); + + warmup_gpu_once(); + exec.sync(); + + state.exec([&](nvbench::launch &launch) { + chain_add_kernel + <<>>( + chain_len, result.Data()); + }); +} + +NVBENCH_BENCH_TYPES(fltflt_bench_add_latency, NVBENCH_TYPE_AXES(precision_types)) + .add_int64_axis("Chain Length", {1024, 4096, 16384}) + // Blocks=1 : 1 warp on 1 SM, all other SMs idle -- latency fully exposed + // Blocks=4 : 4 warps on 4 SMs, each SM has 1 warp -- still latency-bound + // Blocks=160 : ~1 block per SM on a ~160-SM device -- partial latency hiding + // Blocks=1024: many blocks per SM -- throughput-bound, latency hides + .add_int64_axis("Blocks", {1, 4, 160, 1024}); diff --git a/bench/scripts/run_benchmarks.py b/bench/scripts/run_benchmarks.py new file mode 100755 index 000000000..d945e716d --- /dev/null +++ b/bench/scripts/run_benchmarks.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 + +# BSD 3-Clause License +# +# Copyright (c) 2026, NVIDIA Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +Run a profile of MatX benchmarks via nvbench and emit per-profile summaries. + +Each profile defines: + - the per-source CMake bench executable to invoke, + - the list of nvbench benchmark names to run, + - an optional summary handler that walks the nvbench JSON output and + prints a domain-specific table (e.g. fp32/fp64/fltflt slowdown ratios + for fltflt, Gproj/s for sarbp). + +Output: + - bench_results/.json (raw nvbench JSON, the source of truth) + - bench_results/.md (rendered nvbench markdown table) + - bench_results/.csv (raw nvbench CSV) + - stdout (the profile's domain-specific summary) + +Examples: + python run_benchmarks.py # run every profile + python run_benchmarks.py --profile fltflt # run a single profile + python run_benchmarks.py --profile sarbp -- --profile # forward --profile to nvbench + +The script does not parse markdown -- it reads nvbench's JSON output, which +is part of nvbench's stable contract. +""" + +import argparse +import json +import subprocess +import sys +from pathlib import Path + + +# --------------------------------------------------------------------------- +# nvbench JSON helpers -- schema: benchmarks[].states[].summaries[].data[].value +# --------------------------------------------------------------------------- + +def summary_float(state, tag, default=None): + """Pull a float-valued summary tag out of one nvbench state.""" + for s in state.get("summaries", []): + if s.get("tag") != tag: + continue + for d in s.get("data", []): + if d.get("name") == "value": + try: + return float(d.get("value")) + except (TypeError, ValueError): + return default + return default + + +def axis_value(state, name): + """Pull a per-state axis value (int or string).""" + for a in state.get("axis_values", []): + if a.get("name") == name: + v = a.get("value") + try: + return int(v) + except (TypeError, ValueError): + return v + return None + + +def states_for_benchmark(data, bench_name): + """Yield all states for a given benchmark name from a parsed JSON file.""" + for b in data.get("benchmarks", []): + if b.get("name") == bench_name: + yield from b.get("states", []) + + +GPU_TIME_TAG = "nv/cold/time/gpu/mean" # seconds + + +def fmt_time(seconds): + """Format seconds in auto-scaled units.""" + if seconds is None: + return "N/A" + if seconds < 1e-6: + return f"{seconds * 1e9:.3f} ns" + if seconds < 1e-3: + return f"{seconds * 1e6:.3f} us" + if seconds < 1.0: + return f"{seconds * 1e3:.3f} ms" + return f"{seconds:.3f} s" + + +# --------------------------------------------------------------------------- +# fltflt summary: per-op fp32 / fp64 / fltflt slowdown ratios. +# --------------------------------------------------------------------------- + +# Map from nvbench type-axis "input_string" to a friendly precision label. +_PRECISION_FROM_AXIS = { + "F32": "float", + "F64": "double", + "matx::fltflt": "fltflt", +} + + +def _fltflt_pick_time(states, *, prefer=None): + """For a list of states (one per type), return {precision: gpu_time_seconds}. + + `prefer`, if given, is a dict of axis_name -> value that selects the row + used when the bench has additional axes (e.g. {Blocks: 1, Chain Length: + 16384} for the latency bench). Without `prefer`, the first matching row + is taken (typical for benches with only the type axis). + """ + out = {} + for st in states: + type_str = axis_value(st, "T") + prec = _PRECISION_FROM_AXIS.get(type_str) + if prec is None or prec in out: + continue + if prefer is not None: + ok = True + for k, v in prefer.items(): + if axis_value(st, k) != v: + ok = False + break + if not ok: + continue + t = summary_float(st, GPU_TIME_TAG) + if t is not None: + out[prec] = t + return out + + +def summarize_fltflt(json_path): + """Walk nvbench JSON for the fltflt profile and emit a slowdown table.""" + with open(json_path) as f: + data = json.load(f) + + # (display name, nvbench bench name, optional axis filter for picking a row). + rows = [ + ("add_throughput", "fltflt_bench_add_throughput", None), + ("add_latency", "fltflt_bench_add_latency", + {"Blocks": 1, "Chain Length": 16384}), + ("sub", "fltflt_bench_sub", None), + ("mul", "fltflt_bench_mul", None), + ("div", "fltflt_bench_div", None), + ("sqrt", "fltflt_bench_sqrt", None), + ("sqrt_fast", "fltflt_bench_sqrt_fast", None), + ("norm3d", "fltflt_bench_norm3d", None), + ("abs", "fltflt_bench_abs", None), + ("fma", "fltflt_bench_fma", None), + ("madd", "fltflt_bench_madd", None), + ("round", "fltflt_bench_round", None), + ("trunc", "fltflt_bench_trunc", None), + ("floor", "fltflt_bench_floor", None), + ("fmod", "fltflt_bench_fmod", None), + ("cast2dbl", "fltflt_bench_cast2dbl", None), + ("cast2fltflt", "fltflt_bench_cast2fltflt", None), + ] + + print() + print("=" * 86) + print("FLTFLT BENCHMARK SUMMARY") + print("=" * 86) + print("float / double / fltflt: slowdown vs float (lower is better, float = 1.0x).") + print("dbl/fltflt: speedup of fltflt over double (higher is better).") + print() + print(f"{'Benchmark':<18}{'float':>12}{'double':>12}{'fltflt':>12}{'dbl/fltflt':>14}") + print("-" * 86) + + for label, bench, prefer in rows: + states = list(states_for_benchmark(data, bench)) + if not states: + continue + t = _fltflt_pick_time(states, prefer=prefer) + f = t.get("float") + d = t.get("double") + ff = t.get("fltflt") + if f is None: + continue + + def slow(x): + return f"{x / f:.2f}x" if x is not None else "N/A" + + # Speedup framing: bigger means fltflt wins more vs double. + speedup_vs_dbl = ( + f"{d / ff:.2f}x" if (d is not None and ff is not None and ff > 0) else "N/A" + ) + print(f"{label:<18}{slow(f):>12}{slow(d):>12}{slow(ff):>12}{speedup_vs_dbl:>14}") + + print() + print("Raw GPU times (cold mean):") + print(f"{'Benchmark':<18}{'float':>14}{'double':>14}{'fltflt':>14}") + print("-" * 86) + for label, bench, prefer in rows: + states = list(states_for_benchmark(data, bench)) + if not states: + continue + t = _fltflt_pick_time(states, prefer=prefer) + print(f"{label:<18}{fmt_time(t.get('float')):>14}" + f"{fmt_time(t.get('double')):>14}{fmt_time(t.get('fltflt')):>14}") + print("=" * 86) + + +# --------------------------------------------------------------------------- +# sarbp summary: Gproj/s = problem_size**3 / time, plus relative table. +# --------------------------------------------------------------------------- + +def _sarbp_results(data): + """Return {variant: {problem_size: gpu_time_seconds}}.""" + out = {} + for variant in ("float", "double", "mixed", "fltflt"): + bench_name = f"sarbp_{variant}" + per_size = {} + for st in states_for_benchmark(data, bench_name): + ps = axis_value(st, "Problem Size") + t = summary_float(st, GPU_TIME_TAG) + if ps is not None and t is not None: + per_size[ps] = t + if per_size: + out[variant] = per_size + return out + + +def _gproj_per_sec(problem_size, time_s): + return (problem_size ** 3) / 1e9 / time_s + + +def summarize_sarbp(json_path): + with open(json_path) as f: + data = json.load(f) + + results = _sarbp_results(data) + if not results: + print("(sarbp: no results)") + return + + variants = [v for v in ("float", "double", "mixed", "fltflt") if v in results] + all_sizes = sorted({s for r in results.values() for s in r}) + + print() + print("=" * 90) + print("SAR BACKPROJECTION BENCHMARK SUMMARY") + print("=" * 90) + print("Gigabackprojections per second (Gproj/s) -- operations = problem_size^3.") + print() + + print(f"{'Problem Size':<14}" + "".join(f"{v:>14}" for v in variants)) + print("-" * 90) + for ps in all_sizes: + row = f"{ps:<14}" + for v in variants: + t = results[v].get(ps) + row += f"{_gproj_per_sec(ps, t):>14.3f}" if t else f"{'N/A':>14}" + print(row) + + if "float" in results: + print() + print("Relative throughput (float = 1.0x):") + print(f"{'Problem Size':<14}" + "".join(f"{v:>14}" for v in variants)) + print("-" * 90) + for ps in all_sizes: + row = f"{ps:<14}" + f_t = results["float"].get(ps) + f_g = _gproj_per_sec(ps, f_t) if f_t else None + for v in variants: + t = results[v].get(ps) + if t is None or f_g is None: + row += f"{'N/A':>14}" + else: + row += f"{_gproj_per_sec(ps, t) / f_g:>13.3f}x" + print(row) + print("=" * 90) + + +# --------------------------------------------------------------------------- +# Profile registry -- one entry per logical bench family. +# --------------------------------------------------------------------------- + +PROFILES = { + "fltflt": { + "exe_stems": ["bench_00_misc_fltflt_arithmetic", "matx_bench"], + "benchmarks": [ + "fltflt_bench_add_throughput", + "fltflt_bench_add_latency", + "fltflt_bench_sub", + "fltflt_bench_mul", + "fltflt_bench_div", + "fltflt_bench_sqrt", + "fltflt_bench_sqrt_fast", + "fltflt_bench_norm3d", + "fltflt_bench_abs", + "fltflt_bench_fma", + "fltflt_bench_madd", + "fltflt_bench_round", + "fltflt_bench_trunc", + "fltflt_bench_floor", + "fltflt_bench_fmod", + "fltflt_bench_cast2dbl", + "fltflt_bench_cast2fltflt", + ], + "summary": summarize_fltflt, + # Inherited from the deleted run_fltflt_benchmarks.py. + "timeout_seconds": 300, + }, + "sarbp": { + "exe_stems": ["bench_00_transform_sarbp", "matx_bench"], + "benchmarks": ["sarbp_float", "sarbp_double", "sarbp_mixed", "sarbp_fltflt"], + "summary": summarize_sarbp, + # Inherited from the deleted run_sarbp_benchmarks.py; sarbp at the + # default Problem Size runs longer than the fltflt sweep. + "timeout_seconds": 600, + }, +} + +# Default timeout for any future profile that doesn't set its own. +DEFAULT_TIMEOUT_SECONDS = 600 + + +# --------------------------------------------------------------------------- +# Build-dir / executable resolution. +# --------------------------------------------------------------------------- + +def _resolve_exe(build_dir, stems): + bench_dir = build_dir / "bench" + for stem in stems: + for cand in (bench_dir / stem, bench_dir / f"{stem}.exe"): + if cand.is_file(): + return cand + return None + + +def find_default_build_dir(): + cwd = Path.cwd() + if (cwd / "bench").is_dir() and any( + p.name.startswith("bench_") for p in (cwd / "bench").iterdir() if p.is_file() + ): + return cwd + script_dir = Path(__file__).resolve().parent + for candidate in (script_dir / "../../build", script_dir / "../../../build"): + c = candidate.resolve() + if c.exists() and (c / "bench").is_dir(): + return c + return cwd # let _resolve_exe produce a clear error + + +# --------------------------------------------------------------------------- +# Per-profile orchestration. +# --------------------------------------------------------------------------- + +def run_profile(name, profile, build_dir, out_dir, extra_args): + exe = _resolve_exe(build_dir, profile["exe_stems"]) + if exe is None: + print(f"[{name}] could not find any of {profile['exe_stems']} under {build_dir}/bench/", + file=sys.stderr) + return False + + json_path = out_dir / f"{name}.json" + md_path = out_dir / f"{name}.md" + csv_path = out_dir / f"{name}.csv" + + cmd = [str(exe)] + for b in profile["benchmarks"]: + cmd += ["--benchmark", b] + cmd += [ + "--json", str(json_path), + "--md", str(md_path), + "--csv", str(csv_path), + ] + cmd += extra_args + timeout = profile.get("timeout_seconds", DEFAULT_TIMEOUT_SECONDS) + print(f"[{name}] {' '.join(cmd)} (timeout {timeout}s)") + try: + res = subprocess.run(cmd, timeout=timeout) + except subprocess.TimeoutExpired: + print(f"[{name}] nvbench exceeded timeout of {timeout}s", file=sys.stderr) + return False + if res.returncode != 0: + print(f"[{name}] nvbench exited with status {res.returncode}", file=sys.stderr) + return False + + if profile.get("summary"): + try: + profile["summary"](json_path) + except Exception as e: + print(f"[{name}] summary failed: {e}", file=sys.stderr) + return False + return True + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--build-dir", type=Path, default=None, + help="Build directory containing bench/. " + "Defaults to the current directory if it has bench_*; otherwise /build.", + ) + parser.add_argument( + "--profile", + choices=sorted(PROFILES) + ["all"], + default="all", + help="Which profile to run (default: all).", + ) + parser.add_argument( + "--out-dir", type=Path, default=Path("bench_results"), + help="Directory for nvbench JSON/MD/CSV output (default: ./bench_results).", + ) + parser.add_argument( + "nvbench_args", nargs=argparse.REMAINDER, + help="Extra args forwarded verbatim to the nvbench executable. " + "Use `--` to separate them from this script's flags.", + ) + args = parser.parse_args() + extra = [a for a in args.nvbench_args if a != "--"] + + build_dir = args.build_dir if args.build_dir else find_default_build_dir() + args.out_dir.mkdir(parents=True, exist_ok=True) + + profiles = sorted(PROFILES) if args.profile == "all" else [args.profile] + failures = 0 + for name in profiles: + ok = run_profile(name, PROFILES[name], build_dir, args.out_dir, extra) + if not ok: + failures += 1 + + print(f"\nDone. {len(profiles) - failures}/{len(profiles)} profile(s) succeeded; " + f"output under {args.out_dir}/") + sys.exit(1 if failures else 0) + + +if __name__ == "__main__": + main() diff --git a/bench/scripts/run_fltflt_benchmarks.py b/bench/scripts/run_fltflt_benchmarks.py deleted file mode 100755 index 6e0d7c44c..000000000 --- a/bench/scripts/run_fltflt_benchmarks.py +++ /dev/null @@ -1,512 +0,0 @@ -#!/usr/bin/env python3 - -# BSD 3-Clause License -# -# Copyright (c) 2026, NVIDIA Corporation -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" -Run fltflt arithmetic benchmarks and summarize results. -Shows performance relative to single-precision (float = 1.0x baseline). -""" - -import subprocess -import re -import sys -import argparse -from pathlib import Path -from collections import defaultdict - -# Regex to strip ANSI escape codes from nvbench colored output -ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*[mK]') - - -def strip_ansi(text): - """Remove ANSI escape codes from a string.""" - return ANSI_ESCAPE.sub('', text) - - -def _resolve_bench_executable(build_dir, stem): - """Return path to bench/ or bench/.exe if present.""" - bench_dir = build_dir / "bench" - for name in (stem, f"{stem}.exe"): - path = bench_dir / name - if path.is_file(): - return path - return None - - -def find_benchmark_executable(build_dir): - """Find the fltflt benchmark executable (per-source CMake target).""" - stem = "bench_00_misc_fltflt_arithmetic" - benchmark_path = _resolve_bench_executable(build_dir, stem) - - if benchmark_path is not None: - return benchmark_path - - print( - "Error: Could not find benchmark executable " - f"bench/{stem} under {build_dir}" - ) - return None - - -def build_dir_contains_benchmark_exes(build_dir): - """True if build_dir/bench contains at least one bench_* executable.""" - bench_dir = build_dir / "bench" - if not bench_dir.is_dir(): - return False - return any( - p.is_file() and p.name.startswith("bench_") for p in bench_dir.iterdir() - ) - - -def run_benchmark(executable_path, benchmark_name, verbose=False): - """Run a specific benchmark and capture output.""" - print(f"Running benchmark: {benchmark_name}") - - try: - result = subprocess.run( - [str(executable_path), "--benchmark", benchmark_name], - capture_output=True, - text=True, - timeout=300 # 5 minute timeout - ) - - if result.returncode != 0: - print(f" Warning: Benchmark failed with return code {result.returncode}") - print(f" stderr: {result.stderr}") - return None - - if verbose: - print(f" Raw output:\n{result.stdout}") - - return result.stdout - except subprocess.TimeoutExpired: - print(f" Benchmark timed out after 5 minutes") - return None - except Exception as e: - print(f" Error running benchmark: {e}") - return None - - -def parse_time_value(time_str): - """Parse time string like '668.707 us' or '6.785 ms' and convert to milliseconds.""" - time_str = strip_ansi(time_str).strip() - - # Match number and unit - match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str) - if not match: - return None - - value = float(match.group(1)) - unit = match.group(2) - - # Convert to milliseconds - if unit == 'us': - return value / 1000.0 - elif unit == 'ms': - return value - elif unit == 'ns': - return value / 1_000_000.0 - elif unit == 's': - return value * 1000.0 - else: - return value - - -def parse_benchmark_output(output, verbose=False): - """ - Parse the table format output from nvbench. - - Expected format: - | T | Array Size | ... | GPU Time | ... - |--------------|-----------------|-----|------------|----- - | F32 | ... | 668.707 us | ... - | F64 | ... | 47.650 ms | ... - | matx::fltflt | ... | 6.785 ms | ... - """ - results = {} - # Strip ANSI codes from the entire output before line-by-line processing - output = strip_ansi(output) - lines = output.strip().split('\n') - - # Find the header line to locate GPU Time column - gpu_time_col_idx = None - for i, line in enumerate(lines): - if '|' in line and 'GPU Time' in line: - # Split by | and find GPU Time column index - cols = [col.strip() for col in line.split('|')] - for j, col in enumerate(cols): - if col == 'GPU Time': - gpu_time_col_idx = j - break - if gpu_time_col_idx is not None: - if verbose: - print(f" Found GPU Time at column index {gpu_time_col_idx} in: {line.rstrip()}") - break - - if gpu_time_col_idx is None: - print(" Warning: Could not find GPU Time column in output") - return results - - # Parse data rows - for line in lines: - if '|' not in line: - continue - - # Skip header and separator lines: - # - any line containing 'GPU Time' is a column header - # - any line with '---' is a separator/divider row - # - lines where the type column (stripped) is exactly 'T' are header rows - # (nvbench labels the type axis column as 'T') - cols_raw = line.split('|') - if len(cols_raw) < 3: - continue - - type_col_raw = cols_raw[1] # unstripped, between first two '|' - if 'GPU Time' in line or '---' in line or type_col_raw.strip() == 'T': - continue - - cols = [col.strip() for col in cols_raw] - - if len(cols) <= gpu_time_col_idx: - continue - - # Get type column (first data column after the leading empty string) - type_col = cols[1] - - if not type_col: - continue - - # Map type names (nvbench aliases float->F32, double->F64) - if 'F32' in type_col: - precision = 'float' - elif 'F64' in type_col: - precision = 'double' - elif 'fltflt' in type_col: - precision = 'fltflt' - else: - continue - - # Extract GPU time - gpu_time_str = cols[gpu_time_col_idx] - gpu_time_ms = parse_time_value(gpu_time_str) - - if gpu_time_ms is not None: - if verbose: - print(f" Parsed: type={precision}, gpu_time_col={gpu_time_str!r}, value={gpu_time_ms:.6f} ms") - results[precision] = gpu_time_ms - elif verbose: - print(f" Warning: Could not parse GPU time from col {gpu_time_col_idx}: {gpu_time_str!r}") - - return results - - -def parse_benchmark_output_no_type(output, verbose=False): - """ - Parse nvbench output for benchmarks without a type axis (fltflt-only). - Returns a dict with a single 'fltflt' key. - """ - results = {} - output = strip_ansi(output) - lines = output.strip().split('\n') - - gpu_time_col_idx = None - for line in lines: - if '|' in line and 'GPU Time' in line: - cols = [col.strip() for col in line.split('|')] - for j, col in enumerate(cols): - if col == 'GPU Time': - gpu_time_col_idx = j - break - if gpu_time_col_idx is not None: - if verbose: - print(f" Found GPU Time at column index {gpu_time_col_idx} in: {line.rstrip()}") - break - - if gpu_time_col_idx is None: - print(" Warning: Could not find GPU Time column in output") - return results - - for line in lines: - if '|' not in line or 'GPU Time' in line or '---' in line: - continue - cols = [col.strip() for col in line.split('|')] - if len(cols) <= gpu_time_col_idx: - continue - gpu_time_str = cols[gpu_time_col_idx] - gpu_time_ms = parse_time_value(gpu_time_str) - if gpu_time_ms is not None: - if verbose: - print(f" Parsed: type=fltflt, gpu_time_col={gpu_time_str!r}, value={gpu_time_ms:.6f} ms") - results['fltflt'] = gpu_time_ms - break - - return results - - -def format_time(time_ms): - """Format a time in ms with appropriate precision and units.""" - if time_ms is None: - return "N/A" - if time_ms < 0.001: - return f"{time_ms * 1e6:.3f} ns" - elif time_ms < 1.0: - return f"{time_ms * 1000.0:.3f} us" - else: - return f"{time_ms:.3f} ms" - - -def calculate_relative_performance(results): - """ - Calculate performance relative to float (single-precision). - float = 1.0x (baseline) - Higher values mean slower (took more time relative to float) - """ - relative = {} - - for bench_name, timings in results.items(): - if 'float' not in timings: - print(f"Warning: No float baseline for {bench_name}, skipping") - continue - - float_time = timings['float'] - relative[bench_name] = {} - - for precision, time_value in timings.items(): - # Relative slowdown: how many times slower than float - relative[bench_name][precision] = time_value / float_time - - return relative - - -def print_summary(results, relative): - """Print a formatted summary table.""" - print("\n") - print("=" * 80) - print("FLTFLT BENCHMARK SUMMARY") - print("=" * 80) - print() - print("Performance relative to single-precision (float = 1.0x baseline)") - print("Higher values indicate slower performance") - print() - - # Print header - print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}") - print("-" * 66) - - # Order benchmarks - use the canonical order but only show benchmarks that were actually run - bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt'] - # Filter to only benchmarks present in results - bench_order = [b for b in bench_order if b in results] - - for bench in bench_order: - if bench not in relative: - continue - - rel = relative[bench] - timings = results[bench] - - # Get values with defaults - float_rel = rel.get('float', 1.0) - double_rel = rel.get('double', None) - fltflt_rel = rel.get('fltflt', None) - - # Calculate fltflt speedup vs double (double_time / fltflt_time) - fltflt_vs_double = None - if 'double' in timings and 'fltflt' in timings: - fltflt_vs_double = timings['double'] / timings['fltflt'] - - # Format output - float_str = f"{float_rel:.2f}x" - double_str = f"{double_rel:.2f}x" if double_rel is not None else "N/A" - fltflt_str = f"{fltflt_rel:.2f}x" if fltflt_rel is not None else "N/A" - speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A" - - print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}") - - print() - print("-" * 80) - print("Raw timings (auto-scaled units):") - print() - print(f"{'Benchmark':<15} {'float':<15} {'double':<15} {'fltflt':<15} {'fltflt vs dbl':<15}") - print("-" * 75) - - for bench in bench_order: - timings = results[bench] - - float_time = timings.get('float', None) - double_time = timings.get('double', None) - fltflt_time = timings.get('fltflt', None) - - # Calculate fltflt speedup vs double - fltflt_vs_double = None - if double_time is not None and fltflt_time is not None: - fltflt_vs_double = double_time / fltflt_time - - float_str = format_time(float_time) - double_str = format_time(double_time) - fltflt_str = format_time(fltflt_time) - speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A" - - print(f"{bench:<15} {float_str:<15} {double_str:<15} {fltflt_str:<15} {speedup_str:<15}") - - print("=" * 80) - - -def main(): - parser = argparse.ArgumentParser( - description="Run fltflt arithmetic benchmarks and summarize results." - ) - parser.add_argument( - "--build-dir", - type=Path, - default=None, - help="Path to the MatX build directory containing bench/bench_00_misc_fltflt_arithmetic. " - "If not specified, common locations are searched automatically.", - ) - parser.add_argument( - "--verbose", "-v", - action="store_true", - help="Print verbose output including raw benchmark output and parsed values.", - ) - parser.add_argument( - "--benchmarks", - nargs="+", - default=None, - metavar="BENCH", - help="Run only specific benchmarks (e.g. add sub mul). " - "Defaults to all benchmarks.", - ) - args = parser.parse_args() - - # Find MatX build directory - if args.build_dir is not None: - build_dir = args.build_dir - if not build_dir.exists(): - print(f"Error: Specified build directory does not exist: {build_dir}") - sys.exit(1) - else: - script_dir = Path(__file__).parent - - # Check if the current working directory looks like a valid build directory - # (i.e. it already contains bench/bench_* executables). This lets users run the script - # from any build directory without needing --build-dir. - cwd = Path.cwd() - if build_dir_contains_benchmark_exes(cwd): - build_dir = cwd - else: - # Fall back to searching common locations relative to the script - possible_build_dirs = [ - script_dir / "build", - script_dir / "repos" / "MatX" / "build", - script_dir / "../build", - script_dir / "../../build", - ] - - build_dir = None - for bd in possible_build_dirs: - bd_resolved = bd.resolve() - if bd_resolved.exists() and build_dir_contains_benchmark_exes(bd_resolved): - build_dir = bd_resolved - break - - if build_dir is None: - print("Error: Could not find MatX build directory") - print("Try running from a build directory, or use --build-dir to specify one") - sys.exit(1) - - print(f"Using build directory: {build_dir}") - - # Find benchmark executable - benchmark_exe = find_benchmark_executable(build_dir) - - if benchmark_exe is None: - sys.exit(1) - - print(f"Found benchmark: {benchmark_exe}") - print() - - # List of benchmarks to run - all_benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt'] - # Benchmarks that only have a fltflt variant (no float/double type axis) - fltflt_only_benchmarks = set() - benchmarks = args.benchmarks if args.benchmarks is not None else all_benchmarks - - # Validate user-provided benchmarks - if args.benchmarks is not None: - invalid_benchmarks = [b for b in args.benchmarks if b not in all_benchmarks] - if invalid_benchmarks: - print(f"Error: Unknown benchmark(s): {', '.join(invalid_benchmarks)}") - print(f"Valid benchmarks are: {', '.join(all_benchmarks)}") - sys.exit(1) - - all_results = {} - - # Run each benchmark - for bench in benchmarks: - bench_name = f"fltflt_bench_{bench}" - print(f"\n{'=' * 80}") - output = run_benchmark(benchmark_exe, bench_name, verbose=args.verbose) - - if output is None: - print(f" Skipping {bench} due to error") - continue - - # Parse results - if bench in fltflt_only_benchmarks: - results = parse_benchmark_output_no_type(output, verbose=args.verbose) - else: - results = parse_benchmark_output(output, verbose=args.verbose) - - if not results: - print(f" Warning: Could not parse results for {bench}") - print(" Raw output:") - print(output) - continue - - all_results[bench] = results - parsed_parts = [f"{k}={format_time(v)}" for k, v in results.items()] - print(f" Parsed: {', '.join(parsed_parts)}") - - print(f"\n{'=' * 80}") - - if not all_results: - print("\nError: No benchmark results collected") - sys.exit(1) - - print(f"\nSuccessfully collected results for {len(all_results)} benchmarks") - - # Calculate relative performance - relative = calculate_relative_performance(all_results) - - # Print summary - print_summary(all_results, relative) - -if __name__ == "__main__": - main() diff --git a/bench/scripts/run_sarbp_benchmarks.py b/bench/scripts/run_sarbp_benchmarks.py deleted file mode 100755 index a13d49f8e..000000000 --- a/bench/scripts/run_sarbp_benchmarks.py +++ /dev/null @@ -1,466 +0,0 @@ -#!/usr/bin/env python3 - -# BSD 3-Clause License -# -# Copyright (c) 2026, NVIDIA Corporation -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" -Run SAR backprojection benchmarks and summarize results. -Computes gigabackprojections per second (Gproj/s) for each benchmark variant. -""" - -import subprocess -import re -import sys -import argparse -from pathlib import Path - -# Regex to strip ANSI escape codes from nvbench colored output -ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*[mK]') - - -def strip_ansi(text): - """Remove ANSI escape codes from a string.""" - return ANSI_ESCAPE.sub('', text) - - -def _resolve_bench_executable(build_dir, stem): - """Return path to bench/ or bench/.exe if present.""" - bench_dir = build_dir / "bench" - for name in (stem, f"{stem}.exe"): - path = bench_dir / name - if path.is_file(): - return path - return None - - -def find_benchmark_executable(build_dir): - """Find the SAR BP benchmark executable (per-source CMake target).""" - stem = "bench_00_transform_sarbp" - benchmark_path = _resolve_bench_executable(build_dir, stem) - - if benchmark_path is not None: - return benchmark_path - - print( - "Error: Could not find benchmark executable " - f"bench/{stem} under {build_dir}" - ) - return None - - -def build_dir_contains_benchmark_exes(build_dir): - """True if build_dir/bench contains at least one bench_* executable.""" - bench_dir = build_dir / "bench" - if not bench_dir.is_dir(): - return False - return any( - p.is_file() and p.name.startswith("bench_") for p in bench_dir.iterdir() - ) - - -def run_benchmark(executable_path, benchmark_name, verbose=False): - """Run a specific benchmark and capture output.""" - print(f"Running benchmark: {benchmark_name}") - - try: - result = subprocess.run( - [str(executable_path), "--benchmark", benchmark_name], - capture_output=True, - text=True, - timeout=600 # 10 minute timeout for larger benchmarks - ) - - if result.returncode != 0: - print(f" Warning: Benchmark failed with return code {result.returncode}") - print(f" stderr: {result.stderr}") - return None - - if verbose: - print(f" Raw output:\n{result.stdout}") - - return result.stdout - except subprocess.TimeoutExpired: - print(f" Benchmark timed out after 10 minutes") - return None - except Exception as e: - print(f" Error running benchmark: {e}") - return None - - -def parse_time_value(time_str): - """Parse time string like '668.707 us' or '6.785 ms' and convert to seconds.""" - time_str = strip_ansi(time_str).strip() - - # Match number and unit - match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str) - if not match: - return None - - value = float(match.group(1)) - unit = match.group(2) - - # Convert to seconds - if unit == 'us': - return value / 1_000_000.0 - elif unit == 'ms': - return value / 1_000.0 - elif unit == 'ns': - return value / 1_000_000_000.0 - elif unit == 's': - return value - else: - return value - - -def parse_benchmark_output(output, verbose=False): - """ - Parse the table format output from nvbench for sarbp benchmarks. - - Expected format: - | Problem Size | ... | GPU Time | ... - |--------------|-----|------------|----- - | 1000 | ... | 123.456 ms | ... - | 2000 | ... | 987.654 ms | ... - """ - results = {} - output = strip_ansi(output) - lines = output.strip().split('\n') - - # Find the header line to locate GPU Time and Problem Size columns - gpu_time_col_idx = None - problem_size_col_idx = None - for i, line in enumerate(lines): - if '|' in line and 'GPU Time' in line: - # Split by | and find column indices - cols = [col.strip() for col in line.split('|')] - for j, col in enumerate(cols): - if col == 'GPU Time': - gpu_time_col_idx = j - elif col == 'Problem Size': - problem_size_col_idx = j - if gpu_time_col_idx is not None and problem_size_col_idx is not None: - if verbose: - print(f" Found GPU Time at column index {gpu_time_col_idx}, " - f"Problem Size at column index {problem_size_col_idx} in: {line.rstrip()}") - break - - if gpu_time_col_idx is None: - print(" Warning: Could not find GPU Time column in output") - return results - - if problem_size_col_idx is None: - print(" Warning: Could not find Problem Size column in output") - return results - - # Parse data rows - for line in lines: - if '|' not in line: - continue - - # Skip header and separator lines - if 'GPU Time' in line or '---' in line or 'Problem Size' in line: - continue - - cols = [col.strip() for col in line.split('|')] - - if len(cols) <= max(gpu_time_col_idx, problem_size_col_idx): - continue - - # Get problem size - problem_size_str = cols[problem_size_col_idx] - try: - problem_size = int(problem_size_str) - except ValueError: - continue - - # Extract GPU time - gpu_time_str = cols[gpu_time_col_idx] - gpu_time_s = parse_time_value(gpu_time_str) - - if gpu_time_s is not None: - if verbose: - print(f" Parsed: problem_size={problem_size}, gpu_time_col={gpu_time_str!r}, value={gpu_time_s*1000:.6f} ms") - results[problem_size] = gpu_time_s - elif verbose: - print(f" Warning: Could not parse GPU time from col {gpu_time_col_idx}: {gpu_time_str!r}") - - return results - - -def calculate_gproj_per_sec(problem_size, time_seconds): - """ - Calculate gigabackprojections per second. - - Each sarbp execution computes: - num_pulses * image_width * image_height backprojection operations - - For our benchmarks: all dimensions = problem_size - So: operations = problem_size^3 - """ - operations = problem_size ** 3 - giga_operations = operations / 1e9 - gproj_per_sec = giga_operations / time_seconds - return gproj_per_sec - - -def print_summary(all_results): - """Print a formatted summary table.""" - print("\n") - print("=" * 100) - print("SAR BACKPROJECTION BENCHMARK SUMMARY") - print("=" * 100) - print() - print("Performance in Gigabackprojections per second (Gproj/s)") - print("Higher values indicate better performance") - print() - print(f"Note: Operations = num_pulses × image_width × image_height = problem_size³") - print() - - # Print detailed results for each variant - # Use only the variants that were actually run - variants = sorted(all_results.keys()) - - for variant in variants: - print(f"\n{variant.upper()} Precision:") - print("-" * 80) - print(f"{'Problem Size':<15} {'Operations':<18} {'Time (ms)':<15} {'Gproj/s':<15}") - print("-" * 80) - - for problem_size in sorted(all_results[variant].keys()): - time_s = all_results[variant][problem_size] - time_ms = time_s * 1000.0 - operations = problem_size ** 3 - gproj_s = calculate_gproj_per_sec(problem_size, time_s) - - print(f"{problem_size:<15} {operations:<18,} {time_ms:<15.3f} {gproj_s:<15.3f}") - - # Print comparative summary - print("\n") - print("=" * 100) - print("COMPARATIVE SUMMARY (Gproj/s)") - print("=" * 100) - print() - - # Get all problem sizes - all_problem_sizes = set() - for variant_results in all_results.values(): - all_problem_sizes.update(variant_results.keys()) - all_problem_sizes = sorted(all_problem_sizes) - - # Print header - header = f"{'Problem Size':<15}" - for variant in variants: - header += f" {variant:<15}" - print(header) - print("-" * 100) - - # Print data rows - for problem_size in all_problem_sizes: - row = f"{problem_size:<15}" - for variant in variants: - if problem_size in all_results[variant]: - time_s = all_results[variant][problem_size] - gproj_s = calculate_gproj_per_sec(problem_size, time_s) - row += f" {gproj_s:<15.3f}" - else: - row += f" {'N/A':<15}" - print(row) - - # Print relative performance (relative to float) - only if float was run - if 'float' in all_results: - print("\n") - print("=" * 100) - print("RELATIVE PERFORMANCE (float = 1.0x baseline)") - print("=" * 100) - print() - - # Print header - header = f"{'Problem Size':<15}" - for variant in variants: - header += f" {variant:<15}" - print(header) - print("-" * 100) - - # Print data rows - for problem_size in all_problem_sizes: - row = f"{problem_size:<15}" - - # Get float baseline - float_gproj_s = None - if problem_size in all_results['float']: - time_s = all_results['float'][problem_size] - float_gproj_s = calculate_gproj_per_sec(problem_size, time_s) - - for variant in variants: - if problem_size in all_results[variant]: - time_s = all_results[variant][problem_size] - gproj_s = calculate_gproj_per_sec(problem_size, time_s) - - if float_gproj_s is not None and float_gproj_s > 0: - relative = gproj_s / float_gproj_s - row += f" {relative:<15.3f}" - else: - row += f" {1.0:<15.3f}" if variant == 'float' else f" {'N/A':<15}" - else: - row += f" {'N/A':<15}" - print(row) - - print("=" * 100) - else: - print("=" * 100) - - -def main(): - parser = argparse.ArgumentParser( - description="Run SAR backprojection benchmarks and summarize results." - ) - parser.add_argument( - "--build-dir", - type=Path, - default=None, - help="Path to the MatX build directory containing bench/bench_00_transform_sarbp. " - "If not specified, the current working directory is checked first, " - "then common locations relative to the script are searched.", - ) - parser.add_argument( - "--verbose", "-v", - action="store_true", - help="Print verbose output including raw benchmark output and parsed values.", - ) - parser.add_argument( - "--variants", - nargs="+", - default=None, - metavar="VARIANT", - help="Run only specific benchmark variants (e.g. float double). " - "Defaults to all variants: float double mixed fltflt.", - ) - args = parser.parse_args() - - # Find MatX build directory - if args.build_dir is not None: - build_dir = args.build_dir - if not build_dir.exists(): - print(f"Error: Specified build directory does not exist: {build_dir}") - sys.exit(1) - else: - # Check if the current working directory looks like a valid build directory - # (i.e. it already contains bench/bench_* executables). This lets users run the script - # from any build directory without needing --build-dir. - cwd = Path.cwd() - if build_dir_contains_benchmark_exes(cwd): - build_dir = cwd - else: - # Fall back to searching common locations relative to the script - script_dir = Path(__file__).parent - possible_build_dirs = [ - script_dir / "build", - script_dir / "repos" / "MatX" / "build", - script_dir / "../build", - script_dir / "../../build", - ] - - build_dir = None - for bd in possible_build_dirs: - bd_resolved = bd.resolve() - if bd_resolved.exists() and build_dir_contains_benchmark_exes(bd_resolved): - build_dir = bd_resolved - break - - if build_dir is None: - print("Error: Could not find MatX build directory") - print("Try running from a build directory, or use --build-dir to specify one") - sys.exit(1) - - print(f"Using build directory: {build_dir}") - - # Find benchmark executable - benchmark_exe = find_benchmark_executable(build_dir) - - if benchmark_exe is None: - sys.exit(1) - - print(f"Found benchmark: {benchmark_exe}") - print() - - # List of SAR BP benchmark variants - all_variants = ['float', 'double', 'mixed', 'fltflt'] - variants = args.variants if args.variants is not None else all_variants - - # Validate user-provided variants - if args.variants is not None: - invalid_variants = [v for v in args.variants if v not in all_variants] - if invalid_variants: - print(f"Error: Unknown variant(s): {', '.join(invalid_variants)}") - print(f"Valid variants are: {', '.join(all_variants)}") - sys.exit(1) - - all_results = {} - - # Run each benchmark variant - for variant in variants: - bench_name = f"sarbp_{variant}" - print(f"\n{'=' * 100}") - output = run_benchmark(benchmark_exe, bench_name, verbose=args.verbose) - - if output is None: - print(f" Skipping {variant} due to error") - continue - - # Parse results - results = parse_benchmark_output(output, verbose=args.verbose) - - if not results: - print(f" Warning: Could not parse results for {variant}") - print(" Raw output:") - print(output) - continue - - all_results[variant] = results - - # Print parsed results with Gproj/s - parsed_str = ', '.join([ - f'size={size}: {time_s*1000:.3f}ms ({calculate_gproj_per_sec(size, time_s):.3f} Gproj/s)' - for size, time_s in sorted(results.items()) - ]) - print(f" Parsed: {parsed_str}") - - print(f"\n{'=' * 100}") - - if not all_results: - print("\nError: No benchmark results collected") - sys.exit(1) - - print(f"\nSuccessfully collected results for {len(all_results)} benchmark variants") - - # Print summary - print_summary(all_results) - -if __name__ == "__main__": - main() diff --git a/include/matx/kernels/fltflt.h b/include/matx/kernels/fltflt.h index 99673f0f1..b876d7b1c 100644 --- a/include/matx/kernels/fltflt.h +++ b/include/matx/kernels/fltflt.h @@ -47,6 +47,10 @@ namespace matx { // "Extended-Precision Floating-Point Numbers for GPU Computation", Andrew Thall, // https://andrewthall.org/papers/df64_qf128.pdf // That paper cites key work from D. E. Knuth, T. J. Dekker, A. H. Karp and others. +// The reference for the FPAN-based implementation of fltflt_add() is: +// "High-Performance Branch-Free Algorithms for Extended-Precision Floating-Point Arithmetic", +// David K. Zhang and Alex Aiken, Proceedings of the International Conference for High Performance +// Computing, Networking, Storage and Analysis, 2025. // fltflt represents an unevaluated floating point sum of two non-overlapping fp32 components. // The hi component is the most significant part of the sum, and the lo component is the least significant part. @@ -250,21 +254,40 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_two_prod_fma( return fltflt{ x, y }; } -// fltflt_add is the df64_add() function given by Thall. This function uses two_sum() -// for the hi and lo components followed by addition of the cross terms and -// re-normalization to a non-overlapping expansion. +// FPAN-based two-term addition from Zhang & Aiken (SC'25), Figure 2. +// +// The Thall df64_add() form chains two FastTwoSums in series on the +// critical path (the lo-side path traverses both), giving depth 13 fp32 +// ops. The FPAN form below runs the FastTwoSum on the hi parts (q) +// concurrently with the lo-side add (st_lo), so q's 3 ops sit entirely +// off the critical path and the lo-side path traverses only one +// FastTwoSum. Critical-path depth: 10 fp32 ops vs 13 for Thall. +// +// Both forms use the same 20 fp32 ops total, so steady-state throughput +// is identical. The latency win shows up when the SM cannot fully hide +// the per-call dependency chain (low occupancy, serial accumulators, +// reduction tails). For reference, Thall's df64_add reads: +// fltflt s = fltflt_two_sum(a.hi, b.hi); +// const fltflt t = fltflt_two_sum(a.lo, b.lo); +// s.lo = detail::fadd_rn(s.lo, t.hi); +// s = fltflt_fast_two_sum(s.hi, s.lo); +// s.lo = detail::fadd_rn(s.lo, t.lo); +// s = fltflt_fast_two_sum(s.hi, s.lo); +// return s; static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_add(fltflt a, fltflt b) { fltflt s = fltflt_two_sum(a.hi, b.hi); const fltflt t = fltflt_two_sum(a.lo, b.lo); - s.lo = detail::fadd_rn(s.lo, t.hi); - s = fltflt_fast_two_sum(s.hi, s.lo); - s.lo = detail::fadd_rn(s.lo, t.lo); - s = fltflt_fast_two_sum(s.hi, s.lo); - return s; + const fltflt q = fltflt_fast_two_sum(s.hi, t.hi); + const float st_lo = detail::fadd_rn(s.lo, t.lo); + const float stq_lo = detail::fadd_rn(st_lo, q.lo); + return fltflt_fast_two_sum(q.hi, stq_lo); } // This overload is an optimization of fltflt_add() for the case where b is -// a float, and thus b.lo is zero. +// a float, and thus b.lo is zero. The FPAN restructuring above does not +// apply here because there is no second TwoSum to lift; this form +// remains the Thall-style chain (TwoSum -> add -> FastTwoSum, ~9 fp32 +// ops on the critical path). static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_add(fltflt a, float b) { fltflt s = fltflt_two_sum(a.hi, b); s.lo = detail::fadd_rn(s.lo, a.lo);