diff --git a/bench/00_misc/fltflt_arithmetic.cu b/bench/00_misc/fltflt_arithmetic.cu
index 25ad61395..2b5503074 100644
--- a/bench/00_misc/fltflt_arithmetic.cu
+++ b/bench/00_misc/fltflt_arithmetic.cu
@@ -70,6 +70,72 @@ static void add_gops_per_sec_summary(nvbench::state &state, double ops_per_op =
   s.set_float64("value", total_ops / seconds / 1e9);
 }
 
+// Advance a value to its next bit pattern by adding 1 to the raw integer
+// representation. Used to vary a loop input across iterations without
+// charging the benchmark for an unrelated fp add on every iteration.
+//
+// Only `double` and `fltflt` overloads are provided: on most GPUs an
+// int64 add is significantly faster than an fp64 add, and the fltflt
+// alternative would dispatch through fltflt_add (~20 fp32 ops). For
+// float, fp32 add and int32 add run at the same rate, so call sites
+// should keep `x = x + small`.
+//
+// For fltflt, only the hi component is bumped, leaving the pair
+// non-canonical -- benches only need the value to change. The +1 is done
+// on the unsigned bit pattern, so it wraps instead of overflowing (UB).
+__device__ __forceinline__ void bump_ulp(double &x) {
+  const auto bits = static_cast<unsigned long long>(__double_as_longlong(x));
+  x = __longlong_as_double(static_cast<long long>(bits + 1ULL));
+}
+__device__ __forceinline__ void bump_ulp(fltflt &x) {
+  x.hi = __uint_as_float(__float_as_uint(x.hi) + 1u);
+}
+
+// Arithmetic-only kernel whose sole job is to keep the GPU busy long
+// enough for its clocks to reach steady state. It uses none of the
+// iterative_* kernels defined further down, so warmup_gpu_once() can
+// launch it ahead of their definitions.
+__global__ void clock_warmup_kernel(float *out, int N)
+{
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  float chain = 0.001f * static_cast<float>(tid);
+  #pragma unroll 1
+  for (int step = 0; step < N; ++step) {
+    chain = chain * 1.0000001f + 1.0e-6f;
+  }
+  if (tid == 0) { out[0] = chain; }
+}
+
+// Process-wide, run-once GPU warmup: the first call launches the warmup
+// kernels and every later call returns immediately. The goal is to have
+// GPU clocks at steady state before nvbench opens its first timing
+// window, so that whichever benchmark happens to run first is not
+// penalized for clock ramp-up.
+//
+// The iteration count targets ~50 ms per launch on Blackwell; four
+// launches give ~200 ms in total, comfortably past the GPU's clock ramp
+// window. (FMA-bound work retires near peak fp32 throughput, so an
+// undersized warmup would otherwise finish within a few ms.)
+static void warmup_gpu_once()
+{
+  static bool already_warm = false;
+  if (already_warm) { return; }
+  already_warm = true;
+
+  constexpr int threads_per_block = 256;
+  constexpr int num_blocks        = 1024;
+  constexpr int fma_steps         = 2'000'000;
+
+  float *scratch = nullptr;
+  MATX_CUDA_CHECK(cudaMalloc(&scratch, sizeof(float)));
+  for (int launch = 0; launch < 4; ++launch) {
+    clock_warmup_kernel<<<num_blocks, threads_per_block>>>(scratch, fma_steps);
+    MATX_CUDA_CHECK_LAST_ERROR();
+  }
+  MATX_CUDA_CHECK(cudaDeviceSynchronize());
+  MATX_CUDA_CHECK(cudaFree(scratch));
+}
+
 template <typename T>
 __global__ void iterative_add_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
 {
@@ -277,10 +343,15 @@ __global__ void iterative_fma_kernel(T* __restrict__ result, int64_t size, int32
 }
 
 //==============================================================================
-// Addition Benchmark
+// Addition Throughput Benchmark
+//
+// Many independent accumulators (ILP_FACTOR=8) and outer-loop unrolling
+// expose maximum instruction-level parallelism. Latency-hiding fully covers
+// per-call dependency chains, so this measures *throughput*: ops/sec when
+// the warp scheduler always has independent work in flight.
 //==============================================================================
 template <typename PrecisionType>
-void fltflt_bench_add(nvbench::state &state, nvbench::type_list<PrecisionType>)
+void fltflt_bench_add_throughput(nvbench::state &state, nvbench::type_list<PrecisionType>)
 {
   const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
   const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
@@ -296,6 +367,7 @@ void fltflt_bench_add(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   // Benchmark execution
@@ -306,7 +378,7 @@ void fltflt_bench_add(nvbench::state &state, nvbench::type_list<PrecisionType>)
   add_gops_per_sec_summary(state);
 }
 
-NVBENCH_BENCH_TYPES(fltflt_bench_add, NVBENCH_TYPE_AXES(precision_types))
+NVBENCH_BENCH_TYPES(fltflt_bench_add_throughput, NVBENCH_TYPE_AXES(precision_types))
   .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
   .add_int64_axis("Iterations", {250});
 
@@ -328,6 +400,7 @@ void fltflt_bench_sub(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -359,6 +432,7 @@ void fltflt_bench_mul(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -390,6 +464,7 @@ void fltflt_bench_div(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -421,6 +496,7 @@ void fltflt_bench_sqrt(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -492,6 +568,7 @@ void fltflt_bench_sqrt_fast(nvbench::state &state, nvbench::type_list<PrecisionT
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -574,6 +651,7 @@ void fltflt_bench_norm3d(nvbench::state &state, nvbench::type_list<PrecisionType
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -606,6 +684,7 @@ void fltflt_bench_abs(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -637,6 +716,7 @@ void fltflt_bench_fma(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -704,6 +784,7 @@ void fltflt_bench_madd(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -778,6 +859,7 @@ void fltflt_bench_round(nvbench::state &state, nvbench::type_list<PrecisionType>
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -829,10 +911,13 @@ __global__ void iterative_fmod_kernel(T* __restrict__ result, int64_t size, int3
           asm volatile("" : "+d"(val[ilp]));
         }
       }
-      if constexpr (std::is_same_v<T, fltflt>) {
-        init_val = init_val + 2048.0f;
+      if constexpr (std::is_same_v<T, float>) {
+        // fp32 add is full-rate, no benefit from a bit-twiddle here.
+        init_val += 2048.0f;
       } else {
-        init_val += static_cast<T>(2048.0f);
+        // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on
+        // every iteration just to defeat hoisting of the fmod call.
+        bump_ulp(init_val);
       }
     }
 
@@ -860,6 +945,7 @@ void fltflt_bench_fmod(nvbench::state &state, nvbench::type_list<PrecisionType>)
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -910,7 +996,14 @@ __global__ void iterative_trunc_kernel(T* __restrict__ result, int64_t size, int
           asm volatile("" : "+d"(val[ilp]));
         }
       }
-      init_val = init_val + static_cast<T>(2048.0f);
+      if constexpr (std::is_same_v<T, float>) {
+        // fp32 add is full-rate, no benefit from a bit-twiddle here.
+        init_val += 2048.0f;
+      } else {
+        // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on
+        // every iteration just to defeat hoisting of the trunc call.
+        bump_ulp(init_val);
+      }
     }
 
     T result_val = val[0];
@@ -937,6 +1030,7 @@ void fltflt_bench_trunc(nvbench::state &state, nvbench::type_list<PrecisionType>
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -987,7 +1081,14 @@ __global__ void iterative_floor_kernel(T* __restrict__ result, int64_t size, int
           asm volatile("" : "+d"(val[ilp]));
         }
       }
-      init_val = init_val + static_cast<T>(2048.0f);
+      if constexpr (std::is_same_v<T, float>) {
+        // fp32 add is full-rate, no benefit from a bit-twiddle here.
+        init_val += 2048.0f;
+      } else {
+        // Bit-pattern bump avoids an fp64 add (or full fltflt_add) on
+        // every iteration just to defeat hoisting of the floor call.
+        bump_ulp(init_val);
+      }
     }
 
     T result_val = val[0];
@@ -1014,6 +1115,7 @@ void fltflt_bench_floor(nvbench::state &state, nvbench::type_list<PrecisionType>
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -1050,7 +1152,15 @@ __global__ void iterative_cast2dbl_kernel(double* __restrict__ result, int64_t s
         acc[ilp] = static_cast<double>(src_val);
         asm volatile("" : "+d"(acc[ilp]));
       }
-      src_val = src_val + static_cast<T>(0.0001);
+      if constexpr (std::is_same_v<T, float>) {
+        // fp32 add is full-rate, no benefit from a bit-twiddle here.
+        src_val = src_val + static_cast<T>(0.0001);
+      } else {
+        // Vary src_val via bit-pattern bump -- keeps the cast2dbl cost
+        // un-contaminated by an unrelated fp64 add or fltflt_add per
+        // iteration.
+        bump_ulp(src_val);
+      }
     }
 
     double result_val = acc[0];
@@ -1077,6 +1187,7 @@ void fltflt_bench_cast2dbl(nvbench::state &state, nvbench::type_list<PrecisionTy
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -1113,12 +1224,14 @@ __global__ void iterative_cast2fltflt_kernel(fltflt* __restrict__ result, int64_
         acc[ilp] = static_cast<fltflt>(src_val);
         asm volatile("" : "+f"(acc[ilp].hi), "+f"(acc[ilp].lo));
       }
-      // For double, increment the bit pattern to get the next representable value
-      // so the loop anti-aliasing doesn't introduce a double-precision add.
-      if constexpr (cuda::std::is_same_v<T, double>) {
-        src_val = __longlong_as_double(__double_as_longlong(src_val) + 1LL);
-      } else {
+      if constexpr (std::is_same_v<T, float>) {
+        // fp32 add is full-rate, no benefit from a bit-twiddle here.
         src_val = src_val + static_cast<T>(0.0001);
+      } else {
+        // Vary src_val via bit-pattern bump -- keeps the cast2fltflt
+        // cost un-contaminated by an unrelated fp64 add or fltflt_add
+        // per iteration.
+        bump_ulp(src_val);
       }
     }
 
@@ -1146,6 +1259,7 @@ void fltflt_bench_cast2fltflt(nvbench::state &state, nvbench::type_list<Precisio
   constexpr int block_size = 256;
   int grid_size = static_cast<int>((size + block_size - 1) / block_size);
 
+  warmup_gpu_once();
   exec.sync();
 
   state.exec([&](nvbench::launch &launch) {
@@ -1158,3 +1272,66 @@ void fltflt_bench_cast2fltflt(nvbench::state &state, nvbench::type_list<Precisio
 NVBENCH_BENCH_TYPES(fltflt_bench_cast2fltflt, NVBENCH_TYPE_AXES(precision_types))
   .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
   .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Addition Latency Benchmark
+//
+// Mirrors fltflt_bench_add_throughput but with the opposite scheduling
+// posture: a single in-flight accumulator per thread, no ILP, no inner-loop
+// unroll, and step varies per iteration (so the compiler cannot hoist or
+// reassociate the chain). Each iteration's input depends on the previous
+// iteration's output, so per-call dependency chains directly drive runtime.
+//
+// For fltflt this exposes the depth difference between the production
+// fltflt_add (Zhang & Aiken SC'25 Fig 2 FPAN, critical path ~10 fp32 ops)
+// and Thall's df64_add (~13 fp32 ops). The "Blocks" axis sweeps the
+// latency->throughput transition: at Blocks=1 only one warp runs on one SM,
+// fully exposing the chain, while at Blocks=1024 the scheduler has many
+// warps in flight and latency is partially hidden.
+//==============================================================================
+template <typename PrecisionType>
+__global__ void chain_add_kernel(int N, PrecisionType *__restrict__ out)
+{
+  // Seed from a float: float, double, and fltflt can each be constructed
+  // from one, so the same initializer compiles for every PrecisionType.
+  PrecisionType running{1.0f};
+#pragma unroll 1
+  for (int iter = 0; iter < N; ++iter) {
+    // A fresh addend each trip defeats loop-invariant hoisting and keeps
+    // every add truly dependent on the previous running sum.
+    const PrecisionType addend{static_cast<float>(iter + 1)};
+    running = running + addend;  // resolves to PrecisionType's operator+
+  }
+  out[threadIdx.x + blockDim.x * blockIdx.x] = running;
+}
+
+template <typename PrecisionType>
+void fltflt_bench_add_latency(nvbench::state &state, nvbench::type_list<PrecisionType>)
+{
+  const int chain_len = static_cast<int>(state.get_int64("Chain Length"));  // adds per thread
+  const int blocks    = static_cast<int>(state.get_int64("Blocks"));        // grid size
+  constexpr int threads = 32;  // exactly one warp per block
+  // The kernel writes one output element per launched thread (blocks * threads).
+  cudaExecutor exec{0};
+  const size_t total_threads = static_cast<size_t>(blocks) * threads;
+  auto result = make_tensor<PrecisionType>({static_cast<index_t>(total_threads)});
+  // Report one "op" per chain step per thread.
+  state.add_element_count(static_cast<int64_t>(chain_len) * total_threads, "ops");
+  // Run the one-time clock warmup, then sync the executor before measuring.
+  warmup_gpu_once();
+  exec.sync();
+  // The measured work: a single chain_add_kernel launch on the stream nvbench provides.
+  state.exec([&](nvbench::launch &launch) {
+    chain_add_kernel<PrecisionType>
+        <<<blocks, threads, 0, (cudaStream_t)launch.get_stream()>>>(
+            chain_len, result.Data());
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_add_latency, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_axis("Chain Length", {1024, 4096, 16384})
+  // Blocks=1   : a single warp on the whole device   -- latency fully exposed
+  // Blocks=4   : 4 warps, expected on separate SMs   -- still latency-bound
+  // Blocks=160 : ~1 block per SM on a ~160-SM device -- partial latency hiding
+  // Blocks=1024: many blocks per SM                  -- throughput-bound, latency hides
+  .add_int64_axis("Blocks",       {1, 4, 160, 1024});
diff --git a/bench/scripts/run_benchmarks.py b/bench/scripts/run_benchmarks.py
new file mode 100755
index 000000000..d945e716d
--- /dev/null
+++ b/bench/scripts/run_benchmarks.py
@@ -0,0 +1,458 @@
+#!/usr/bin/env python3
+
+# BSD 3-Clause License
+#
+# Copyright (c) 2026, NVIDIA Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+Run a profile of MatX benchmarks via nvbench and emit per-profile summaries.
+
+Each profile defines:
+  - the per-source CMake bench executable to invoke,
+  - the list of nvbench benchmark names to run,
+  - an optional summary handler that walks the nvbench JSON output and
+    prints a domain-specific table (e.g. fp32/fp64/fltflt slowdown ratios
+    for fltflt, Gproj/s for sarbp).
+
+Output:
+  - bench_results/<profile>.json  (raw nvbench JSON, the source of truth)
+  - bench_results/<profile>.md    (rendered nvbench markdown table)
+  - bench_results/<profile>.csv   (raw nvbench CSV)
+  - stdout                        (the profile's domain-specific summary)
+
+Examples:
+  python run_benchmarks.py                       # run every profile
+  python run_benchmarks.py --profile fltflt      # run a single profile
+  python run_benchmarks.py --profile sarbp -- --profile  # forward --profile to nvbench
+
+The script does not parse markdown -- it reads nvbench's JSON output, which
+is part of nvbench's stable contract.
+"""
+
+import argparse
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# nvbench JSON helpers -- schema: benchmarks[].states[].summaries[].data[].value
+# ---------------------------------------------------------------------------
+
+def summary_float(state, tag, default=None):
+    """Return the "value" entry of summary `tag` in `state` as a float, else `default`."""
+    matching = (s for s in state.get("summaries", []) if s.get("tag") == tag)
+    for summary in matching:
+        for entry in summary.get("data", []):
+            if entry.get("name") != "value":
+                continue
+            try:
+                return float(entry.get("value"))
+            except (TypeError, ValueError):
+                return default
+    return default
+
+
+def axis_value(state, name):
+    """Look up axis `name` in `state`; integer-like values come back as int."""
+    hit = next((ax for ax in state.get("axis_values", []) if ax.get("name") == name), None)
+    if hit is None:
+        return None
+    raw = hit.get("value")
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return raw
+
+
+def states_for_benchmark(data, bench_name):
+    """Yield each state of every benchmark in `data` whose name is `bench_name`."""
+    wanted = (b for b in data.get("benchmarks", []) if b.get("name") == bench_name)
+    for bench in wanted:
+        yield from bench.get("states", [])
+
+
+GPU_TIME_TAG = "nv/cold/time/gpu/mean"  # seconds
+
+
+def fmt_time(seconds):
+    """Render a duration in seconds with an auto-picked unit ("N/A" for None)."""
+    if seconds is None:
+        return "N/A"
+    # (exclusive upper bound in seconds, multiplier, unit suffix)
+    scales = ((1e-6, 1e9, "ns"), (1e-3, 1e6, "us"), (1.0, 1e3, "ms"))
+    for bound, factor, unit in scales:
+        if seconds < bound:
+            return f"{seconds * factor:.3f} {unit}"
+    # One second or longer stays in seconds.
+    return f"{seconds:.3f} s"
+
+
+# ---------------------------------------------------------------------------
+# fltflt summary: per-op fp32 / fp64 / fltflt slowdown ratios.
+# ---------------------------------------------------------------------------
+
+# Map from nvbench type-axis "input_string" to a friendly precision label.
+_PRECISION_FROM_AXIS = {
+    "F32": "float",
+    "F64": "double",
+    "matx::fltflt": "fltflt",
+}
+
+
+def _fltflt_pick_time(states, *, prefer=None):
+    """Map each precision label to the GPU time (seconds) of its chosen state.
+
+    For every precision, the chosen state is the first one in `states` that
+    has a recognized "T" axis value, agrees with `prefer`, and carries a GPU
+    time summary. `prefer`, if given, is a dict of axis_name -> value (e.g.
+    {Blocks: 1, Chain Length: 16384} for the latency bench); each entry must
+    equal the state's axis value. Without `prefer`, no axis filter applies.
+    """
+    required = prefer if prefer is not None else {}
+    picked = {}
+    for state in states:
+        # Skip unrecognized type strings and precisions that already have a time.
+        label = _PRECISION_FROM_AXIS.get(axis_value(state, "T"))
+        if label is None or label in picked:
+            continue
+        # Every preferred axis must match this state exactly.
+        if any(axis_value(state, axis) != want for axis, want in required.items()):
+            continue
+        # A state without a GPU-time summary leaves the label open for later states.
+        gpu_time = summary_float(state, GPU_TIME_TAG)
+        if gpu_time is None:
+            continue
+        picked[label] = gpu_time
+
+    return picked
+
+
+def summarize_fltflt(json_path):
+    """Load the nvbench JSON at `json_path` and print the fltflt slowdown and raw-time tables."""
+    with open(json_path) as f:
+        data = json.load(f)
+
+    # (display label, nvbench benchmark name, axis filter used to pick a row, or None).
+    rows = [
+        ("add_throughput",  "fltflt_bench_add_throughput", None),
+        ("add_latency",     "fltflt_bench_add_latency",
+            {"Blocks": 1, "Chain Length": 16384}),
+        ("sub",             "fltflt_bench_sub",            None),
+        ("mul",             "fltflt_bench_mul",            None),
+        ("div",             "fltflt_bench_div",            None),
+        ("sqrt",            "fltflt_bench_sqrt",           None),
+        ("sqrt_fast",       "fltflt_bench_sqrt_fast",      None),
+        ("norm3d",          "fltflt_bench_norm3d",         None),
+        ("abs",             "fltflt_bench_abs",            None),
+        ("fma",             "fltflt_bench_fma",            None),
+        ("madd",            "fltflt_bench_madd",           None),
+        ("round",           "fltflt_bench_round",          None),
+        ("trunc",           "fltflt_bench_trunc",          None),
+        ("floor",           "fltflt_bench_floor",          None),
+        ("fmod",            "fltflt_bench_fmod",           None),
+        ("cast2dbl",        "fltflt_bench_cast2dbl",       None),
+        ("cast2fltflt",     "fltflt_bench_cast2fltflt",    None),
+    ]
+
+    print()
+    print("=" * 86)
+    print("FLTFLT BENCHMARK SUMMARY")
+    print("=" * 86)
+    print("float / double / fltflt: slowdown vs float (lower is better, float = 1.0x).")
+    print("dbl/fltflt:              speedup of fltflt over double (higher is better).")
+    print()
+    print(f"{'Benchmark':<18}{'float':>12}{'double':>12}{'fltflt':>12}{'dbl/fltflt':>14}")
+    print("-" * 86)
+
+    for label, bench, prefer in rows:
+        states = list(states_for_benchmark(data, bench))
+        if not states:
+            continue  # benchmark absent from this JSON
+        t = _fltflt_pick_time(states, prefer=prefer)
+        f = t.get("float")
+        d = t.get("double")
+        ff = t.get("fltflt")
+        if f is None:
+            continue  # no float baseline to normalize against
+
+        def slow(x):  # time relative to float; NOTE(review): f == 0.0 would raise ZeroDivisionError
+            return f"{x / f:.2f}x" if x is not None else "N/A"
+
+        # Speedup framing: double time over fltflt time, so bigger means fltflt wins more.
+        speedup_vs_dbl = (
+            f"{d / ff:.2f}x" if (d is not None and ff is not None and ff > 0) else "N/A"
+        )
+        print(f"{label:<18}{slow(f):>12}{slow(d):>12}{slow(ff):>12}{speedup_vs_dbl:>14}")
+
+    print()
+    print("Raw GPU times (cold mean):")
+    print(f"{'Benchmark':<18}{'float':>14}{'double':>14}{'fltflt':>14}")
+    print("-" * 86)
+    for label, bench, prefer in rows:  # second pass; rows lacking a float time still print here
+        states = list(states_for_benchmark(data, bench))
+        if not states:
+            continue
+        t = _fltflt_pick_time(states, prefer=prefer)
+        print(f"{label:<18}{fmt_time(t.get('float')):>14}"
+              f"{fmt_time(t.get('double')):>14}{fmt_time(t.get('fltflt')):>14}")
+    print("=" * 86)
+
+
+# ---------------------------------------------------------------------------
+# sarbp summary: Gproj/s = problem_size**3 / time, plus relative table.
+# ---------------------------------------------------------------------------
+
+def _sarbp_results(data):
+    """Collect GPU times as {variant: {problem_size: seconds}}; empty variants are dropped."""
+    collected = {}
+    for variant in ("float", "double", "mixed", "fltflt"):
+        timings = {}
+        for state in states_for_benchmark(data, f"sarbp_{variant}"):
+            size = axis_value(state, "Problem Size")
+            seconds = summary_float(state, GPU_TIME_TAG)
+            if size is None or seconds is None:
+                continue
+            timings[size] = seconds
+        if timings:
+            collected[variant] = timings
+    return collected
+
+
+def _gproj_per_sec(problem_size, time_s):  # billions of problem_size**3 operations per second
+    return (problem_size ** 3) / 1e9 / time_s  # callers must ensure time_s is non-zero
+
+
+def summarize_sarbp(json_path):
+    """Print Gproj/s and float-relative throughput tables from the nvbench JSON at `json_path`."""
+    with open(json_path) as f:
+        data = json.load(f)
+    results = _sarbp_results(data)
+    if not results:
+        print("(sarbp: no results)")
+        return
+
+    variants = [v for v in ("float", "double", "mixed", "fltflt") if v in results]
+    all_sizes = sorted({s for r in results.values() for s in r})
+
+    print()
+    print("=" * 90)
+    print("SAR BACKPROJECTION BENCHMARK SUMMARY")
+    print("=" * 90)
+    print("Gigabackprojections per second (Gproj/s) -- operations = problem_size^3.")
+    print()
+
+    print(f"{'Problem Size':<14}" + "".join(f"{v:>14}" for v in variants))
+    print("-" * 90)
+    for ps in all_sizes:
+        row = f"{ps:<14}"
+        for v in variants:
+            t = results[v].get(ps)
+            row += f"{_gproj_per_sec(ps, t):>14.3f}" if t else f"{'N/A':>14}"
+        print(row)
+
+    if "float" in results:
+        print()
+        print("Relative throughput (float = 1.0x):")
+        print(f"{'Problem Size':<14}" + "".join(f"{v:>14}" for v in variants))
+        print("-" * 90)
+        for ps in all_sizes:
+            row = f"{ps:<14}"
+            f_t = results["float"].get(ps)
+            f_g = _gproj_per_sec(ps, f_t) if f_t else None
+            for v in variants:
+                t = results[v].get(ps)
+                if not t or not f_g:  # missing or zero time/baseline: N/A, never divide by zero
+                    row += f"{'N/A':>14}"
+                else:
+                    row += f"{_gproj_per_sec(ps, t) / f_g:>13.3f}x"
+            print(row)
+    print("=" * 90)
+
+
+# ---------------------------------------------------------------------------
+# Profile registry -- one entry per logical bench family.
+# ---------------------------------------------------------------------------
+
+PROFILES = {
+    "fltflt": {
+        "exe_stems": ["bench_00_misc_fltflt_arithmetic", "matx_bench"],
+        "benchmarks": [
+            "fltflt_bench_add_throughput",
+            "fltflt_bench_add_latency",
+            "fltflt_bench_sub",
+            "fltflt_bench_mul",
+            "fltflt_bench_div",
+            "fltflt_bench_sqrt",
+            "fltflt_bench_sqrt_fast",
+            "fltflt_bench_norm3d",
+            "fltflt_bench_abs",
+            "fltflt_bench_fma",
+            "fltflt_bench_madd",
+            "fltflt_bench_round",
+            "fltflt_bench_trunc",
+            "fltflt_bench_floor",
+            "fltflt_bench_fmod",
+            "fltflt_bench_cast2dbl",
+            "fltflt_bench_cast2fltflt",
+        ],
+        "summary": summarize_fltflt,
+        # Inherited from the deleted run_fltflt_benchmarks.py.
+        "timeout_seconds": 300,
+    },
+    "sarbp": {
+        "exe_stems": ["bench_00_transform_sarbp", "matx_bench"],
+        "benchmarks": ["sarbp_float", "sarbp_double", "sarbp_mixed", "sarbp_fltflt"],
+        "summary": summarize_sarbp,
+        # Inherited from the deleted run_sarbp_benchmarks.py; sarbp at the
+        # default Problem Size runs longer than the fltflt sweep.
+        "timeout_seconds": 600,
+    },
+}
+
+# Default timeout for any future profile that doesn't set its own.
+DEFAULT_TIMEOUT_SECONDS = 600
+
+
+# ---------------------------------------------------------------------------
+# Build-dir / executable resolution.
+# ---------------------------------------------------------------------------
+
+def _resolve_exe(build_dir, stems):
+    """Return the first `<build_dir>/bench/<stem>[.exe]` that is a file, else None."""
+    bench_dir = build_dir / "bench"
+    # Stems are tried in order; for each, the bare name precedes the .exe name.
+    names = (name for stem in stems for name in (stem, f"{stem}.exe"))
+    hits = (bench_dir / name for name in names if (bench_dir / name).is_file())
+    return next(hits, None)
+
+
+def find_default_build_dir():
+    """Guess the build dir: cwd if cwd/bench holds bench_* files, else a nearby build/."""
+    here = Path.cwd()
+    bench = here / "bench"
+    if bench.is_dir() and any(e.is_file() and e.name.startswith("bench_") for e in bench.iterdir()):
+        return here
+    scripts = Path(__file__).resolve().parent
+    for rel in ("../../build", "../../../build"):
+        build = (scripts / rel).resolve()
+        if build.exists() and (build / "bench").is_dir():
+            return build
+    return here  # let _resolve_exe produce a clear error
+
+
+# ---------------------------------------------------------------------------
+# Per-profile orchestration.
+# ---------------------------------------------------------------------------
+
+def run_profile(name, profile, build_dir, out_dir, extra_args):
+    """Run one profile's nvbench executable, then its summary; return True on success."""
+    exe = _resolve_exe(build_dir, profile["exe_stems"])
+    if exe is None:
+        print(f"[{name}] could not find any of {profile['exe_stems']} under {build_dir}/bench/",
+              file=sys.stderr)
+        return False
+
+    json_path = out_dir / f"{name}.json"
+    md_path   = out_dir / f"{name}.md"
+    csv_path  = out_dir / f"{name}.csv"
+    cmd = [str(exe)]
+    for b in profile["benchmarks"]:
+        cmd += ["--benchmark", b]
+    cmd += ["--json", str(json_path), "--md", str(md_path), "--csv", str(csv_path)]
+    cmd += extra_args
+    timeout = profile.get("timeout_seconds", DEFAULT_TIMEOUT_SECONDS)
+    print(f"[{name}] {' '.join(cmd)}  (timeout {timeout}s)")
+    try:
+        res = subprocess.run(cmd, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        print(f"[{name}] nvbench exceeded timeout of {timeout}s", file=sys.stderr)
+        return False
+    except OSError as e:
+        # E.g. file exists but is not executable: fail this profile, keep running the rest.
+        print(f"[{name}] could not launch {exe}: {e}", file=sys.stderr)
+        return False
+    if res.returncode != 0:
+        print(f"[{name}] nvbench exited with status {res.returncode}", file=sys.stderr)
+        return False
+
+    if profile.get("summary"):
+        try:
+            profile["summary"](json_path)
+        except Exception as e:
+            print(f"[{name}] summary failed: {e}", file=sys.stderr)
+            return False
+    return True
+
+
+def main():
+    """Parse CLI flags, run the selected profile(s), and exit with status 1 if any failed."""
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "--build-dir", type=Path, default=None,
+        help="Build directory containing bench/<exe>. "
+             "Defaults to the current directory if it has bench_*; otherwise <repo>/build.",
+    )
+    parser.add_argument(
+        "--profile",
+        choices=sorted(PROFILES) + ["all"],
+        default="all",
+        help="Which profile to run (default: all).",
+    )
+    parser.add_argument(
+        "--out-dir", type=Path, default=Path("bench_results"),
+        help="Directory for nvbench JSON/MD/CSV output (default: ./bench_results).",
+    )
+    parser.add_argument(
+        "nvbench_args", nargs=argparse.REMAINDER,
+        help="Extra args forwarded verbatim to the nvbench executable. "
+             "Use `--` to separate them from this script's flags.",
+    )
+    args = parser.parse_args()
+    # Drop only the leading `--` separator; any later `--` is forwarded to nvbench untouched.
+    extra = args.nvbench_args[1:] if args.nvbench_args[:1] == ["--"] else args.nvbench_args
+
+    build_dir = args.build_dir if args.build_dir else find_default_build_dir()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    profiles = sorted(PROFILES) if args.profile == "all" else [args.profile]
+    failures = 0
+    for name in profiles:
+        ok = run_profile(name, PROFILES[name], build_dir, args.out_dir, extra)
+        if not ok:
+            failures += 1
+
+    print(f"\nDone. {len(profiles) - failures}/{len(profiles)} profile(s) succeeded; "
+          f"output under {args.out_dir}/")
+    sys.exit(1 if failures else 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/scripts/run_fltflt_benchmarks.py b/bench/scripts/run_fltflt_benchmarks.py
deleted file mode 100755
index 6e0d7c44c..000000000
--- a/bench/scripts/run_fltflt_benchmarks.py
+++ /dev/null
@@ -1,512 +0,0 @@
-#!/usr/bin/env python3
-
-# BSD 3-Clause License
-#
-# Copyright (c) 2026, NVIDIA Corporation
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-Run fltflt arithmetic benchmarks and summarize results.
-Shows performance relative to single-precision (float = 1.0x baseline).
-"""
-
-import subprocess
-import re
-import sys
-import argparse
-from pathlib import Path
-from collections import defaultdict
-
-# Regex to strip ANSI escape codes from nvbench colored output
-ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*[mK]')
-
-
-def strip_ansi(text):
-    """Remove ANSI escape codes from a string."""
-    return ANSI_ESCAPE.sub('', text)
-
-
-def _resolve_bench_executable(build_dir, stem):
-    """Return path to bench/<stem> or bench/<stem>.exe if present."""
-    bench_dir = build_dir / "bench"
-    for name in (stem, f"{stem}.exe"):
-        path = bench_dir / name
-        if path.is_file():
-            return path
-    return None
-
-
-def find_benchmark_executable(build_dir):
-    """Find the fltflt benchmark executable (per-source CMake target)."""
-    stem = "bench_00_misc_fltflt_arithmetic"
-    benchmark_path = _resolve_bench_executable(build_dir, stem)
-
-    if benchmark_path is not None:
-        return benchmark_path
-
-    print(
-        "Error: Could not find benchmark executable "
-        f"bench/{stem} under {build_dir}"
-    )
-    return None
-
-
-def build_dir_contains_benchmark_exes(build_dir):
-    """True if build_dir/bench contains at least one bench_* executable."""
-    bench_dir = build_dir / "bench"
-    if not bench_dir.is_dir():
-        return False
-    return any(
-        p.is_file() and p.name.startswith("bench_") for p in bench_dir.iterdir()
-    )
-
-
-def run_benchmark(executable_path, benchmark_name, verbose=False):
-    """Run a specific benchmark and capture output."""
-    print(f"Running benchmark: {benchmark_name}")
-
-    try:
-        result = subprocess.run(
-            [str(executable_path), "--benchmark", benchmark_name],
-            capture_output=True,
-            text=True,
-            timeout=300  # 5 minute timeout
-        )
-
-        if result.returncode != 0:
-            print(f"  Warning: Benchmark failed with return code {result.returncode}")
-            print(f"  stderr: {result.stderr}")
-            return None
-
-        if verbose:
-            print(f"  Raw output:\n{result.stdout}")
-
-        return result.stdout
-    except subprocess.TimeoutExpired:
-        print(f"  Benchmark timed out after 5 minutes")
-        return None
-    except Exception as e:
-        print(f"  Error running benchmark: {e}")
-        return None
-
-
-def parse_time_value(time_str):
-    """Parse time string like '668.707 us' or '6.785 ms' and convert to milliseconds."""
-    time_str = strip_ansi(time_str).strip()
-
-    # Match number and unit
-    match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str)
-    if not match:
-        return None
-
-    value = float(match.group(1))
-    unit = match.group(2)
-
-    # Convert to milliseconds
-    if unit == 'us':
-        return value / 1000.0
-    elif unit == 'ms':
-        return value
-    elif unit == 'ns':
-        return value / 1_000_000.0
-    elif unit == 's':
-        return value * 1000.0
-    else:
-        return value
-
-
-def parse_benchmark_output(output, verbose=False):
-    """
-    Parse the table format output from nvbench.
-
-    Expected format:
-    |      T       |   Array Size    | ... |  GPU Time  | ...
-    |--------------|-----------------|-----|------------|-----
-    |          F32 | ...                   | 668.707 us | ...
-    |          F64 | ...                   |  47.650 ms | ...
-    | matx::fltflt | ...                   |   6.785 ms | ...
-    """
-    results = {}
-    # Strip ANSI codes from the entire output before line-by-line processing
-    output = strip_ansi(output)
-    lines = output.strip().split('\n')
-
-    # Find the header line to locate GPU Time column
-    gpu_time_col_idx = None
-    for i, line in enumerate(lines):
-        if '|' in line and 'GPU Time' in line:
-            # Split by | and find GPU Time column index
-            cols = [col.strip() for col in line.split('|')]
-            for j, col in enumerate(cols):
-                if col == 'GPU Time':
-                    gpu_time_col_idx = j
-                    break
-            if gpu_time_col_idx is not None:
-                if verbose:
-                    print(f"  Found GPU Time at column index {gpu_time_col_idx} in: {line.rstrip()}")
-                break
-
-    if gpu_time_col_idx is None:
-        print("  Warning: Could not find GPU Time column in output")
-        return results
-
-    # Parse data rows
-    for line in lines:
-        if '|' not in line:
-            continue
-
-        # Skip header and separator lines:
-        #   - any line containing 'GPU Time' is a column header
-        #   - any line with '---' is a separator/divider row
-        #   - lines where the type column (stripped) is exactly 'T' are header rows
-        #     (nvbench labels the type axis column as 'T')
-        cols_raw = line.split('|')
-        if len(cols_raw) < 3:
-            continue
-
-        type_col_raw = cols_raw[1]  # unstripped, between first two '|'
-        if 'GPU Time' in line or '---' in line or type_col_raw.strip() == 'T':
-            continue
-
-        cols = [col.strip() for col in cols_raw]
-
-        if len(cols) <= gpu_time_col_idx:
-            continue
-
-        # Get type column (first data column after the leading empty string)
-        type_col = cols[1]
-
-        if not type_col:
-            continue
-
-        # Map type names (nvbench aliases float->F32, double->F64)
-        if 'F32' in type_col:
-            precision = 'float'
-        elif 'F64' in type_col:
-            precision = 'double'
-        elif 'fltflt' in type_col:
-            precision = 'fltflt'
-        else:
-            continue
-
-        # Extract GPU time
-        gpu_time_str = cols[gpu_time_col_idx]
-        gpu_time_ms = parse_time_value(gpu_time_str)
-
-        if gpu_time_ms is not None:
-            if verbose:
-                print(f"  Parsed: type={precision}, gpu_time_col={gpu_time_str!r}, value={gpu_time_ms:.6f} ms")
-            results[precision] = gpu_time_ms
-        elif verbose:
-            print(f"  Warning: Could not parse GPU time from col {gpu_time_col_idx}: {gpu_time_str!r}")
-
-    return results
-
-
-def parse_benchmark_output_no_type(output, verbose=False):
-    """
-    Parse nvbench output for benchmarks without a type axis (fltflt-only).
-    Returns a dict with a single 'fltflt' key.
-    """
-    results = {}
-    output = strip_ansi(output)
-    lines = output.strip().split('\n')
-
-    gpu_time_col_idx = None
-    for line in lines:
-        if '|' in line and 'GPU Time' in line:
-            cols = [col.strip() for col in line.split('|')]
-            for j, col in enumerate(cols):
-                if col == 'GPU Time':
-                    gpu_time_col_idx = j
-                    break
-            if gpu_time_col_idx is not None:
-                if verbose:
-                    print(f"  Found GPU Time at column index {gpu_time_col_idx} in: {line.rstrip()}")
-                break
-
-    if gpu_time_col_idx is None:
-        print("  Warning: Could not find GPU Time column in output")
-        return results
-
-    for line in lines:
-        if '|' not in line or 'GPU Time' in line or '---' in line:
-            continue
-        cols = [col.strip() for col in line.split('|')]
-        if len(cols) <= gpu_time_col_idx:
-            continue
-        gpu_time_str = cols[gpu_time_col_idx]
-        gpu_time_ms = parse_time_value(gpu_time_str)
-        if gpu_time_ms is not None:
-            if verbose:
-                print(f"  Parsed: type=fltflt, gpu_time_col={gpu_time_str!r}, value={gpu_time_ms:.6f} ms")
-            results['fltflt'] = gpu_time_ms
-            break
-
-    return results
-
-
-def format_time(time_ms):
-    """Format a time in ms with appropriate precision and units."""
-    if time_ms is None:
-        return "N/A"
-    if time_ms < 0.001:
-        return f"{time_ms * 1e6:.3f} ns"
-    elif time_ms < 1.0:
-        return f"{time_ms * 1000.0:.3f} us"
-    else:
-        return f"{time_ms:.3f} ms"
-
-
-def calculate_relative_performance(results):
-    """
-    Calculate performance relative to float (single-precision).
-    float = 1.0x (baseline)
-    Higher values mean slower (took more time relative to float)
-    """
-    relative = {}
-
-    for bench_name, timings in results.items():
-        if 'float' not in timings:
-            print(f"Warning: No float baseline for {bench_name}, skipping")
-            continue
-
-        float_time = timings['float']
-        relative[bench_name] = {}
-
-        for precision, time_value in timings.items():
-            # Relative slowdown: how many times slower than float
-            relative[bench_name][precision] = time_value / float_time
-
-    return relative
-
-
-def print_summary(results, relative):
-    """Print a formatted summary table."""
-    print("\n")
-    print("=" * 80)
-    print("FLTFLT BENCHMARK SUMMARY")
-    print("=" * 80)
-    print()
-    print("Performance relative to single-precision (float = 1.0x baseline)")
-    print("Higher values indicate slower performance")
-    print()
-
-    # Print header
-    print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}")
-    print("-" * 66)
-
-    # Order benchmarks - use the canonical order but only show benchmarks that were actually run
-    bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
-    # Filter to only benchmarks present in results
-    bench_order = [b for b in bench_order if b in results]
-
-    for bench in bench_order:
-        if bench not in relative:
-            continue
-
-        rel = relative[bench]
-        timings = results[bench]
-
-        # Get values with defaults
-        float_rel = rel.get('float', 1.0)
-        double_rel = rel.get('double', None)
-        fltflt_rel = rel.get('fltflt', None)
-
-        # Calculate fltflt speedup vs double (double_time / fltflt_time)
-        fltflt_vs_double = None
-        if 'double' in timings and 'fltflt' in timings:
-            fltflt_vs_double = timings['double'] / timings['fltflt']
-
-        # Format output
-        float_str = f"{float_rel:.2f}x"
-        double_str = f"{double_rel:.2f}x" if double_rel is not None else "N/A"
-        fltflt_str = f"{fltflt_rel:.2f}x" if fltflt_rel is not None else "N/A"
-        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
-
-        print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}")
-
-    print()
-    print("-" * 80)
-    print("Raw timings (auto-scaled units):")
-    print()
-    print(f"{'Benchmark':<15} {'float':<15} {'double':<15} {'fltflt':<15} {'fltflt vs dbl':<15}")
-    print("-" * 75)
-
-    for bench in bench_order:
-        timings = results[bench]
-
-        float_time = timings.get('float', None)
-        double_time = timings.get('double', None)
-        fltflt_time = timings.get('fltflt', None)
-
-        # Calculate fltflt speedup vs double
-        fltflt_vs_double = None
-        if double_time is not None and fltflt_time is not None:
-            fltflt_vs_double = double_time / fltflt_time
-
-        float_str = format_time(float_time)
-        double_str = format_time(double_time)
-        fltflt_str = format_time(fltflt_time)
-        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
-
-        print(f"{bench:<15} {float_str:<15} {double_str:<15} {fltflt_str:<15} {speedup_str:<15}")
-
-    print("=" * 80)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run fltflt arithmetic benchmarks and summarize results."
-    )
-    parser.add_argument(
-        "--build-dir",
-        type=Path,
-        default=None,
-        help="Path to the MatX build directory containing bench/bench_00_misc_fltflt_arithmetic. "
-             "If not specified, common locations are searched automatically.",
-    )
-    parser.add_argument(
-        "--verbose", "-v",
-        action="store_true",
-        help="Print verbose output including raw benchmark output and parsed values.",
-    )
-    parser.add_argument(
-        "--benchmarks",
-        nargs="+",
-        default=None,
-        metavar="BENCH",
-        help="Run only specific benchmarks (e.g. add sub mul). "
-             "Defaults to all benchmarks.",
-    )
-    args = parser.parse_args()
-
-    # Find MatX build directory
-    if args.build_dir is not None:
-        build_dir = args.build_dir
-        if not build_dir.exists():
-            print(f"Error: Specified build directory does not exist: {build_dir}")
-            sys.exit(1)
-    else:
-        script_dir = Path(__file__).parent
-
-        # Check if the current working directory looks like a valid build directory
-        # (i.e. it already contains bench/bench_* executables). This lets users run the script
-        # from any build directory without needing --build-dir.
-        cwd = Path.cwd()
-        if build_dir_contains_benchmark_exes(cwd):
-            build_dir = cwd
-        else:
-            # Fall back to searching common locations relative to the script
-            possible_build_dirs = [
-                script_dir / "build",
-                script_dir / "repos" / "MatX" / "build",
-                script_dir / "../build",
-                script_dir / "../../build",
-            ]
-
-            build_dir = None
-            for bd in possible_build_dirs:
-                bd_resolved = bd.resolve()
-                if bd_resolved.exists() and build_dir_contains_benchmark_exes(bd_resolved):
-                    build_dir = bd_resolved
-                    break
-
-            if build_dir is None:
-                print("Error: Could not find MatX build directory")
-                print("Try running from a build directory, or use --build-dir to specify one")
-                sys.exit(1)
-
-    print(f"Using build directory: {build_dir}")
-
-    # Find benchmark executable
-    benchmark_exe = find_benchmark_executable(build_dir)
-
-    if benchmark_exe is None:
-        sys.exit(1)
-
-    print(f"Found benchmark: {benchmark_exe}")
-    print()
-
-    # List of benchmarks to run
-    all_benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
-    # Benchmarks that only have a fltflt variant (no float/double type axis)
-    fltflt_only_benchmarks = set()
-    benchmarks = args.benchmarks if args.benchmarks is not None else all_benchmarks
-
-    # Validate user-provided benchmarks
-    if args.benchmarks is not None:
-        invalid_benchmarks = [b for b in args.benchmarks if b not in all_benchmarks]
-        if invalid_benchmarks:
-            print(f"Error: Unknown benchmark(s): {', '.join(invalid_benchmarks)}")
-            print(f"Valid benchmarks are: {', '.join(all_benchmarks)}")
-            sys.exit(1)
-
-    all_results = {}
-
-    # Run each benchmark
-    for bench in benchmarks:
-        bench_name = f"fltflt_bench_{bench}"
-        print(f"\n{'=' * 80}")
-        output = run_benchmark(benchmark_exe, bench_name, verbose=args.verbose)
-
-        if output is None:
-            print(f"  Skipping {bench} due to error")
-            continue
-
-        # Parse results
-        if bench in fltflt_only_benchmarks:
-            results = parse_benchmark_output_no_type(output, verbose=args.verbose)
-        else:
-            results = parse_benchmark_output(output, verbose=args.verbose)
-
-        if not results:
-            print(f"  Warning: Could not parse results for {bench}")
-            print("  Raw output:")
-            print(output)
-            continue
-
-        all_results[bench] = results
-        parsed_parts = [f"{k}={format_time(v)}" for k, v in results.items()]
-        print(f"  Parsed: {', '.join(parsed_parts)}")
-
-    print(f"\n{'=' * 80}")
-
-    if not all_results:
-        print("\nError: No benchmark results collected")
-        sys.exit(1)
-
-    print(f"\nSuccessfully collected results for {len(all_results)} benchmarks")
-
-    # Calculate relative performance
-    relative = calculate_relative_performance(all_results)
-
-    # Print summary
-    print_summary(all_results, relative)
-
-if __name__ == "__main__":
-    main()
diff --git a/bench/scripts/run_sarbp_benchmarks.py b/bench/scripts/run_sarbp_benchmarks.py
deleted file mode 100755
index a13d49f8e..000000000
--- a/bench/scripts/run_sarbp_benchmarks.py
+++ /dev/null
@@ -1,466 +0,0 @@
-#!/usr/bin/env python3
-
-# BSD 3-Clause License
-#
-# Copyright (c) 2026, NVIDIA Corporation
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""
-Run SAR backprojection benchmarks and summarize results.
-Computes gigabackprojections per second (Gproj/s) for each benchmark variant.
-"""
-
-import subprocess
-import re
-import sys
-import argparse
-from pathlib import Path
-
-# Regex to strip ANSI escape codes from nvbench colored output
-ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*[mK]')
-
-
-def strip_ansi(text):
-    """Remove ANSI escape codes from a string."""
-    return ANSI_ESCAPE.sub('', text)
-
-
-def _resolve_bench_executable(build_dir, stem):
-    """Return path to bench/<stem> or bench/<stem>.exe if present."""
-    bench_dir = build_dir / "bench"
-    for name in (stem, f"{stem}.exe"):
-        path = bench_dir / name
-        if path.is_file():
-            return path
-    return None
-
-
-def find_benchmark_executable(build_dir):
-    """Find the SAR BP benchmark executable (per-source CMake target)."""
-    stem = "bench_00_transform_sarbp"
-    benchmark_path = _resolve_bench_executable(build_dir, stem)
-
-    if benchmark_path is not None:
-        return benchmark_path
-
-    print(
-        "Error: Could not find benchmark executable "
-        f"bench/{stem} under {build_dir}"
-    )
-    return None
-
-
-def build_dir_contains_benchmark_exes(build_dir):
-    """True if build_dir/bench contains at least one bench_* executable."""
-    bench_dir = build_dir / "bench"
-    if not bench_dir.is_dir():
-        return False
-    return any(
-        p.is_file() and p.name.startswith("bench_") for p in bench_dir.iterdir()
-    )
-
-
-def run_benchmark(executable_path, benchmark_name, verbose=False):
-    """Run a specific benchmark and capture output."""
-    print(f"Running benchmark: {benchmark_name}")
-
-    try:
-        result = subprocess.run(
-            [str(executable_path), "--benchmark", benchmark_name],
-            capture_output=True,
-            text=True,
-            timeout=600  # 10 minute timeout for larger benchmarks
-        )
-
-        if result.returncode != 0:
-            print(f"  Warning: Benchmark failed with return code {result.returncode}")
-            print(f"  stderr: {result.stderr}")
-            return None
-
-        if verbose:
-            print(f"  Raw output:\n{result.stdout}")
-
-        return result.stdout
-    except subprocess.TimeoutExpired:
-        print(f"  Benchmark timed out after 10 minutes")
-        return None
-    except Exception as e:
-        print(f"  Error running benchmark: {e}")
-        return None
-
-
-def parse_time_value(time_str):
-    """Parse time string like '668.707 us' or '6.785 ms' and convert to seconds."""
-    time_str = strip_ansi(time_str).strip()
-
-    # Match number and unit
-    match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str)
-    if not match:
-        return None
-
-    value = float(match.group(1))
-    unit = match.group(2)
-
-    # Convert to seconds
-    if unit == 'us':
-        return value / 1_000_000.0
-    elif unit == 'ms':
-        return value / 1_000.0
-    elif unit == 'ns':
-        return value / 1_000_000_000.0
-    elif unit == 's':
-        return value
-    else:
-        return value
-
-
-def parse_benchmark_output(output, verbose=False):
-    """
-    Parse the table format output from nvbench for sarbp benchmarks.
-
-    Expected format:
-    | Problem Size | ... |  GPU Time  | ...
-    |--------------|-----|------------|-----
-    |         1000 | ... | 123.456 ms | ...
-    |         2000 | ... | 987.654 ms | ...
-    """
-    results = {}
-    output = strip_ansi(output)
-    lines = output.strip().split('\n')
-
-    # Find the header line to locate GPU Time and Problem Size columns
-    gpu_time_col_idx = None
-    problem_size_col_idx = None
-    for i, line in enumerate(lines):
-        if '|' in line and 'GPU Time' in line:
-            # Split by | and find column indices
-            cols = [col.strip() for col in line.split('|')]
-            for j, col in enumerate(cols):
-                if col == 'GPU Time':
-                    gpu_time_col_idx = j
-                elif col == 'Problem Size':
-                    problem_size_col_idx = j
-            if gpu_time_col_idx is not None and problem_size_col_idx is not None:
-                if verbose:
-                    print(f"  Found GPU Time at column index {gpu_time_col_idx}, "
-                          f"Problem Size at column index {problem_size_col_idx} in: {line.rstrip()}")
-                break
-
-    if gpu_time_col_idx is None:
-        print("  Warning: Could not find GPU Time column in output")
-        return results
-
-    if problem_size_col_idx is None:
-        print("  Warning: Could not find Problem Size column in output")
-        return results
-
-    # Parse data rows
-    for line in lines:
-        if '|' not in line:
-            continue
-
-        # Skip header and separator lines
-        if 'GPU Time' in line or '---' in line or 'Problem Size' in line:
-            continue
-
-        cols = [col.strip() for col in line.split('|')]
-
-        if len(cols) <= max(gpu_time_col_idx, problem_size_col_idx):
-            continue
-
-        # Get problem size
-        problem_size_str = cols[problem_size_col_idx]
-        try:
-            problem_size = int(problem_size_str)
-        except ValueError:
-            continue
-
-        # Extract GPU time
-        gpu_time_str = cols[gpu_time_col_idx]
-        gpu_time_s = parse_time_value(gpu_time_str)
-
-        if gpu_time_s is not None:
-            if verbose:
-                print(f"  Parsed: problem_size={problem_size}, gpu_time_col={gpu_time_str!r}, value={gpu_time_s*1000:.6f} ms")
-            results[problem_size] = gpu_time_s
-        elif verbose:
-            print(f"  Warning: Could not parse GPU time from col {gpu_time_col_idx}: {gpu_time_str!r}")
-
-    return results
-
-
-def calculate_gproj_per_sec(problem_size, time_seconds):
-    """
-    Calculate gigabackprojections per second.
-
-    Each sarbp execution computes:
-    num_pulses * image_width * image_height backprojection operations
-
-    For our benchmarks: all dimensions = problem_size
-    So: operations = problem_size^3
-    """
-    operations = problem_size ** 3
-    giga_operations = operations / 1e9
-    gproj_per_sec = giga_operations / time_seconds
-    return gproj_per_sec
-
-
-def print_summary(all_results):
-    """Print a formatted summary table."""
-    print("\n")
-    print("=" * 100)
-    print("SAR BACKPROJECTION BENCHMARK SUMMARY")
-    print("=" * 100)
-    print()
-    print("Performance in Gigabackprojections per second (Gproj/s)")
-    print("Higher values indicate better performance")
-    print()
-    print(f"Note: Operations = num_pulses × image_width × image_height = problem_size³")
-    print()
-
-    # Print detailed results for each variant
-    # Use only the variants that were actually run
-    variants = sorted(all_results.keys())
-
-    for variant in variants:
-        print(f"\n{variant.upper()} Precision:")
-        print("-" * 80)
-        print(f"{'Problem Size':<15} {'Operations':<18} {'Time (ms)':<15} {'Gproj/s':<15}")
-        print("-" * 80)
-
-        for problem_size in sorted(all_results[variant].keys()):
-            time_s = all_results[variant][problem_size]
-            time_ms = time_s * 1000.0
-            operations = problem_size ** 3
-            gproj_s = calculate_gproj_per_sec(problem_size, time_s)
-
-            print(f"{problem_size:<15} {operations:<18,} {time_ms:<15.3f} {gproj_s:<15.3f}")
-
-    # Print comparative summary
-    print("\n")
-    print("=" * 100)
-    print("COMPARATIVE SUMMARY (Gproj/s)")
-    print("=" * 100)
-    print()
-
-    # Get all problem sizes
-    all_problem_sizes = set()
-    for variant_results in all_results.values():
-        all_problem_sizes.update(variant_results.keys())
-    all_problem_sizes = sorted(all_problem_sizes)
-
-    # Print header
-    header = f"{'Problem Size':<15}"
-    for variant in variants:
-        header += f" {variant:<15}"
-    print(header)
-    print("-" * 100)
-
-    # Print data rows
-    for problem_size in all_problem_sizes:
-        row = f"{problem_size:<15}"
-        for variant in variants:
-            if problem_size in all_results[variant]:
-                time_s = all_results[variant][problem_size]
-                gproj_s = calculate_gproj_per_sec(problem_size, time_s)
-                row += f" {gproj_s:<15.3f}"
-            else:
-                row += f" {'N/A':<15}"
-        print(row)
-
-    # Print relative performance (relative to float) - only if float was run
-    if 'float' in all_results:
-        print("\n")
-        print("=" * 100)
-        print("RELATIVE PERFORMANCE (float = 1.0x baseline)")
-        print("=" * 100)
-        print()
-
-        # Print header
-        header = f"{'Problem Size':<15}"
-        for variant in variants:
-            header += f" {variant:<15}"
-        print(header)
-        print("-" * 100)
-
-        # Print data rows
-        for problem_size in all_problem_sizes:
-            row = f"{problem_size:<15}"
-
-            # Get float baseline
-            float_gproj_s = None
-            if problem_size in all_results['float']:
-                time_s = all_results['float'][problem_size]
-                float_gproj_s = calculate_gproj_per_sec(problem_size, time_s)
-
-            for variant in variants:
-                if problem_size in all_results[variant]:
-                    time_s = all_results[variant][problem_size]
-                    gproj_s = calculate_gproj_per_sec(problem_size, time_s)
-
-                    if float_gproj_s is not None and float_gproj_s > 0:
-                        relative = gproj_s / float_gproj_s
-                        row += f" {relative:<15.3f}"
-                    else:
-                        row += f" {1.0:<15.3f}" if variant == 'float' else f" {'N/A':<15}"
-                else:
-                    row += f" {'N/A':<15}"
-            print(row)
-
-        print("=" * 100)
-    else:
-        print("=" * 100)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run SAR backprojection benchmarks and summarize results."
-    )
-    parser.add_argument(
-        "--build-dir",
-        type=Path,
-        default=None,
-        help="Path to the MatX build directory containing bench/bench_00_transform_sarbp. "
-             "If not specified, the current working directory is checked first, "
-             "then common locations relative to the script are searched.",
-    )
-    parser.add_argument(
-        "--verbose", "-v",
-        action="store_true",
-        help="Print verbose output including raw benchmark output and parsed values.",
-    )
-    parser.add_argument(
-        "--variants",
-        nargs="+",
-        default=None,
-        metavar="VARIANT",
-        help="Run only specific benchmark variants (e.g. float double). "
-             "Defaults to all variants: float double mixed fltflt.",
-    )
-    args = parser.parse_args()
-
-    # Find MatX build directory
-    if args.build_dir is not None:
-        build_dir = args.build_dir
-        if not build_dir.exists():
-            print(f"Error: Specified build directory does not exist: {build_dir}")
-            sys.exit(1)
-    else:
-        # Check if the current working directory looks like a valid build directory
-        # (i.e. it already contains bench/bench_* executables). This lets users run the script
-        # from any build directory without needing --build-dir.
-        cwd = Path.cwd()
-        if build_dir_contains_benchmark_exes(cwd):
-            build_dir = cwd
-        else:
-            # Fall back to searching common locations relative to the script
-            script_dir = Path(__file__).parent
-            possible_build_dirs = [
-                script_dir / "build",
-                script_dir / "repos" / "MatX" / "build",
-                script_dir / "../build",
-                script_dir / "../../build",
-            ]
-
-            build_dir = None
-            for bd in possible_build_dirs:
-                bd_resolved = bd.resolve()
-                if bd_resolved.exists() and build_dir_contains_benchmark_exes(bd_resolved):
-                    build_dir = bd_resolved
-                    break
-
-            if build_dir is None:
-                print("Error: Could not find MatX build directory")
-                print("Try running from a build directory, or use --build-dir to specify one")
-                sys.exit(1)
-
-    print(f"Using build directory: {build_dir}")
-
-    # Find benchmark executable
-    benchmark_exe = find_benchmark_executable(build_dir)
-
-    if benchmark_exe is None:
-        sys.exit(1)
-
-    print(f"Found benchmark: {benchmark_exe}")
-    print()
-
-    # List of SAR BP benchmark variants
-    all_variants = ['float', 'double', 'mixed', 'fltflt']
-    variants = args.variants if args.variants is not None else all_variants
-
-    # Validate user-provided variants
-    if args.variants is not None:
-        invalid_variants = [v for v in args.variants if v not in all_variants]
-        if invalid_variants:
-            print(f"Error: Unknown variant(s): {', '.join(invalid_variants)}")
-            print(f"Valid variants are: {', '.join(all_variants)}")
-            sys.exit(1)
-
-    all_results = {}
-
-    # Run each benchmark variant
-    for variant in variants:
-        bench_name = f"sarbp_{variant}"
-        print(f"\n{'=' * 100}")
-        output = run_benchmark(benchmark_exe, bench_name, verbose=args.verbose)
-
-        if output is None:
-            print(f"  Skipping {variant} due to error")
-            continue
-
-        # Parse results
-        results = parse_benchmark_output(output, verbose=args.verbose)
-
-        if not results:
-            print(f"  Warning: Could not parse results for {variant}")
-            print("  Raw output:")
-            print(output)
-            continue
-
-        all_results[variant] = results
-
-        # Print parsed results with Gproj/s
-        parsed_str = ', '.join([
-            f'size={size}: {time_s*1000:.3f}ms ({calculate_gproj_per_sec(size, time_s):.3f} Gproj/s)'
-            for size, time_s in sorted(results.items())
-        ])
-        print(f"  Parsed: {parsed_str}")
-
-    print(f"\n{'=' * 100}")
-
-    if not all_results:
-        print("\nError: No benchmark results collected")
-        sys.exit(1)
-
-    print(f"\nSuccessfully collected results for {len(all_results)} benchmark variants")
-
-    # Print summary
-    print_summary(all_results)
-
-if __name__ == "__main__":
-    main()
diff --git a/include/matx/kernels/fltflt.h b/include/matx/kernels/fltflt.h
index 99673f0f1..b876d7b1c 100644
--- a/include/matx/kernels/fltflt.h
+++ b/include/matx/kernels/fltflt.h
@@ -47,6 +47,10 @@ namespace matx {
 //   "Extended-Precision Floating-Point Numbers for GPU Computation", Andrew Thall,
 //   https://andrewthall.org/papers/df64_qf128.pdf
 // That paper cites key work from D. E. Knuth, T. J. Dekker, A. H. Karp and others.
+// The reference for the FPAN (floating-point accumulation network) implementation of fltflt_add() is:
+//   "High-Performance Branch-Free Algorithms for Extended-Precision Floating-Point Arithmetic",
+//   David K. Zhang and Alex Aiken, Proceedings of the International Conference for High Performance
+//   Computing, Networking, Storage and Analysis, 2025.
 
 // fltflt represents an unevaluated floating point sum of two non-overlapping fp32 components.
 // The hi component is the most significant part of the sum, and the lo component is the least significant part.
@@ -250,21 +254,40 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_two_prod_fma(
     return fltflt{ x, y };
 }
 
-// fltflt_add is the df64_add() function given by Thall. This function uses two_sum()
-// for the hi and lo components followed by addition of the cross terms and
-// re-normalization to a non-overlapping expansion.
+// FPAN-based two-term addition from Zhang & Aiken (SC'25), Figure 2.
+//
+// The Thall df64_add() form chains two FastTwoSums in series on the
+// critical path (the lo-side path traverses both), giving depth 13 fp32
+// ops. The FPAN form below runs a FastTwoSum on the two TwoSum sums
+// (s.hi and t.hi, giving q) concurrently with the add of the two TwoSum
+// error terms (st_lo), so q's 3 ops sit entirely off the critical path and
+// the lo-side path traverses only one FastTwoSum. Critical-path depth: 10 fp32 ops vs 13 for Thall.
+//
+// Both forms use the same 20 fp32 ops total, so steady-state throughput
+// is identical. The latency win shows up when the SM cannot fully hide
+// the per-call dependency chain (low occupancy, serial accumulators,
+// reduction tails). For reference, Thall's df64_add reads:
+//   fltflt s = fltflt_two_sum(a.hi, b.hi);
+//   const fltflt t = fltflt_two_sum(a.lo, b.lo);
+//   s.lo = detail::fadd_rn(s.lo, t.hi);
+//   s = fltflt_fast_two_sum(s.hi, s.lo);
+//   s.lo = detail::fadd_rn(s.lo, t.lo);
+//   s = fltflt_fast_two_sum(s.hi, s.lo);
+//   return s;
 static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_add(fltflt a, fltflt b) {
     fltflt s = fltflt_two_sum(a.hi, b.hi);
     const fltflt t = fltflt_two_sum(a.lo, b.lo);
-    s.lo = detail::fadd_rn(s.lo, t.hi);
-    s = fltflt_fast_two_sum(s.hi, s.lo);
-    s.lo = detail::fadd_rn(s.lo, t.lo);
-    s = fltflt_fast_two_sum(s.hi, s.lo);
-    return s;
+    const fltflt q = fltflt_fast_two_sum(s.hi, t.hi);
+    const float st_lo = detail::fadd_rn(s.lo, t.lo);
+    const float stq_lo = detail::fadd_rn(st_lo, q.lo);
+    return fltflt_fast_two_sum(q.hi, stq_lo);
 }
 
 // This overload is an optimization of fltflt_add() for the case where b is
-// a float, and thus b.lo is zero.
+// a float, and thus b.lo is zero. The FPAN restructuring above does not
+// apply here because there is no second TwoSum to lift; this form
+// remains the Thall-style chain (TwoSum -> add -> FastTwoSum, ~9 fp32
+// ops on the critical path).
 static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_add(fltflt a, float b) {
     fltflt s = fltflt_two_sum(a.hi, b);
     s.lo = detail::fadd_rn(s.lo, a.lo);