diff --git a/docs/sphinx/api/qec/cpp_api.rst b/docs/sphinx/api/qec/cpp_api.rst index 4c6413b7..c6de71fb 100644 --- a/docs/sphinx/api/qec/cpp_api.rst +++ b/docs/sphinx/api/qec/cpp_api.rst @@ -34,6 +34,25 @@ Detector Error Model .. doxygenfunction:: cudaq::qec::x_dem_from_memory_circuit(const code &, operation, std::size_t, cudaq::noise_model &) .. doxygenfunction:: cudaq::qec::z_dem_from_memory_circuit(const code &, operation, std::size_t, cudaq::noise_model &) +.. _dem_sampling_cpp_api: + +DEM Sampling +============ + +CPU sampling (host tensors): + +.. doxygenfunction:: cudaq::qec::dem_sampler::cpu::sample_dem(const cudaqx::tensor<uint8_t> &, std::size_t, const std::vector<double> &) +.. doxygenfunction:: cudaq::qec::dem_sampler::cpu::sample_dem(const cudaqx::tensor<uint8_t> &, std::size_t, const std::vector<double> &, unsigned) + +GPU sampling (device pointers, requires cuStabilizer): + +.. doxygenfunction:: cudaq::qec::dem_sampler::gpu::sample_dem + +Convenience wrappers (delegate to ``cpu::sample_dem``): + +.. doxygenfunction:: cudaq::qec::dem_sampling(const cudaqx::tensor<uint8_t> &, std::size_t, const std::vector<double> &) +.. doxygenfunction:: cudaq::qec::dem_sampling(const cudaqx::tensor<uint8_t> &, std::size_t, const std::vector<double> &, unsigned) + Decoder Interfaces ================== diff --git a/docs/sphinx/api/qec/python_api.rst b/docs/sphinx/api/qec/python_api.rst index 8ac30a73..bce5cae5 100644 --- a/docs/sphinx/api/qec/python_api.rst +++ b/docs/sphinx/api/qec/python_api.rst @@ -84,6 +84,13 @@ Common .. autofunction:: cudaq_qec.sample_code_capacity +.. _dem_sampling_python_api: + +DEM Sampling +============ + +.. 
autofunction:: cudaq_qec.dem_sampling + Parity Check Matrix Utilities ============================= diff --git a/docs/sphinx/components/qec/introduction.rst b/docs/sphinx/components/qec/introduction.rst index 0fca9df9..1e01f743 100644 --- a/docs/sphinx/components/qec/introduction.rst +++ b/docs/sphinx/components/qec/introduction.rst @@ -1355,3 +1355,51 @@ Additional Noise Models noise.add_all_qubit_channel( "x", cudaq::depolarization2(/*probability*/ 0.01), /*numControls*/ 1); + +DEM Sampling +^^^^^^^^^^^^ + +The ``dem_sampling`` function samples errors and syndromes from a detector error +model (DEM). Given a binary check matrix :math:`H` of shape +``[num_checks x num_error_mechanisms]`` and a vector of per-mechanism Bernoulli +probabilities, it generates random error vectors and computes +:math:`\text{syndromes} = \text{errors} \cdot H^T \pmod{2}`. + +In Python, the ``backend`` parameter (``"auto"``, ``"gpu"``, or ``"cpu"``) +controls whether sampling runs on the GPU via cuStabilizer or on the CPU. The +function accepts NumPy arrays and PyTorch CUDA tensors. In C++ the CPU and GPU +paths live in separate namespaces (``cudaq::qec::dem_sampler::cpu`` and +``cudaq::qec::dem_sampler::gpu``). + +.. tab:: Python + + .. code-block:: python + + import cudaq_qec as qec + import numpy as np + + H = np.array([[1, 1, 0], + [0, 1, 1]], dtype=np.uint8) + error_probs = np.array([0.05, 0.10, 0.05]) + + # backend="auto": GPU when available, else CPU + syndromes, errors = qec.dem_sampling( + H, num_shots=1000, error_probabilities=error_probs, seed=42) + # syndromes: uint8 [1000 x 2], errors: uint8 [1000 x 3] + +.. tab:: C++ + + .. literalinclude:: ../../examples/qec/cpp/dem_sampling.cpp + :language: cpp + :start-after: [Begin Documentation] + + Compile and run with + + .. 
code-block:: bash + + nvq++ --enable-mlir -lcudaq-qec dem_sampling.cpp -o dem_sampling + ./dem_sampling + +For a complete walkthrough including GPU acceleration, input type handling, and +backend selection details, see the +:doc:`DEM Sampling example <../../examples_rst/qec/dem_sampling>`. diff --git a/docs/sphinx/examples/qec/cpp/dem_sampling.cpp b/docs/sphinx/examples/qec/cpp/dem_sampling.cpp new file mode 100644 index 00000000..e7fd7dab --- /dev/null +++ b/docs/sphinx/examples/qec/cpp/dem_sampling.cpp @@ -0,0 +1,70 @@ +/******************************************************************************* + * Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +// [Begin Documentation] +// DEM Sampling — sample errors and syndromes from a detector error model. 
+// +// Compile and run with: +// nvq++ --enable-mlir --target=stim -lcudaq-qec dem_sampling.cpp +// ./a.out + +#include <cstdint> +#include <iostream> +#include <vector> + +#include "cudaq/qec/dem_sampling.h" + +int main() { + // [3,1] repetition code check matrix: + // H = | 1 1 0 | + // | 0 1 1 | + std::vector<uint8_t> H_data = {1, 1, 0, 0, 1, 1}; + size_t num_checks = 2; + size_t num_mechanisms = 3; + + cudaqx::tensor<uint8_t> H({num_checks, num_mechanisms}); + H.copy(H_data.data(), H.shape()); + + std::vector<double> error_probs = {0.05, 0.10, 0.05}; + size_t num_shots = 10; + unsigned seed = 42; + + // CPU sampling + auto [syndromes, errors] = + cudaq::qec::dem_sampler::cpu::sample_dem(H, num_shots, error_probs, seed); + + std::cout << "Syndromes [" << syndromes.shape()[0] << " x " + << syndromes.shape()[1] << "]:\n"; + for (size_t shot = 0; shot < num_shots; shot++) { + for (size_t c = 0; c < num_checks; c++) + std::cout << static_cast<int>(syndromes.at({shot, c})) << " "; + std::cout << "\n"; + } + + std::cout << "\nErrors [" << errors.shape()[0] << " x " << errors.shape()[1] + << "]:\n"; + for (size_t shot = 0; shot < num_shots; shot++) { + for (size_t e = 0; e < num_mechanisms; e++) + std::cout << static_cast<int>(errors.at({shot, e})) << " "; + std::cout << "\n"; + } + + // Verify: syndromes == (errors * H^T) mod 2 + bool ok = true; + for (size_t shot = 0; shot < num_shots && ok; shot++) { + for (size_t c = 0; c < num_checks && ok; c++) { + uint8_t expected = 0; + for (size_t e = 0; e < num_mechanisms; e++) + expected ^= errors.at({shot, e}) & H.at({c, e}); + if (syndromes.at({shot, c}) != expected) + ok = false; + } + } + std::cout << "\nVerification: " << (ok ? "PASSED" : "FAILED") << "\n"; + + return ok ? 
0 : 1; +} diff --git a/docs/sphinx/examples/qec/python/dem_sampling.py b/docs/sphinx/examples/qec/python/dem_sampling.py new file mode 100644 index 00000000..6242369b --- /dev/null +++ b/docs/sphinx/examples/qec/python/dem_sampling.py @@ -0,0 +1,50 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# [Begin Documentation] +import numpy as np +import cudaq_qec as qec + +# Define a check matrix for a [3,1] repetition code. +# Rows = checks (stabilizers), columns = error mechanisms. +H = np.array([[1, 1, 0], [0, 1, 1]], dtype=np.uint8) + +# Independent error probability for each mechanism. +error_probs = np.array([0.05, 0.10, 0.05]) + +num_shots = 10 + +# Sample syndromes and errors from the detector error model. +# backend="auto" (default) tries GPU first, then falls back to CPU. +syndromes, errors = qec.dem_sampling(H, num_shots, error_probs, seed=42) + +print(f"Check matrix H ({H.shape[0]} checks x {H.shape[1]} mechanisms):") +print(H) +print(f"\nError probabilities: {error_probs}") +print(f"\nSampled errors ({errors.shape}):\n{errors}") +print(f"\nSampled syndromes ({syndromes.shape}):\n{syndromes}") + +# Verify: syndromes should equal (errors @ H^T) mod 2. +expected = (errors @ H.T) % 2 +assert np.array_equal(syndromes, expected), "Mismatch!" +print("\nVerification passed: syndromes == (errors @ H^T) mod 2") + +# Reproducibility: the same seed yields the same results. 
+s1, e1 = qec.dem_sampling(H, num_shots, error_probs, seed=123) +s2, e2 = qec.dem_sampling(H, num_shots, error_probs, seed=123) +assert np.array_equal(e1, e2) +print("Reproducibility check passed: same seed -> same output") + +# Force a specific backend (cpu or gpu). +syndromes_cpu, errors_cpu = qec.dem_sampling(H, + num_shots, + error_probs, + seed=42, + backend="cpu") +print(f"\nCPU backend result shapes: syndromes {syndromes_cpu.shape}, " + f"errors {errors_cpu.shape}") diff --git a/docs/sphinx/examples_rst/qec/dem_sampling.rst b/docs/sphinx/examples_rst/qec/dem_sampling.rst new file mode 100644 index 00000000..98a5c5dc --- /dev/null +++ b/docs/sphinx/examples_rst/qec/dem_sampling.rst @@ -0,0 +1,113 @@ +.. _dem_sampling_example: + +DEM Sampling — Monte-Carlo Sampling from Detector Error Models +-------------------------------------------------------------- + +A **detector error model** (DEM) describes the probabilistic relationship +between independent error mechanisms and the detectors (syndrome bits) that +observe them. Given a binary check matrix :math:`H` and a vector of per-mechanism +error probabilities, DEM sampling generates random error vectors and the +corresponding syndromes via + +.. math:: + + \text{errors}_{ij} \sim \text{Bernoulli}(p_j), \qquad + \text{syndromes} = \text{errors} \cdot H^T \pmod{2}. + +In Python, ``cudaq_qec.dem_sampling`` provides this capability with automatic +backend selection: it uses GPU-accelerated sampling via cuStabilizer when +available and falls back to a CPU implementation otherwise. In C++ the CPU and +GPU paths are exposed as separate functions in the ``cudaq::qec::dem_sampler`` +namespace (see :ref:`dem_sampling_cpp_api`). + +Example ++++++++ + +.. tab:: Python + + .. literalinclude:: ../../examples/qec/python/dem_sampling.py + :language: python + :start-after: [Begin Documentation] + +.. tab:: C++ + + .. 
literalinclude:: ../../examples/qec/cpp/dem_sampling.cpp + :language: cpp + :start-after: [Begin Documentation] + + Compile and run with + + .. code-block:: bash + + nvq++ --enable-mlir -lcudaq-qec dem_sampling.cpp -o dem_sampling + ./dem_sampling + +GPU Acceleration +++++++++++++++++ + +When a CUDA-capable GPU is available, ``dem_sampling`` uses a fully on-device +pipeline that is significantly faster than per-shot CPU sampling, especially +for large numbers of shots and sparse error models (low probabilities): + +1. **Sparse Bernoulli sampling** — Errors are generated directly in compressed + sparse row (CSR) format. For low error probabilities the CSR representation + is compact, and the sampler skips mechanisms with zero probability entirely + rather than evaluating a Bernoulli trial for every mechanism in every shot. + +2. **GF(2) sparse-dense matrix multiply** — Syndromes are computed as + :math:`\text{errors} \times H^T \pmod{2}` using a sparse-dense multiply + over GF(2). The check matrix :math:`H^T` is stored in a bitpacked layout, + reducing memory bandwidth by 8x compared to one byte per entry. + +3. **On-device packing and unpacking** — :math:`H` is transposed and bitpacked + on the GPU in a single kernel. Syndromes are unpacked from the bitpacked + result, and the dense error matrix is produced from the CSR representation + via a fused zero-and-scatter kernel. + +The CPU path uses ``std::bernoulli_distribution`` per mechanism per shot +followed by a dense dot product for the syndrome. + +Input Types and Backend Selection ++++++++++++++++++++++++++++++++++ + +The ``backend`` parameter controls where sampling runs: + +- ``"auto"`` (default) — try GPU first, fall back to CPU. +- ``"gpu"`` — require GPU; raise ``RuntimeError`` if unavailable. +- ``"cpu"`` — always use the CPU path. + +The Python binding accepts several input types, each routed through a different +code path: + +1. 
**NumPy arrays** (most common) — When the GPU is available the bindings + automatically allocate device memory, copy inputs host-to-device, run + cuStabilizer, and copy results back as NumPy ``uint8`` arrays. With + ``backend="cpu"`` the GPU path is skipped entirely. No user action is + required beyond passing standard ``uint8`` and ``float64`` arrays. + +2. **PyTorch CUDA tensors** — The GPU path reads input device pointers directly + via ``data_ptr()`` and writes outputs into ``torch.empty`` tensors on the + same device, avoiding any host-device copies. This is the fastest path when + inputs are already on the GPU. PyTorch is an optional dependency; install + with ``pip install torch``. + +3. **PyTorch CPU tensors** — With ``backend="gpu"`` the tensors are + automatically moved to CUDA (via ``.to(device)``) before sampling. With + ``backend="auto"`` CPU tensors are rejected with an error; convert them to + NumPy with ``.numpy()`` first. + +The C++ API exposes two namespaces: + +- ``cudaq::qec::dem_sampler::cpu::sample_dem`` — takes a ``cudaqx::tensor`` + check matrix and a ``std::vector`` of probabilities; returns + ``(syndromes, errors)`` as tensors. +- ``cudaq::qec::dem_sampler::gpu::sample_dem`` — takes raw device pointers and + writes results into caller-provided device buffers; returns ``false`` if + cuStabilizer is not available at runtime. 
+ +See Also +++++++++ + +- :doc:`/api/qec/python_api` — ``dem_sampling`` Python API reference +- :doc:`/api/qec/cpp_api` — ``dem_sampler`` C++ API reference +- :doc:`/examples_rst/qec/decoders` — Decoder examples that consume syndromes diff --git a/docs/sphinx/examples_rst/qec/examples.rst b/docs/sphinx/examples_rst/qec/examples.rst index 6f371c1c..8dce404d 100644 --- a/docs/sphinx/examples_rst/qec/examples.rst +++ b/docs/sphinx/examples_rst/qec/examples.rst @@ -10,6 +10,7 @@ Examples that illustrate how to use CUDA-QX for application development are avai Code-Capacity QEC Circuit-Level QEC Decoders + DEM Sampling Real-Time Decoding AI Predecoder with CUDA-Q Realtime AI Predecoder with CUDA-Q Realtime (with FPGA Data Injection)