diff --git a/kwave/kspaceFirstOrder.py b/kwave/kspaceFirstOrder.py
index 3a71cccd..351de1b8 100644
--- a/kwave/kspaceFirstOrder.py
+++ b/kwave/kspaceFirstOrder.py
@@ -82,6 +82,40 @@ def _strip_pml(result, pml_size, ndim, suffixes=_FULL_GRID_SUFFIXES):
     }
 
 
+def _resolve_dtype(value):
+    """Normalize a dtype-like input to ``np.float32`` or ``np.float64``.
+
+    Accepts numpy dtypes/types (``np.float32``, ``np.float64``), strings
+    (``"float32"`` etc., plus MATLAB aliases ``"single"`` / ``"double"``),
+    Python ``float``, ``None`` (default → float64), and the legacy MATLAB
+    ``"off"`` alias for float64. Anything that resolves to a non-float32 /
+    non-float64 dtype raises ``ValueError`` — the solver isn't validated
+    for ``float16`` / complex dtypes.
+
+    Cupy dtypes (``cp.float32``, ``cp.float64``) work for free because cupy
+    re-exports numpy's scalar types. Torch / JAX dtypes are not accepted —
+    they live in different ecosystems and don't translate via ``np.dtype()``;
+    the error message points the user at the equivalent numpy dtype.
+    """
+    if value is None or value == "off":
+        return np.float64
+    try:
+        resolved = np.dtype(value).type
+    except TypeError as e:
+        framework = getattr(type(value), "__module__", "").split(".")[0]
+        hint = ""
+        if framework in ("torch", "jax", "jaxlib", "tensorflow"):
+            hint = f" {framework}.dtype objects aren't supported; pass the equivalent numpy dtype (np.float32 / np.float64)."
+        raise ValueError(
+            f"dtype must be a numpy dtype, type, or string (e.g. 'float32', 'single'), got {value!r}.{hint}"
+        ) from e
+    if resolved is np.float32:
+        return np.float32
+    if resolved is np.float64:
+        return np.float64
+    raise ValueError(f"dtype must resolve to float32 or float64; got {resolved.__name__} from {value!r}")
+
+
 def kspaceFirstOrder(
     kgrid: kWaveGrid,
     medium: kWaveMedium,
@@ -96,6 +130,7 @@ def kspaceFirstOrder(
     smooth_p0: bool = True,
     backend: str = "python",
     device: str = "cpu",
+    dtype=None,
     save_only: bool = False,
     data_path: Optional[str] = None,
     quiet: bool = False,
@@ -142,6 +177,24 @@ def kspaceFirstOrder(
         device: ``"cpu"`` or ``"gpu"``. For ``backend="python"`` this selects
             NumPy (cpu) vs CuPy (gpu). For ``backend="cpp"`` it selects the
             OMP vs CUDA binary. Default ``"cpu"``.
+        dtype: Numerical precision for state arrays in the Python backend.
+            Accepts dtype-like input — a numpy dtype (``np.float32``,
+            ``np.float64``; cupy aliases like ``cp.float32`` work since cupy
+            re-exports numpy's scalar types), a string (``"float32"``,
+            ``"float64"``, ``"single"``, ``"double"``), a Python type
+            (``float``), or ``None`` for the default (float64). The
+            MATLAB-style alias ``"off"`` is accepted as a synonym for
+            float64 to ease migration from the legacy
+            ``SimulationOptions.data_cast``. Torch / JAX dtypes are not
+            accepted; pass the numpy equivalent (e.g. ``np.float32`` for
+            ``torch.float32``).
+            ``np.float32`` uses roughly half the memory and is faster on
+            most hardware, at the cost of reduced numerical accuracy.
+            Only ``float32`` and ``float64`` are supported; other dtypes
+            raise ``ValueError``. Has no effect on ``backend="cpp"`` (the
+            C++ binary uses fixed internal precision regardless); a warning
+            is emitted if ``dtype`` resolves to anything other than float64
+            with the C++ backend. Default ``None`` (float64).
         save_only: When ``True`` (``backend="cpp"`` only), write the HDF5
             input file and return without running the binary. Useful for
             cluster submission. Default ``False``.
@@ -167,6 +220,7 @@ def kspaceFirstOrder(
         raise ValueError(f"device must be 'cpu' or 'gpu', got {device!r}")
     if backend not in ("python", "cpp"):
         raise ValueError(f"Unknown backend: {backend!r}. Use 'python' or 'cpp'.")
+    dtype = _resolve_dtype(dtype)
 
     if isinstance(pml_size, str) and pml_size.lower() == "auto":
         pml_size = tuple(int(x) for x in get_optimal_pml_size(kgrid))
@@ -206,6 +260,7 @@ def kspaceFirstOrder(
             pml_size=pml_size,
             pml_alpha=pml_alpha,
             quiet=quiet,
+            dtype=dtype,
         ).run()
 
     elif backend == "cpp":
@@ -215,6 +270,14 @@ def kspaceFirstOrder(
         check_alpha_mode_cpp_compatible(medium)
         warn_alpha_power_near_unity_cpp(medium)
 
+        if dtype is not np.float64:
+            warnings.warn(
+                f"dtype={np.dtype(dtype).name!r} has no effect with backend='cpp'; the C++ binary "
+                "uses fixed internal precision regardless. Use backend='python' to control "
+                "computational precision.",
+                stacklevel=2,
+            )
+
         if not use_kspace:
             warnings.warn(
                 "use_kspace=False has no effect with backend='cpp'; the C++ binary always applies k-space correction.",
diff --git a/kwave/solvers/kspace_solver.py b/kwave/solvers/kspace_solver.py
index 2ce239d0..0ec70ce4 100644
--- a/kwave/solvers/kspace_solver.py
+++ b/kwave/solvers/kspace_solver.py
@@ -32,40 +32,55 @@ def _to_cpu(x):
     return x.get() if hasattr(x, "get") else x
 
 
-def _expand_to_grid(val, grid_shape, xp, name="parameter"):
+def _array_sum(arrays):
+    """Sum arrays, preserving dtype.
+
+    ``sum(arrays)`` starts from Python ``int 0``; under numpy < 2 (NEP 50)
+    that promotes float32 inputs to float64. Starting from ``arrays[0]``
+    keeps the result's dtype equal to the elements'.
+ """ + out = arrays[0] + for a in arrays[1:]: + out = out + a + return out + + +def _expand_to_grid(val, grid_shape, xp, name="parameter", dtype=float): if val is None: raise ValueError(f"Missing required parameter: {name}") - arr = xp.array(val, dtype=float).ravel() + arr = xp.array(val, dtype=dtype).ravel() grid_size = int(np.prod(grid_shape)) if arr.size == 1: - return xp.full(grid_shape, float(arr[0]), dtype=float) + return xp.full(grid_shape, arr[0], dtype=dtype) if arr.size == grid_size: return arr.reshape(grid_shape) raise ValueError(f"{name} size {arr.size} incompatible with grid size {grid_size}") -def _build_source_op(mask_raw, signal_raw, mode, scale, *, xp, grid_shape, grid_size, source_kappa, diff_fn): +def _build_source_op(mask_raw, signal_raw, mode, scale, *, xp, grid_shape, grid_size, source_kappa, diff_fn, dtype=float): """Build a source injection operator for one field variable. Returns a callable (t, field) → field that injects scaled source values. + ``dtype`` controls the precision of the source signal buffer (matches the + field precision set by ``data_cast``). 
""" mask = xp.array(mask_raw, dtype=bool).ravel() if mask.size == 1: mask = xp.full(grid_shape, bool(mask[0]), dtype=bool).ravel() n_src = int(xp.sum(mask)) - signal_arr = xp.array(signal_raw, dtype=float) + signal_arr = xp.array(signal_raw, dtype=dtype) if signal_arr.ndim == 1: signal = signal_arr.reshape(1, -1) else: signal = signal_arr.reshape(-1, signal_arr.shape[-1]) if signal_arr.ndim > 2 else signal_arr - scaled = signal * xp.atleast_1d(xp.asarray(scale))[:, None] + scaled = signal * xp.atleast_1d(xp.asarray(scale, dtype=dtype))[:, None] signal_len = scaled.shape[1] def get_val(t): if scaled.shape[0] == 1: - return xp.full(n_src, float(scaled[0, t])) + return xp.full(n_src, scaled[0, t], dtype=dtype) return scaled[:, t] def dirichlet(t, field): @@ -76,7 +91,7 @@ def dirichlet(t, field): return flat.reshape(grid_shape) # Pre-allocate buffer to avoid per-step allocation - _src_buf = xp.zeros(grid_size, dtype=float) + _src_buf = xp.zeros(grid_size, dtype=dtype) def additive_kspace(t, field): if t >= signal_len: @@ -131,6 +146,7 @@ def __init__( pml_size=None, pml_alpha=None, quiet=False, + dtype=None, ): self.kgrid = kgrid self.medium = medium @@ -142,6 +158,25 @@ def __init__( self.quiet = quiet self._pml_size_override = pml_size self._pml_alpha_override = pml_alpha + # Compute precision for state arrays. ``None`` defaults to float64 (matches + # MATLAB k-Wave). Only float32 / float64 are validated by the solver. + if dtype is None: + self._dtype = np.float64 + elif dtype in (np.float32, np.float64): + self._dtype = dtype + else: + try: + resolved = np.dtype(dtype).type + except TypeError as e: + raise ValueError(f"dtype must be np.float32 or np.float64 (or string equivalent), got {dtype!r}") from e + if resolved not in (np.float32, np.float64): + raise ValueError(f"dtype must resolve to float32 or float64, got {resolved.__name__}") + self._dtype = resolved + # Companion complex dtype for FFT outputs. 
numpy<2 (np.fft) always upcasts + # to complex128 regardless of input precision; we cast back so the rest of + # the pipeline stays in self._dtype. Harmless on numpy 2+ and on cupy + # (which already respects input precision). + self._complex_dtype = np.complex64 if self._dtype is np.float32 else np.complex128 # kWaveGrid doesn't have pml_size_x attrs; warn if PML will silently be disabled if pml_size is None: from kwave.kgrid import kWaveGrid as _KWG @@ -190,9 +225,9 @@ def setup(self): self.Nt = int(self.kgrid.Nt) self.dt = float(self.kgrid.dt) - self.c0 = _expand_to_grid(self.medium.sound_speed, self.grid_shape, xp, "sound_speed") + self.c0 = _expand_to_grid(self.medium.sound_speed, self.grid_shape, xp, "sound_speed", dtype=self._dtype) density = getattr(self.medium, "density", None) - self.rho0 = _expand_to_grid(density if density is not None else 1000.0, self.grid_shape, xp, "density") + self.rho0 = _expand_to_grid(density if density is not None else 1000.0, self.grid_shape, xp, "density", dtype=self._dtype) self.c_ref = float(xp.max(self.c0)) self._setup_sensor_mask() @@ -356,26 +391,32 @@ def _setup_pml(self): if pml_size == 0 or pml_alpha == 0: shape = [1] * self.ndim shape[axis] = N - self.pml_list.append(xp.ones(shape, dtype=float)) - self.pml_sg_list.append(xp.ones(shape, dtype=float)) + self.pml_list.append(xp.ones(shape, dtype=self._dtype)) + self.pml_sg_list.append(xp.ones(shape, dtype=self._dtype)) else: - # dimension=2 gives shape (1, N) which we reshape for broadcasting + # dimension=2 gives shape (1, N) which we reshape for broadcasting. + # get_pml returns float64; cast so the per-step PML multiply doesn't + # upcast self.p / self.u (would silently break dtype='single'). 
                 pml = get_pml(N, dx, self.dt, self.c_ref, pml_size, pml_alpha, staggered=False, dimension=2, xp=xp)
                 pml_sg = get_pml(N, dx, self.dt, self.c_ref, pml_size, pml_alpha, staggered=True, dimension=2, xp=xp)
                 shape = [1] * self.ndim
                 shape[axis] = N
-                self.pml_list.append(pml.flatten().reshape(shape))
-                self.pml_sg_list.append(pml_sg.flatten().reshape(shape))
+                self.pml_list.append(pml.flatten().reshape(shape).astype(self._dtype))
+                self.pml_sg_list.append(pml_sg.flatten().reshape(shape).astype(self._dtype))
 
     def _setup_kspace_operators(self):
         """Build k-space gradient/divergence operators for each dimension."""
         xp = self.xp
         self.k_list = []
-        # First pass: build k-vectors for each dimension
+        # First pass: build k-vectors for each dimension.
+        # ``fftfreq`` returns float64 by default; cast so the k-space operators
+        # (kappa, op_grad_list, op_div_list, _k_mag) match self._dtype. Without
+        # this cast, _diff's FFT round-trip with a float64 op upcasts the
+        # float32 field back to float64 -- silently breaking dtype='single'.
         for axis, (N, dx) in enumerate(zip(self.grid_shape, self.spacing)):
-            k = 2 * np.pi * xp.fft.fftfreq(N, d=dx)
+            k = (2 * np.pi * xp.fft.fftfreq(N, d=dx)).astype(self._dtype)
             shape = [1] * self.ndim
             shape[axis] = N
             self.k_list.append(k.reshape(shape))
@@ -424,7 +465,7 @@ def _alpha_neper_and_power(self):
         if not _is_enabled(getattr(self.medium, "alpha_coeff", 0)):
             return None, None
         alpha_power = float(self.xp.array(getattr(self.medium, "alpha_power", 1.5)).flatten()[0])
-        alpha_coeff = _expand_to_grid(self.medium.alpha_coeff, self.grid_shape, self.xp, "alpha_coeff")
+        alpha_coeff = _expand_to_grid(self.medium.alpha_coeff, self.grid_shape, self.xp, "alpha_coeff", dtype=self._dtype)
         return db2neper(alpha_coeff, alpha_power), alpha_power
 
     def _init_absorption(self, alpha_np, alpha_power):
@@ -504,9 +545,9 @@ def _init_nonlinearity(self):
             self._nonlinearity = lambda rho: 0
             self._nl_factor = lambda rho_split: 1.0
         else:
-            self.BonA = _expand_to_grid(BonA_raw, self.grid_shape, self.xp, "BonA")
+            self.BonA = _expand_to_grid(BonA_raw, self.grid_shape, self.xp, "BonA", dtype=self._dtype)
             self._nonlinearity = lambda rho: self.BonA * rho**2 / (2 * self.rho0)
-            self._nl_factor = lambda rho_split: (2 * sum(rho_split) + self.rho0) / self.rho0
+            self._nl_factor = lambda rho_split: (2 * _array_sum(rho_split) + self.rho0) / self.rho0
 
     def _setup_source_operators(self):
         """Build time-varying source injection operators.
@@ -560,6 +601,7 @@ def build_op(mask_raw, signal_raw, mode, scale):
                 grid_size=grid_size,
                 source_kappa=self.source_kappa,
                 diff_fn=self._diff,
+                dtype=self._dtype,
             )
 
         # --- Pressure source (per-axis spacing for non-isotropic grids) ---
@@ -606,10 +648,10 @@ def _setup_fields(self):
         """Initialize pressure, velocity, and density fields."""
         xp = self.xp
 
-        self.p = xp.zeros(self.grid_shape, dtype=float)
-        self.u = [xp.zeros(self.grid_shape, dtype=float) for _ in range(self.ndim)]
+        self.p = xp.zeros(self.grid_shape, dtype=self._dtype)
+        self.u = [xp.zeros(self.grid_shape, dtype=self._dtype) for _ in range(self.ndim)]
         # Split density per dimension enables independent PML absorption in each direction
-        self.rho_split = [xp.zeros(self.grid_shape, dtype=float) for _ in range(self.ndim)]
+        self.rho_split = [xp.zeros(self.grid_shape, dtype=self._dtype) for _ in range(self.ndim)]
 
         if self.use_sg:
             self.rho0_staggered = [self._stagger(self.rho0, axis) for axis in range(self.ndim)]
@@ -623,12 +665,12 @@ def _setup_fields(self):
         # Sensor data storage (sized based on record_start_index)
         self.sensor_data = {}
         if "p" in self.record:
-            self.sensor_data["p"] = xp.zeros((self.n_sensor_points, self.num_recorded_time_points), dtype=float)
+            self.sensor_data["p"] = xp.zeros((self.n_sensor_points, self.num_recorded_time_points), dtype=self._dtype)
         for a in "xyz"[: self.ndim]:
             for suffix in ("", "_staggered"):
                 v = f"u{a}{suffix}"
                 if v in self.record:
-                    self.sensor_data[v] = xp.zeros((self.n_sensor_points, self.num_recorded_time_points), dtype=float)
+                    self.sensor_data[v] = xp.zeros((self.n_sensor_points, self.num_recorded_time_points), dtype=self._dtype)
 
         # Spectral shift: move velocity from staggered (mid-cell) to collocated (pressure) grid
         self.unstagger_ops = [xp.exp(-1j * self.k_list[ax] * self.spacing[ax] / 2) for ax in range(self.ndim)]
@@ -636,12 +678,12 @@ def _setup_fields(self):
         # Initial pressure source (p0)
         p0_raw = getattr(self.source, "p0", 0)
         if _is_enabled(p0_raw):
-            p0 = _expand_to_grid(p0_raw, self.grid_shape, xp, "p0")
+            p0 = _expand_to_grid(p0_raw, self.grid_shape, xp, "p0", dtype=self._dtype)
             if self.smooth_p0 and self.ndim >= 2:
                 from kwave.utils.filters import smooth
 
                 # smooth() is order-agnostic (uses FFT on shape)
-                p0 = xp.asarray(smooth(_to_cpu(p0), restore_max=True))
+                p0 = xp.asarray(smooth(_to_cpu(p0), restore_max=True), dtype=self._dtype)
             self._p0_initial = p0
         else:
             self._p0_initial = None
@@ -657,16 +699,16 @@ def step(self):
 
         # Momentum equation: du_i/dt = -grad_i(p)/rho, with PML
         # Share forward FFT of p across all gradient axes
-        P = xp.fft.fftn(self.p)
+        P = xp.fft.fftn(self.p).astype(self._complex_dtype, copy=False)
         for i in range(self.ndim):
             pml_sg = self.pml_sg_list[i]
-            grad_p_i = xp.real(xp.fft.ifftn(self.op_grad_list[i] * P))
+            grad_p_i = xp.real(xp.fft.ifftn(self.op_grad_list[i] * P)).astype(self._dtype, copy=False)
             self.u[i] = pml_sg * (pml_sg * self.u[i] - self.dt_over_rho0[i] * grad_p_i)
             self.u[i] = self._source_u_ops[i](self.t, self.u[i])
 
         # Mass conservation: drho_i/dt = -rho0 * div_i(u_i) * nl_factor, with PML
         nl_factor = self._nl_factor(self.rho_split)
-        div_u_total = xp.zeros(self.grid_shape, dtype=float)
+        div_u_total = xp.zeros(self.grid_shape, dtype=self._dtype)
         for i in range(self.ndim):
             pml = self.pml_list[i]
             div_u_i = self._diff(self.u[i], self.op_div_list[i])
@@ -675,7 +717,7 @@ def step(self):
             self.rho_split[i] = self._source_p_ops[i](self.t, self.rho_split[i])
 
         # Equation of state: p = c0^2 * (rho + absorption - dispersion + nonlinearity)
-        rho_total = sum(self.rho_split)
+        rho_total = _array_sum(self.rho_split)
         self.p = self.c0_sq * (
             rho_total + self._absorption(div_u_total) - self._dispersion(rho_total) + self._nonlinearity(rho_total)
         )
@@ -697,7 +739,7 @@ def step(self):
             self.sensor_data["p"][:, file_index] = self._extract(self.p)
         for i, a in enumerate("xyz"[: self.ndim]):
             if f"u{a}" in self.sensor_data:  # non-staggered (collocated with pressure)
-                shifted = xp.real(xp.fft.ifftn(self.unstagger_ops[i] * xp.fft.fftn(self.u[i])))
+                shifted = xp.real(xp.fft.ifftn(self.unstagger_ops[i] * xp.fft.fftn(self.u[i]))).astype(self._dtype, copy=False)
                 self.sensor_data[f"u{a}"][:, file_index] = self._extract(shifted)
             if f"u{a}_staggered" in self.sensor_data:  # raw staggered grid
                 self.sensor_data[f"u{a}_staggered"][:, file_index] = self._extract(self.u[i])
@@ -751,7 +793,7 @@ def _diff(self, f, op):
         if op is None:
             return f
         xp = self.xp
-        return xp.real(xp.fft.ifftn(op * xp.fft.fftn(f)))
+        return xp.real(xp.fft.ifftn(op * xp.fft.fftn(f))).astype(self._dtype, copy=False)
 
     def _stagger(self, arr, axis):
         """Compute staggered grid values (average neighbors along axis)."""
diff --git a/tests/test_data_cast.py b/tests/test_data_cast.py
new file mode 100644
index 00000000..5eda1705
--- /dev/null
+++ b/tests/test_data_cast.py
@@ -0,0 +1,299 @@
+"""Tests for the ``dtype`` parameter on the modern ``kspaceFirstOrder()`` API.
+
+Issue #695: expose precision control on the modern API.
+
+The Python backend honors ``dtype`` (numpy-style dtype-like input):
+  - ``None`` / ``np.float64`` / ``"float64"`` / ``"double"`` / ``float`` /
+    ``"off"`` (legacy MATLAB alias) → float64 (default)
+  - ``np.float32`` / ``"float32"`` / ``"single"`` → float32
+
+The C++ backend uses fixed internal precision regardless; setting ``dtype``
+to anything other than float64 warns.
+""" +from copy import deepcopy + +import numpy as np +import pytest + +from kwave.data import Vector +from kwave.kgrid import kWaveGrid +from kwave.kmedium import kWaveMedium +from kwave.ksensor import kSensor +from kwave.ksource import kSource +from kwave.kspaceFirstOrder import kspaceFirstOrder + + +def _build_minimal(): + N = Vector([48, 48]) + dx = Vector([0.1e-3, 0.1e-3]) + kgrid = kWaveGrid(N, dx) + kgrid.makeTime(1500) + medium = kWaveMedium(sound_speed=1500 * np.ones(tuple(N)), density=1000 * np.ones(tuple(N))) + source = kSource() + source.p_mask = np.zeros(tuple(N), dtype=bool) + source.p_mask[N.x // 2, N.y // 2] = True + t = kgrid.t_array.squeeze() + source.p = (np.sin(2 * np.pi * 1e6 * t) * np.exp(-((t - 5e-7) ** 2) / (2e-7) ** 2))[np.newaxis, :] + sensor = kSensor(mask=np.ones(tuple(N), dtype=bool)) + return kgrid, medium, source, sensor + + +# Cover every input form the resolver should accept. +_FLOAT64_INPUTS = [None, np.float64, "float64", "double", float, "off", np.dtype("f8")] +_FLOAT32_INPUTS = [np.float32, "float32", "single", np.dtype("f4")] + + +@pytest.mark.parametrize("dtype_arg", _FLOAT64_INPUTS, ids=lambda x: repr(x)) +def test_python_backend_float64_inputs(dtype_arg): + """All float64-equivalent inputs must produce float64 output across every recorded field.""" + kgrid, medium, source, sensor = _build_minimal() + sensor.record = ("p", "p_final", "p_max", "p_min", "p_rms") + result = kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + dtype=dtype_arg, + pml_inside=False, + smooth_p0=False, + quiet=True, + ) + for field in ("p", "p_final", "p_max", "p_min", "p_rms"): + arr = np.asarray(result[field]) + assert arr.dtype == np.float64, f"dtype={dtype_arg!r}: {field} produced {arr.dtype}, expected float64" + assert not np.any(np.isnan(result["p"])) + assert np.nanmax(np.abs(result["p"])) > 0 + + +@pytest.mark.parametrize("dtype_arg", _FLOAT32_INPUTS, ids=lambda x: 
repr(x)) +def test_python_backend_float32_inputs(dtype_arg): + """All float32-equivalent inputs must produce float32 output across every recorded field. + + Includes p_final + aggregates (p_max/min/rms) — these caught a real dtype-drift bug + where k-space operators (kappa, op_grad/div_list) and the PML arrays were float64 + by default and silently upcast self.p back to float64 mid-step(). + """ + kgrid, medium, source, sensor = _build_minimal() + sensor.record = ("p", "p_final", "p_max", "p_min", "p_rms") + result = kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + dtype=dtype_arg, + pml_inside=False, + smooth_p0=False, + quiet=True, + ) + for field in ("p", "p_final", "p_max", "p_min", "p_rms"): + arr = np.asarray(result[field]) + assert arr.dtype == np.float32, f"dtype={dtype_arg!r}: {field} produced {arr.dtype}, expected float32" + assert not np.any(np.isnan(result["p"])) + assert np.nanmax(np.abs(result["p"])) > 0 + + +def test_default_dtype_is_float64(): + """Calling without dtype must produce float64 (back-compat with pre-#716 behavior).""" + kgrid, medium, source, sensor = _build_minimal() + result = kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + pml_inside=False, + smooth_p0=False, + quiet=True, + ) + assert np.asarray(result["p"]).dtype == np.float64 + + +@pytest.mark.parametrize("bad", [np.float16, np.complex64, "float16", "complex64", "quad", 42, "garbage"]) +def test_invalid_dtype_raises(bad): + """Non-float64/float32 dtypes (and unparseable values) must raise ValueError.""" + kgrid, medium, source, sensor = _build_minimal() + with pytest.raises(ValueError, match="dtype"): + kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + dtype=bad, + ) + + +def test_python_backend_dtype_preserved_for_staggered_velocity(): 
+ """Recorded staggered velocity (``ux_staggered`` etc.) must preserve dtype. + + The staggered-grid extraction goes through ``unstagger_ops * fftn(u)`` -> + ``ifftn`` -> ``.real``. In numpy < 2 ``np.fft.fftn`` returns complex128 + regardless of input precision, so without an explicit cast the staggered + output would silently come back as float64 even when ``dtype=np.float32``. + """ + kgrid, medium, source, sensor = _build_minimal() + sensor.record = ("p", "ux_staggered", "uy_staggered") + result = kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + dtype=np.float32, + pml_inside=False, + smooth_p0=False, + quiet=True, + ) + for field in ("p", "ux_staggered", "uy_staggered"): + arr = np.asarray(result[field]) + assert arr.dtype == np.float32, f"{field} produced {arr.dtype}, expected float32" + + +def test_python_backend_dtype_preserved_with_nonlinearity(): + """BonA path must preserve float32 dtype. + + Regression guard for the third dtype-drift bug: ``sum(rho_split)`` in + ``_nl_factor`` and the equation-of-state line starts with Python ``int 0``, + which under numpy < 2 (NEP 50) promotes float32 to float64. Fixed by + using ``_array_sum`` which starts from the first element. + """ + kgrid, medium, source, sensor = _build_minimal() + medium.BonA = 6.0 # water-like nonlinearity + sensor.record = ("p", "p_final", "p_max") + result = kspaceFirstOrder( + kgrid=kgrid, + medium=medium, + source=deepcopy(source), + sensor=sensor, + backend="python", + device="cpu", + dtype=np.float32, + pml_inside=False, + smooth_p0=False, + quiet=True, + ) + for field in ("p", "p_final", "p_max"): + arr = np.asarray(result[field]) + assert arr.dtype == np.float32, f"With nonlinearity, {field} produced {arr.dtype}, expected float32" + + +def test_torch_like_dtype_gets_helpful_error(): + """Non-numpy framework dtypes get a hint pointing to numpy equivalents. 
+
+    Uses a fake stand-in (`__module__ = "torch"`) to avoid importing torch.
+    """
+    kgrid, medium, source, sensor = _build_minimal()
+
+    class FakeTorchDtype:
+        pass
+
+    FakeTorchDtype.__module__ = "torch"
+    with pytest.raises(ValueError, match=r"torch.*pass the equivalent numpy dtype"):
+        kspaceFirstOrder(
+            kgrid=kgrid,
+            medium=medium,
+            source=deepcopy(source),
+            sensor=sensor,
+            backend="python",
+            device="cpu",
+            dtype=FakeTorchDtype(),
+        )
+
+
+def test_python_single_vs_double_numerical_agreement():
+    """Single and double precision runs must agree to within float32 tolerance.
+
+    Sanity check that float32 runs aren't producing garbage. Float32 has
+    ~7 decimal digits, so a relative tolerance of ~1e-4 is reasonable for
+    a short propagation in a uniform medium.
+    """
+    kgrid, medium, source, sensor = _build_minimal()
+    p_double = np.asarray(
+        kspaceFirstOrder(
+            kgrid=kgrid,
+            medium=medium,
+            source=deepcopy(source),
+            sensor=sensor,
+            backend="python",
+            device="cpu",
+            dtype=np.float64,
+            pml_inside=False,
+            smooth_p0=False,
+            quiet=True,
+        )["p"]
+    )
+    p_single = np.asarray(
+        kspaceFirstOrder(
+            kgrid=kgrid,
+            medium=medium,
+            source=deepcopy(source),
+            sensor=sensor,
+            backend="python",
+            device="cpu",
+            dtype=np.float32,
+            pml_inside=False,
+            smooth_p0=False,
+            quiet=True,
+        )["p"]
+    )
+    scale = np.max(np.abs(p_double))
+    abs_err = np.max(np.abs(p_double.astype(np.float64) - p_single.astype(np.float64)))
+    assert abs_err / scale < 1e-4, f"single-vs-double max relative error {abs_err / scale:.2e} > 1e-4"
+
+
+def test_cpp_backend_warns_on_non_float64_dtype(tmp_path):
+    """Setting dtype=np.float32 with backend='cpp' must warn (binary uses fixed precision)."""
+    kgrid, medium, source, sensor = _build_minimal()
+    import subprocess
+
+    with pytest.warns(UserWarning, match="dtype.*has no effect.*cpp"):
+        try:
+            kspaceFirstOrder(
+                kgrid=kgrid,
+                medium=medium,
+                source=deepcopy(source),
+                sensor=sensor,
+                backend="cpp",
+                device="cpu",
+                dtype=np.float32,
+                pml_inside=False,
+                smooth_p0=False,
+                pml_alpha=10,
+                data_path=str(tmp_path),
+                quiet=True,
+            )
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            # Binary not available on this box; warning still must have fired
+            pass
+
+
+def test_cpp_backend_silent_on_default_dtype(tmp_path, recwarn):
+    """Default dtype (None → float64) must not warn on the C++ backend."""
+    kgrid, medium, source, sensor = _build_minimal()
+    import subprocess
+
+    try:
+        kspaceFirstOrder(
+            kgrid=kgrid,
+            medium=medium,
+            source=deepcopy(source),
+            sensor=sensor,
+            backend="cpp",
+            device="cpu",
+            # dtype left at default
+            pml_inside=False,
+            smooth_p0=False,
+            pml_alpha=10,
+            data_path=str(tmp_path),
+            quiet=True,
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        pass
+    assert not any("dtype" in str(w.message) and "has no effect" in str(w.message) for w in recwarn.list)