diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..9878f6a5
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,64 @@
+# Benchmarks
+
+This directory contains performance benchmarks for k-wave-python. These are not standard examples: they are intended to measure runtime and memory behavior and can become expensive as the grid size grows.
+
+## 3D Solver Scaling Benchmark
+
+`benchmark.py` ports MATLAB k-Wave's `benchmark.m`. It runs `kspaceFirstOrder` on a sequence of 3D grids with increasing sizes, averages runtime over repeated runs, and saves partial results after each run.
+
+The default benchmark uses:
+
+- heterogeneous absorbing medium
+- smoothed initial pressure ball source
+- binary sensor mask built from 100 Cartesian points on a sphere
+- 1000 time steps
+- 3 averages per grid size
+- grid sizes based on MATLAB's original scale arrays, starting at `32 x 32 x 32`
+
+By default, this can run for a long time and may stop once memory limits are reached.
+
+## Usage
+
+Run a small smoke benchmark:
+
+```bash
+uv run benchmarks/benchmark.py --max-cases 1 --num-averages 1 --number-time-points 20
+```
+
+Run the default CPU benchmark:
+
+```bash
+uv run benchmarks/benchmark.py
+```
+
+Run with single-precision arrays:
+
+```bash
+uv run benchmarks/benchmark.py --data-cast single
+```
+
+Run on the Python GPU backend:
+
+```bash
+uv run benchmarks/benchmark.py --device gpu
+```
+
+Choose an output file:
+
+```bash
+uv run benchmarks/benchmark.py --output-path benchmark_data.json
+```
+
+## Output
+
+The benchmark writes a JSON file containing:
+
+- `comp_size`: total grid points for each completed grid size
+- `comp_time`: rolling average elapsed seconds for each grid size
+- `options`: benchmark settings and environment metadata
+- `output_path`: path to the JSON output file
+- `error_reached`: whether the benchmark stopped after an exception
+- `error_message`: exception message, if any
+- `mem_usage`: optional process peak memory estimate when `--report-mem-usage` is set
+
+Partial results are saved after each run so completed timings are preserved if a later grid fails.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..522d2cce
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""Performance benchmarks ported from MATLAB k-Wave."""
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 00000000..7594dd5b
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,181 @@
+"""
+k-Wave 3D Performance Benchmark
+
+Ported from: k-Wave/benchmark.m
+
+Runs a sequence of 3D initial-value simulations with increasing grid sizes and
+records average execution time for each grid.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from time import perf_counter
+from typing import Any, Callable
+
+from benchmarks.helpers import (
+    BenchmarkOptions,
+    ChildPeakMemorySampler,
+    PeakMemorySampler,
+    build_case,
+    current_memory_bytes,
+    default_output_path,
+    grid_sizes,
+    options_payload,
+    rolling_average,
+    save_results,
+    store_case_result,
+    validate_memory_bytes,
+)
+from kwave.kspaceFirstOrder import kspaceFirstOrder
+
+
+def run(
+    options: BenchmarkOptions = BenchmarkOptions(),
+    *,
+    backend: str = "python",
+    device: str = "cpu",
+    max_cases: int | None = None,
+    output_path: str | Path | None = None,
+    quiet: bool = True,
+    solver: Callable[..., Any] | None = None,
+    timer: Callable[[], float] = perf_counter,
+    memory_reader: Callable[[], float] = current_memory_bytes,
+    memory_sampling_interval: float = 0.05,
+    mem_sampler_factory: Callable[[], Any] | None = None,
+) -> dict[str, Any]:
+    solver = kspaceFirstOrder if solver is None else solver
+    cases = grid_sizes(options)
+    if max_cases is not None:
+        if max_cases <= 0:
+            raise ValueError("max_cases must be positive")
+        cases = cases[:max_cases]
+
+    # Sampler picks itself based on backend: cpp runs the simulation in a
+    # subprocess, so Python-process RSS sampling is meaningless — use
+    # getrusage(RUSAGE_CHILDREN) instead. Override via mem_sampler_factory
+    # for tests.
+    def _default_sampler_factory() -> Any:
+        if backend == "cpp":
+            return ChildPeakMemorySampler()
+        return PeakMemorySampler(memory_reader, memory_sampling_interval)
+
+    sampler_factory = mem_sampler_factory or _default_sampler_factory
+
+    path = default_output_path(options) if output_path is None else Path(output_path)
+    result: dict[str, Any] = {
+        "comp_size": [],
+        "comp_time": [],
+        "options": options_payload(options, backend, device),
+        "output_path": str(path),
+        "error_reached": False,
+        "error_message": "",
+    }
+    if options.report_mem_usage:
+        # Probe early so unsupported combinations (e.g. Windows + cpp) fail
+        # before any output file is written.
+        try:
+            sampler_factory()
+        except NotImplementedError as exc:
+            raise ValueError(str(exc)) from exc
+        if backend != "cpp":
+            # For python backend, also probe the reader so platform-missing
+            # /proc or ps surfaces here rather than mid-simulation.
+            try:
+                validate_memory_bytes(memory_reader())
+            except Exception as exc:
+                raise ValueError("report_mem_usage is not supported on this platform") from exc
+        result["mem_usage"] = []
+
+    for case_index, (nx, ny, nz, scale) in enumerate(cases):
+        loop_time = 0.0
+        loop_mem_usage = 0.0
+        try:
+            kgrid, medium, source, sensor = build_case(options, nx, ny, nz, scale)
+            for loop_num in range(1, options.num_averages + 1):
+                if options.report_mem_usage:
+                    with sampler_factory() as memory_sampler:
+                        start = timer()
+                        solver(
+                            kgrid,
+                            medium,
+                            source,
+                            sensor,
+                            backend=backend,
+                            device=device,
+                            quiet=quiet,
+                            pml_size=options.pml_size,
+                            pml_inside=options.pml_inside,
+                            smooth_p0=False,
+                        )
+                        elapsed_time = timer() - start
+                    loop_mem_usage = rolling_average(loop_mem_usage, memory_sampler.peak_bytes, loop_num)
+                else:
+                    start = timer()
+                    solver(
+                        kgrid,
+                        medium,
+                        source,
+                        sensor,
+                        backend=backend,
+                        device=device,
+                        quiet=quiet,
+                        pml_size=options.pml_size,
+                        pml_inside=options.pml_inside,
+                        smooth_p0=False,
+                    )
+                    elapsed_time = timer() - start
+                loop_time = rolling_average(loop_time, elapsed_time, loop_num)
+                store_case_result(result, case_index, nx * ny * nz, loop_time, loop_mem_usage, options.report_mem_usage)
+                save_results(path, result)
+        except Exception as exc:
+            result["error_reached"] = True
+            result["error_message"] = str(exc)
+            save_results(path, result)
+            break
+
+    return result
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run the k-Wave 3D performance benchmark.")
+    parser.add_argument("--data-cast", choices=("off", "single"), default="off")
+    parser.add_argument("--backend", choices=("python", "cpp"), default="python")
+    parser.add_argument("--device", choices=("cpu", "gpu"), default="cpu")
+    parser.add_argument("--max-cases", type=int, default=None)
+    parser.add_argument("--num-averages", type=int, default=3)
+    parser.add_argument("--number-time-points", type=int, default=1000)
+    parser.add_argument("--output-path", type=Path, default=None)
+    parser.add_argument("--report-mem-usage", action="store_true")
+    parser.add_argument("--verbose", action="store_true")
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = _parse_args()
+    benchmark_options = BenchmarkOptions(
+        data_cast=args.data_cast,
+        num_averages=args.num_averages,
+        number_time_points=args.number_time_points,
+        report_mem_usage=args.report_mem_usage,
+    )
+    result = run(
+        benchmark_options,
+        backend=args.backend,
+        device=args.device,
+        max_cases=args.max_cases,
+        output_path=args.output_path,
+        quiet=not args.verbose,
+    )
+    print(f"Benchmark results saved to {result['output_path']}")
+    if result["error_reached"]:
+        print("Memory limit reached or error encountered, exiting benchmark. Error message:", file=sys.stderr)
+        print(f"  {result['error_message']}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/helpers.py b/benchmarks/helpers.py
new file mode 100644
index 00000000..9290d320
--- /dev/null
+++ b/benchmarks/helpers.py
@@ -0,0 +1,357 @@
+from __future__ import annotations
+
+import json
+import math
+import os
+import platform
+import subprocess
+import threading
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Callable
+
+import numpy as np
+
+import kwave
+from kwave.data import Vector
+from kwave.kgrid import kWaveGrid
+from kwave.kmedium import kWaveMedium
+from kwave.ksensor import kSensor
+from kwave.ksource import kSource
+from kwave.utils.conversion import cart2grid
+from kwave.utils.filters import smooth
+from kwave.utils.mapgen import make_ball, make_cart_sphere
+
+
+@dataclass(frozen=True)
+class BenchmarkOptions:
+    data_cast: str = "off"
+    heterogeneous_media: bool = True
+    absorbing_media: bool = True
+    nonlinear_media: bool = False
+    binary_sensor_mask: bool = True
+    number_sensor_points: int = 100
+    number_time_points: int = 1000
+    num_averages: int = 3
+    start_size: int = 32
+    x_scale_array: tuple[int, ...] = (1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16)
+    y_scale_array: tuple[int, ...] = (1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16)
+    z_scale_array: tuple[int, ...] = (1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8)
+    domain_size: float = 22e-3
+    sensor_radius: float = 10e-3
+    pml_size: int = 10
+    pml_inside: bool = True
+    report_mem_usage: bool = False
+
+    def __post_init__(self) -> None:
+        if self.data_cast not in {"off", "single"}:
+            raise ValueError("data_cast must be 'off' or 'single'. Use device='gpu' to run on a GPU.")
+        if not (len(self.x_scale_array) == len(self.y_scale_array) == len(self.z_scale_array)):
+            raise ValueError("scale arrays must have the same length")
+        if self.number_time_points <= 0 or self.num_averages <= 0:
+            raise ValueError("number_time_points and num_averages must be positive")
+        if self.start_size <= 0:
+            raise ValueError("start_size must be positive")
+        if self.number_sensor_points <= 1:
+            raise ValueError("number_sensor_points must be greater than 1")
+        if self.domain_size <= 0:
+            raise ValueError("domain_size must be positive")
+        if self.sensor_radius <= 0:
+            raise ValueError("sensor_radius must be positive")
+        if self.pml_size < 0:
+            raise ValueError("pml_size must be non-negative")
+
+    @property
+    def dtype(self) -> type[np.floating[Any]]:
+        return np.float32 if self.data_cast == "single" else np.float64
+
+
+def grid_sizes(options: BenchmarkOptions = BenchmarkOptions()) -> list[tuple[int, int, int, int]]:
+    return [
+        (
+            options.start_size * xscale,
+            options.start_size * yscale,
+            options.start_size * zscale,
+            min(xscale, yscale, zscale),
+        )
+        for xscale, yscale, zscale in zip(options.x_scale_array, options.y_scale_array, options.z_scale_array)
+    ]
+
+
+def build_case(options: BenchmarkOptions, nx: int, ny: int, nz: int, scale: int) -> tuple[kWaveGrid, kWaveMedium, kSource, kSensor]:
+    dtype = options.dtype
+    dx = options.domain_size / nx
+    dy = options.domain_size / ny
+    dz = options.domain_size / nz
+    kgrid = kWaveGrid(Vector([nx, ny, nz]), Vector([dx, dy, dz]))
+
+    c0 = dtype(1500)
+    rho0 = dtype(1000)
+    alpha_coeff = dtype(0.75)
+    alpha_power = dtype(1.5)
+
+    if options.heterogeneous_media:
+        sound_speed = c0 * np.ones((nx, ny, nz), dtype=dtype)
+        # MATLAB: sound_speed(1:Nx/4, :, :)  →  Python: [:nx//4, :, :] (head slice, no -1).
+        sound_speed[: nx // 4, :, :] = c0 * dtype(1.2)
+        density = rho0 * np.ones((nx, ny, nz), dtype=dtype)
+        # MATLAB: density(:, Ny/4:end, :)  →  Python: [:, ny//4-1:, :] (tail slice — the
+        # -1 converts MATLAB's 1-indexed inclusive start to Python's 0-indexed start).
+        # max(..., 0) guards tiny grids where ny//4 - 1 would be negative.
+        density[:, max(ny // 4 - 1, 0) :, :] = rho0 * dtype(1.2)
+    else:
+        sound_speed = np.array(c0, dtype=dtype)
+        density = np.array(rho0, dtype=dtype)
+
+    medium = kWaveMedium(sound_speed=sound_speed, density=density)
+    if options.absorbing_media:
+        medium.alpha_coeff = alpha_coeff
+        medium.alpha_power = alpha_power
+    if options.nonlinear_media:
+        medium.BonA = dtype(6)
+
+    source = kSource()
+    # make_ball treats the supplied center as 1-indexed internally (mapgen.py),
+    # so passing nx//2 (= MATLAB's Nx/2 1-indexed value) yields the same centroid.
+    source.p0 = dtype(10) * make_ball(Vector([nx, ny, nz]), Vector([nx // 2, ny // 2, nz // 2]), 2 * scale)
+    # smooth() upcasts to float64 via the FFT path even when given float32;
+    # the trailing .astype(dtype) restores user-requested precision.
+    source.p0 = smooth(source.p0.astype(dtype, copy=False), restore_max=True).astype(dtype, copy=False)
+
+    sensor_mask = make_cart_sphere(options.sensor_radius, options.number_sensor_points)
+    if options.binary_sensor_mask:
+        sensor_mask, _, _ = cart2grid(kgrid, sensor_mask, order="C")
+        sensor_mask = sensor_mask.astype(bool)
+    sensor = kSensor(mask=sensor_mask)
+    sensor.record = ["p"]
+
+    kgrid.makeTime(np.max(np.asarray(medium.sound_speed)))
+    kgrid.setTime(options.number_time_points, kgrid.dt)
+
+    return kgrid, medium, source, sensor
+
+
+def default_output_path(options: BenchmarkOptions) -> Path:
+    computer_name = platform.node() or "unknown-computer"
+    date = datetime.now().strftime("%Y%m%d-%H%M%S")
+    return Path(f"benchmark_data-{computer_name}-{options.data_cast}-{date}.json")
+
+
+def rolling_average(previous_average: float, new_value: float, count: int) -> float:
+    return (previous_average * (count - 1) + new_value) / count
+
+
+def store_case_result(
+    result: dict[str, Any], case_index: int, comp_size: int, comp_time: float, mem_usage: float, report_mem_usage: bool
+) -> None:
+    if case_index == len(result["comp_size"]):
+        result["comp_size"].append(comp_size)
+        result["comp_time"].append(comp_time)
+        if report_mem_usage:
+            result["mem_usage"].append(mem_usage)
+        return
+
+    result["comp_size"][case_index] = comp_size
+    result["comp_time"][case_index] = comp_time
+    if report_mem_usage:
+        result["mem_usage"][case_index] = mem_usage
+
+
+def save_results(path: Path, result: dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = {
+        "comp_size": [int(size) for size in result["comp_size"]],
+        "comp_time": [float(time) for time in result["comp_time"]],
+        "options": result["options"],
+        "output_path": result["output_path"],
+        "error_reached": bool(result["error_reached"]),
+        "error_message": result["error_message"],
+    }
+    if "mem_usage" in result:
+        payload["mem_usage"] = [float(usage) for usage in result["mem_usage"]]
+    path.write_text(json.dumps(payload, indent=2, allow_nan=False) + "\n")
+
+
+def options_payload(options: BenchmarkOptions, backend: str, device: str) -> dict[str, Any]:
+    payload = asdict(options)
+    payload.update(
+        {
+            "backend": backend,
+            "device": device,
+            "computer_name": platform.node(),
+            "python_version": platform.python_version(),
+            "platform": platform.platform(),
+            "kwave_python_version": kwave.__version__,
+        }
+    )
+    return payload
+
+
+def validate_memory_bytes(value: float) -> float:
+    memory_bytes = float(value)
+    if not math.isfinite(memory_bytes) or memory_bytes < 0:
+        raise ValueError("memory usage must be a finite non-negative value")
+    return memory_bytes
+
+
+def _linux_current_memory_bytes() -> float:
+    pages = int(Path("/proc/self/statm").read_text().split()[1])
+    return float(pages * os.sysconf("SC_PAGE_SIZE"))
+
+
+def _ps_current_memory_bytes() -> float:
+    completed = subprocess.run(
+        ["ps", "-o", "rss=", "-p", str(os.getpid())],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    return float(int(completed.stdout.strip()) * 1024)
+
+
+def _windows_current_memory_bytes() -> float:
+    import ctypes
+    from ctypes import wintypes
+
+    class ProcessMemoryCounters(ctypes.Structure):
+        _fields_ = [
+            ("cb", wintypes.DWORD),
+            ("PageFaultCount", wintypes.DWORD),
+            ("PeakWorkingSetSize", ctypes.c_size_t),
+            ("WorkingSetSize", ctypes.c_size_t),
+            ("QuotaPeakPagedPoolUsage", ctypes.c_size_t),
+            ("QuotaPagedPoolUsage", ctypes.c_size_t),
+            ("QuotaPeakNonPagedPoolUsage", ctypes.c_size_t),
+            ("QuotaNonPagedPoolUsage", ctypes.c_size_t),
+            ("PagefileUsage", ctypes.c_size_t),
+            ("PeakPagefileUsage", ctypes.c_size_t),
+        ]
+
+    counters = ProcessMemoryCounters()
+    counters.cb = ctypes.sizeof(ProcessMemoryCounters)
+    handle = ctypes.windll.kernel32.GetCurrentProcess()
+    if not ctypes.windll.psapi.GetProcessMemoryInfo(handle, ctypes.byref(counters), counters.cb):
+        raise RuntimeError("could not read process memory usage")
+    return float(counters.WorkingSetSize)
+
+
+def current_memory_bytes() -> float:
+    """Return the current resident-set-size of THIS Python process, in bytes.
+
+    Note: this returns the *current* RSS at the moment of the call, not a
+    historical peak (despite ``ru_maxrss``-style fields existing on most
+    platforms). Use ``PeakMemorySampler`` to track peak-over-time, or
+    ``ChildPeakMemorySampler`` for subprocess (``backend="cpp"``) measurement.
+    """
+    system = platform.system().lower()
+    if system == "linux":
+        try:
+            return validate_memory_bytes(_linux_current_memory_bytes())
+        except (OSError, ValueError, IndexError):
+            return validate_memory_bytes(_ps_current_memory_bytes())
+    if system == "darwin":
+        return validate_memory_bytes(_ps_current_memory_bytes())
+    if system == "windows":
+        return validate_memory_bytes(_windows_current_memory_bytes())
+
+    raise RuntimeError("report_mem_usage is not supported on this platform")
+
+
+# Back-compat alias — the old name is misleading but kept so external callers
+# (and any pre-merge benchmark JSON tooling) don't break.
+peak_memory_bytes = current_memory_bytes
+
+
+class ChildPeakMemorySampler:
+    """Capture the peak RSS of child processes spawned during the ``with`` block.
+
+    Use this for ``backend="cpp"`` benchmark runs where the simulation work
+    happens in a subprocess — ``PeakMemorySampler`` only sees the Python
+    parent's RSS and reports a number that has nothing to do with the C++
+    process's actual footprint.
+
+    Implementation: ``resource.getrusage(RUSAGE_CHILDREN).ru_maxrss`` is the
+    cumulative max across all reaped children, so we record before/after and
+    return the increment. On Linux ``ru_maxrss`` is reported in kilobytes; on
+    macOS it's in bytes (per BSD historical convention). Windows has no
+    equivalent in ``resource`` — the sampler raises ``NotImplementedError``
+    on Windows so the caller can refuse the combination cleanly.
+    """
+
+    def __init__(self) -> None:
+        if platform.system().lower() == "windows":
+            raise NotImplementedError(
+                "ChildPeakMemorySampler is not supported on Windows "
+                "(resource.RUSAGE_CHILDREN is POSIX-only). "
+                "Use backend='python' to measure simulation memory on Windows."
+            )
+        self._before = 0
+        self._after = 0
+
+    def __enter__(self) -> ChildPeakMemorySampler:
+        import resource  # POSIX-only; gated by the Windows check above
+
+        self._before = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
+        return self
+
+    def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: Any) -> None:
+        import resource
+
+        self._after = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
+
+    @property
+    def peak_bytes(self) -> float:
+        delta = max(0, self._after - self._before)
+        # ru_maxrss units differ by platform (Linux: KB, macOS: bytes).
+        if platform.system().lower() == "linux":
+            return float(delta * 1024)
+        return float(delta)
+
+
+class PeakMemorySampler:
+    def __init__(self, reader: Callable[[], float] = current_memory_bytes, interval: float = 0.05):
+        self._reader = reader
+        self._interval = interval
+        self._peak_bytes = 0.0
+        self._error: Exception | None = None
+        self._lock = threading.Lock()
+        self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
+
+    @property
+    def peak_bytes(self) -> float:
+        with self._lock:
+            return self._peak_bytes
+
+    def __enter__(self) -> PeakMemorySampler:
+        self._record_sample()
+        if self._interval > 0:
+            self._thread = threading.Thread(target=self._sample_until_stopped, daemon=True)
+            self._thread.start()
+        return self
+
+    def __exit__(self, exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: Any) -> None:
+        self._stop_event.set()
+        if self._thread is not None:
+            self._thread.join()
+        try:
+            self._record_sample()
+        except Exception as exc:
+            if exc_type is None:
+                raise exc
+        if exc_type is None and self._error is not None:
+            raise self._error
+
+    def _sample_until_stopped(self) -> None:
+        while not self._stop_event.wait(self._interval):
+            try:
+                self._record_sample()
+            except Exception as exc:
+                self._error = exc
+                self._stop_event.set()
+
+    def _record_sample(self) -> None:
+        memory_bytes = validate_memory_bytes(self._reader())
+        with self._lock:
+            self._peak_bytes = max(self._peak_bytes, memory_bytes)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
new file mode 100644
index 00000000..69ce5d77
--- /dev/null
+++ b/tests/test_benchmark.py
@@ -0,0 +1,266 @@
+import json
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+from benchmarks.benchmark import BenchmarkOptions, build_case, grid_sizes, run
+from benchmarks.helpers import ChildPeakMemorySampler, current_memory_bytes, peak_memory_bytes
+
+
+def small_options(**overrides):
+    values = {
+        "start_size": 8,
+        "x_scale_array": (1,),
+        "y_scale_array": (1,),
+        "z_scale_array": (1,),
+        "number_sensor_points": 8,
+        "number_time_points": 5,
+        "num_averages": 1,
+        "sensor_radius": 1e-3,
+    }
+    values.update(overrides)
+    return BenchmarkOptions(**values)
+
+
+def test_grid_sizes_match_matlab_scale_arrays():
+    sizes = grid_sizes(BenchmarkOptions())
+
+    assert sizes[0] == (32, 32, 32, 1)
+    assert sizes[-1] == (512, 512, 256, 8)
+    assert len(sizes) == 12
+
+
+def test_build_case_matches_benchmark_defaults_for_small_grid():
+    options = small_options()
+
+    kgrid, medium, source, sensor = build_case(options, 8, 8, 8, 1)
+
+    assert tuple(kgrid.N) == (8, 8, 8)
+    assert np.isclose(kgrid.dx, options.domain_size / 8)
+    assert kgrid.Nt == options.number_time_points
+    assert medium.sound_speed.shape == (8, 8, 8)
+    assert np.all(medium.sound_speed[:2, :, :] == 1800)
+    assert np.all(medium.sound_speed[2:, :, :] == 1500)
+    assert np.all(medium.density[:, :1, :] == 1000)
+    assert np.all(medium.density[:, 1:, :] == 1200)
+    assert medium.alpha_coeff == pytest.approx(0.75)
+    assert medium.alpha_power == pytest.approx(1.5)
+    assert source.p0.shape == (8, 8, 8)
+    assert np.max(source.p0) == pytest.approx(10)
+    assert sensor.mask.shape == (8, 8, 8)
+    assert sensor.mask.dtype == bool
+    assert 0 < np.count_nonzero(sensor.mask) <= options.number_sensor_points
+    assert sensor.record == ["p"]
+
+
+def test_single_data_cast_uses_float32_arrays():
+    options = small_options(data_cast="single")
+
+    _, medium, source, _ = build_case(options, 8, 8, 8, 1)
+
+    assert medium.sound_speed.dtype == np.float32
+    assert medium.density.dtype == np.float32
+    assert source.p0.dtype == np.float32
+
+
+def test_run_aggregates_timings_and_saves_json_file(tmp_path: Path):
+    options = small_options(num_averages=2)
+    output_path = tmp_path / "benchmark.json"
+    times = iter([0.0, 1.0, 1.0, 3.0])
+    calls = []
+
+    def fake_solver(kgrid, medium, source, sensor, **kwargs):
+        calls.append((kgrid, medium, source, sensor, kwargs))
+        return {"p": np.zeros((1, 1))}
+
+    result = run(options, max_cases=1, output_path=output_path, solver=fake_solver, timer=lambda: next(times))
+
+    assert len(calls) == 2
+    assert calls[0][4]["pml_size"] == options.pml_size
+    assert calls[0][4]["pml_inside"] is True
+    assert calls[0][4]["smooth_p0"] is False
+    assert result["comp_size"] == [8 * 8 * 8]
+    assert result["comp_time"] == [pytest.approx(1.5)]
+    assert result["output_path"] == str(output_path)
+    assert result["error_reached"] is False
+    assert output_path.exists()
+
+    saved = json.loads(output_path.read_text())
+    assert saved["comp_size"] == [8 * 8 * 8]
+    assert saved["comp_time"] == [pytest.approx(1.5)]
+    assert saved["output_path"] == str(output_path)
+    assert saved["options"]["start_size"] == 8
+
+
+def test_run_keeps_distinct_cases_with_the_same_total_grid_points(tmp_path: Path):
+    options = small_options(
+        x_scale_array=(1, 2),
+        y_scale_array=(2, 1),
+        z_scale_array=(1, 1),
+    )
+    output_path = tmp_path / "benchmark.json"
+    times = iter([0.0, 1.0, 1.0, 3.0])
+    shapes = []
+
+    def fake_solver(kgrid, *args, **kwargs):
+        shapes.append(tuple(kgrid.N))
+        return {"p": np.zeros((1, 1))}
+
+    result = run(options, output_path=output_path, solver=fake_solver, timer=lambda: next(times))
+
+    assert shapes == [(8, 16, 8), (16, 8, 8)]
+    assert result["comp_size"] == [8 * 16 * 8, 16 * 8 * 8]
+    assert result["comp_time"] == [pytest.approx(1.0), pytest.approx(2.0)]
+
+    saved = json.loads(output_path.read_text())
+    assert saved["comp_size"] == [8 * 16 * 8, 16 * 8 * 8]
+    assert saved["comp_time"] == [pytest.approx(1.0), pytest.approx(2.0)]
+
+
+def test_start_size_must_be_positive():
+    with pytest.raises(ValueError, match="start_size must be positive"):
+        small_options(start_size=0)
+
+
+def test_run_reports_memory_usage_and_saves_valid_json(tmp_path: Path):
+    options = small_options(report_mem_usage=True, num_averages=2)
+    output_path = tmp_path / "benchmark.json"
+    times = iter([0.0, 1.0, 1.0, 3.0])
+    memory_samples = iter([512.0, 1024.0, 2048.0, 4096.0, 8192.0])
+
+    def fake_solver(*args, **kwargs):
+        return {"p": np.zeros((1, 1))}
+
+    result = run(
+        options,
+        max_cases=1,
+        output_path=output_path,
+        solver=fake_solver,
+        timer=lambda: next(times),
+        memory_reader=lambda: next(memory_samples),
+        memory_sampling_interval=0,
+    )
+
+    assert result["mem_usage"] == [pytest.approx(5120.0)]
+
+    saved = json.loads(output_path.read_text())
+    assert saved["mem_usage"] == [pytest.approx(5120.0)]
+
+
+def test_report_memory_usage_fails_before_writing_when_unavailable(tmp_path: Path):
+    options = small_options(report_mem_usage=True)
+    output_path = tmp_path / "benchmark.json"
+
+    def unsupported_memory_reader():
+        raise RuntimeError("memory unavailable")
+
+    with pytest.raises(ValueError, match="report_mem_usage is not supported"):
+        run(options, max_cases=1, output_path=output_path, memory_reader=unsupported_memory_reader)
+
+    assert not output_path.exists()
+
+
+def test_peak_memory_bytes_is_back_compat_alias_for_current_memory_bytes():
+    # The old name was misleading (Linux/macOS/Windows readers return current
+    # RSS, not historical peak). Keep the alias so external benchmark tooling
+    # that imported the old name doesn't break.
+    assert peak_memory_bytes is current_memory_bytes
+
+
+@pytest.mark.parametrize(
+    "kwargs, expected_match",
+    [
+        ({"domain_size": 0}, "domain_size must be positive"),
+        ({"domain_size": -1e-3}, "domain_size must be positive"),
+        ({"sensor_radius": 0}, "sensor_radius must be positive"),
+        ({"sensor_radius": -1}, "sensor_radius must be positive"),
+        ({"pml_size": -1}, "pml_size must be non-negative"),
+    ],
+)
+def test_options_post_init_rejects_invalid_geometry(kwargs, expected_match):
+    with pytest.raises(ValueError, match=expected_match):
+        small_options(**kwargs)
+
+
+def test_cpp_backend_uses_child_sampler_via_factory(tmp_path: Path):
+    """For backend='cpp', the subprocess sampler factory is used (not PeakMemorySampler)."""
+    options = small_options(report_mem_usage=True, num_averages=2)
+    output_path = tmp_path / "benchmark.json"
+    times = iter([0.0, 1.0, 1.0, 3.0])
+
+    sampler_calls = []
+
+    class FakeChildSampler:
+        def __enter__(self):
+            sampler_calls.append("enter")
+            return self
+
+        def __exit__(self, *exc):
+            sampler_calls.append("exit")
+
+        @property
+        def peak_bytes(self) -> float:
+            return float(1024 * len(sampler_calls))  # arbitrary but deterministic
+
+    def fake_solver(*args, **kwargs):
+        return {"p": np.zeros((1, 1))}
+
+    result = run(
+        options,
+        backend="cpp",
+        max_cases=1,
+        output_path=output_path,
+        solver=fake_solver,
+        timer=lambda: next(times),
+        mem_sampler_factory=FakeChildSampler,
+    )
+
+    # Two loop iterations × (enter, exit). The early-startup probe just
+    # constructs a sampler to verify the factory works; it doesn't enter.
+    assert sampler_calls == ["enter", "exit", "enter", "exit"]
+    assert "mem_usage" in result and len(result["mem_usage"]) == 1
+
+
+def test_cpp_backend_reports_clear_error_when_child_sampler_unsupported(tmp_path: Path):
+    """Windows + cpp + report_mem_usage should raise a clear error at startup."""
+    options = small_options(report_mem_usage=True)
+    output_path = tmp_path / "benchmark.json"
+
+    def unsupported_sampler_factory():
+        raise NotImplementedError("ChildPeakMemorySampler is not supported on Windows (resource.RUSAGE_CHILDREN is POSIX-only).")
+
+    with pytest.raises(ValueError, match="not supported on Windows"):
+        run(
+            options,
+            backend="cpp",
+            max_cases=1,
+            output_path=output_path,
+            mem_sampler_factory=unsupported_sampler_factory,
+        )
+    # Failed-fast, before writing the output file.
+    assert not output_path.exists()
+
+
+def test_child_peak_memory_sampler_refuses_on_windows():
+    with patch("benchmarks.helpers.platform.system", return_value="Windows"):
+        with pytest.raises(NotImplementedError, match="not supported on Windows"):
+            ChildPeakMemorySampler()
+
+
+def test_run_saves_partial_results_after_solver_error(tmp_path: Path):
+    options = small_options()
+    output_path = tmp_path / "benchmark.json"
+
+    def failing_solver(*args, **kwargs):
+        raise RuntimeError("solver failed")
+
+    result = run(options, max_cases=1, output_path=output_path, solver=failing_solver)
+
+    assert result["error_reached"] is True
+    assert result["error_message"] == "solver failed"
+    assert output_path.exists()
+    saved = json.loads(output_path.read_text())
+    assert saved["error_reached"] is True
+    assert saved["error_message"] == "solver failed"