diff --git a/CLAUDE.md b/CLAUDE.md
index 601c11d..1163064 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -51,19 +51,37 @@ secondary counters (`depth_iter`, `buffer_iter`, `super_iter`,
 L2->L3 OFM spills, and weight reloads happen. Together these form the
 "layer control parameters" (`Lcp`).
 
-**Stamp.** A spatial replica of a small AIE region. A 4x4 design
-"stamped" twice (`-o 2x4x4`) runs the same kernels in parallel on two
-side-by-side 4x4 regions. Each stamp gets its own backend connection and
-its own `AIEUtil` helper. The debugger schedules and breakpoints them
-independently.
-
-**Batch.** Conceptually the same as stamping but used for data-parallel
-inference; multiple input samples processed in parallel by replicated
-hardware. Detected from `device_batch_size` in `buffer_info.json`.
-
-**Overlay.** The shape of the AIE region in use, written `NxCxR`
-(stamps x columns x rows). Default `1x4x4`. Each stamp i occupies
-columns `[i*C, (i+1)*C)`.
+**Stamp.** A spatial replica of a small AIE region within one batch.
+Different stamps in a batch may run *different* kernels (and may skip
+layers entirely - higher-indexed stamps participate in a subset of
+layers). Each replica gets its own backend connection and its own
+`AIEUtil` helper.
+
+**Batch.** A data-parallel copy of the whole per-batch stamp set.
+Batches run the *same* kernels on different input samples, so they
+share PCs/ELFs/buffers and the debugger only resolves metadata once
+per per-batch stamp and mirrors it across batches. Taken from
+`device_batch_size` in `buffer_info.json`.
+
+**Overlay (4D).** The shape of the AIE region in use is `BxSxCxR`:
+- `B` = number of batches (data-parallel copies)
+- `S` = stamps per batch (spatial replicas inside one batch)
+- `C` = columns per stamp
+- `R` = rows per stamp
+
+Replicas are packed stamp-inner along columns: the flat replica id
+`sid = b*S + s` occupies columns `[sid*C, (sid+1)*C)`. The rest of
+the code refers to that flat id as "stamp id" / "sid". `Overlay`
+helpers: `get_batch_count()`, `get_stamps_per_batch()`,
+`get_stampcount()` (= total replicas = B*S), `replica_to_batch(sid)`,
+`replica_to_stamp(sid)`, `is_leftmost_in_batch(sid)` (true when
+`sid % S == 0`; these replicas are the per-batch stamp 0 and always
+participate in every layer).
+
+`buffer_info.json` stores this as `.meta.layout = [stamps, R, C]`,
+`.meta.device_batch_size = B`, `.meta.max_stamps_used = S` (with
+fallback to `max(no_of_stamps)` across layers). The `-o` CLI override
+parses `SxCxR` (or `CxR`) and forces `B=1`.
 
 **PM reload.** The AIE has limited program memory; large designs split
 their code across multiple ELFs and reload program memory between layer
@@ -138,8 +156,9 @@ Common runtime flags (`-f`):
   and the file format.
 - `skip_iter` -- use a perf-counter trick to fast-forward iterations
   instead of polling per iteration.
-- `multistamp` -- actually drive every stamp; default is to collapse to
-  stamp 0 for sanity.
+- `multistamp` -- actually drive every replica (all batches and all
+  per-batch stamps); default is to collapse to a single replica
+  (B=S=1) for sanity.
 
 ### Testing
 
@@ -235,20 +254,26 @@ This is where the real work happens. The two methods to read first are
 
 - `common_init()` runs once before any layer. If the user did not pass
   the `multistamp` flag, the runner collapses the design to a single
-  stamp here -- it edits the layer/overlay/impls lists in place so the
-  rest of the system simply sees a 1-stamp design. This is the safest
-  default because multi-stamp scheduling is intricate.
+  replica (`B=S=1`) here -- it edits the layer/overlay/impls lists in
+  place so the rest of the system simply sees a 1-replica design.
+  This is the safest default because multi-stamp scheduling is
+  intricate.
 - `schedule_layer_start()` arms the start (and optionally end) PC
-  breakpoint on every stamp. When PM reload is expected it also
-  installs a combo-event that survives the reload, *and* it may arm a
-  future stamp's breakpoint *early* -- before the outer loop reaches
-  the layer that stamp actually runs. This is necessary because if a
-  stamp does not participate in the current layer, releasing it
-  without a valid breakpoint would let it free-run past its real
-  target. The "PM RELOAD on stamp X" log line is when arming happens,
-  not when the reload physically occurs.
-- `run_layer()` runs one layer to completion across all stamps using a
-  thread pool, one worker per stamp.
+  breakpoint on every stamp. Per-batch leftmost replicas (where
+  `sid % S == 0`) always participate in `next_layer`; other replicas
+  may need their breakpoint armed for a *future* layer they actually
+  run. When PM reload is expected it also installs a combo-event that
+  survives the reload, *and* it may arm a future stamp's breakpoint
+  *early* -- before the outer loop reaches the layer that stamp
+  actually runs. This is necessary because if a stamp does not
+  participate in the current layer, releasing it without a valid
+  breakpoint would let it free-run past its real target. The "PM
+  RELOAD on stamp X" log line is when arming happens, not when the
+  reload physically occurs. The list of replicas that actually
+  breakpoint on `next_layer` is stashed in `state.stamps_to_run` for
+  `run_layer` to consume.
+- `run_layer()` runs one layer to completion across the replicas in
+  `state.stamps_to_run` using a thread pool, one worker per replica.
 - Inside a layer the runner alternates: continue, poll for breakpoint,
   identify whether we hit start or end PC, dump the appropriate
   buffers, increment the iteration counter, repeat.
@@ -277,17 +302,27 @@ through.
 
 A small holder for "where are we now": current layer index, current
 iteration, ping/pong toggle for OFM dumps, list of pending manual
-breakpoints, and per-stamp PM-reload flags. `update_layer()` is the
-generator the runner iterates to advance through the design.
+breakpoints, per-replica PM-reload flags, and `stamps_per_batch` (S)
+so `get_next_layer_for_stamp(sid)` can map a flat replica id to its
+per-batch stamp index and gate against each layer's
+`stamps_per_batch`. `stamps_to_run` is set by `schedule_layer_start`
+and consumed by `run_layer`. `update_layer()` is the generator the
+runner iterates to advance through the design.
 
 ### `layer_info.py` -- buffer / layer metadata
 
 Parses `buffer_info.json` and produces the `Layer` and `Buffer` objects
 the rest of the system uses.
 
-- A `Layer` knows its kernels (one `Stamp` per AIE replica), its
-  input/output/weight buffers, its L3 buffers, and its iteration
-  counts (`Lcp`).
+- A `Layer` knows its kernels, its input/output/weight buffers, its
+  L3 buffers, and its iteration counts (`Lcp`). `layer.stamps` is the
+  *per-batch* stamp list (length `layer.stamps_per_batch`, which may
+  be less than the overlay's S - higher-indexed stamps skip this
+  layer). Batches share kernel/PC metadata, so callers translate a
+  flat replica id with `layer.get_stamp(sid)` (returns
+  `stamps[sid % S]`, the stamp at its per-batch index) and gate participation
+  with `layer.runs_replica(sid)` (false when this layer's
+  `stamps_per_batch` is smaller, e.g. TG layers).
 - A `Buffer` is the user-level concept (one IFM, one OFM, one weight
   set). Internally it holds an `L1Buffer` (ping/pong) and a list of
   `L2Buffer` chunks. Buffers larger than the memory-tile size are
@@ -321,6 +356,10 @@ preceding lines and skips those.
 
 Owns the on-disk layout and the actual reads. Files land under
 `<output_dir>/batch<N>/layer_<order>/<buffer_type>/<col>_<row>/`.
+The batch/stamp coordinates come from
+`overlay.replica_to_batch(sid)` and `overlay.replica_to_stamp(sid)`,
+so an `sid` always maps to one `batch<N>` directory and one
+`stamp<S>` suffix on the L2 dump filename.
 
 Binary format: an 8-byte little-endian length header followed by the
 raw 32-bit words. With `text_dump` the data is written as ASCII hex
@@ -374,10 +413,15 @@ Two non-obvious pieces:
 
 ### `aie_overlay.py` -- overlay geometry
 
-Parses `NxCxR` from `-o` (or the layout in `buffer_info.json`, which
-is stored as `[stamps, nrow, ncol]` rather than `NxCxR`) and builds
-the list of `(col, row)` tiles per stamp. Methods like `get_tiles`
-filter that list by tile type, with the AIE row offset already added.
+Holds the 4D `(B, S, C, R)` layout and builds the list of `(col, row)`
+tiles for each flat replica id `sid = b*S + s`. Layout is supplied
+either from `buffer_info.json` (`[stamps, R, C]` plus
+`device_batch_size` and `max_stamps_used`) or parsed from the `-o`
+CLI flag (`SxCxR` or `CxR`, forces `B=1`). `get_tiles` filters by
+tile type with the AIE row offset already added. Methods used by the
+rest of the system: `get_stampcount` (total replicas, B*S),
+`get_batch_count`, `get_stamps_per_batch`, `get_stampwidth` (C),
+`replica_to_batch`, `replica_to_stamp`, `is_leftmost_in_batch`.
 
 ### `arch/` -- per-device definitions
 
@@ -536,7 +580,8 @@ Before implementing:
 - No "flexibility" or "configurability" that wasn't requested.
 - No error handling for impossible scenarios.
 - If you write 200 lines and it could be 50, rewrite it.
-- Try to keep docstrings short to medium length.
+- Keep docstrings short - ideally 1-2 lines. Where possible, prefer small one-line comments
+  interspersed through the code over big comment blocks.
 
 Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.
 
diff --git a/src/mldebug/aie_overlay.py b/src/mldebug/aie_overlay.py
index 138b711..9cb0444 100644
--- a/src/mldebug/aie_overlay.py
+++ b/src/mldebug/aie_overlay.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Manages overlays and stamps
@@ -9,7 +9,16 @@
 class Overlay:
   """
   Abstraction for AIE Overlay.
-  NxCxR: Stamps/Batches x Cols x Rows.
+
+  Layout is BxSxCxR where:
+    B = number of batches (data-parallel copies of the design)
+    S = number of stamps per batch (spatial replicas inside one batch)
+    C = columns per stamp
+    R = rows per stamp
+
+  Replicas are packed stamp-inner along columns: replica i = b*S + s occupies
+  columns [i*C, (i+1)*C). The flat replica id is what the rest of the system
+  refers to as "sid" (stamp id).
   """
 
   def __init__(self, args, layout):
@@ -17,60 +26,80 @@ def __init__(self, args, layout):
     Initialize the Overlay with layout and tile information.
 
     Args:
-      args: Argument object containing configuration options, including aie_iface and overlay string.
-      layout: Tuple representing (stamps, ncol, nrow) as default or externally supplied layout.
+      args: Argument object containing configuration options, including
+        aie_iface and overlay string.
+      layout: Tuple representing the layout from buffer_info. Either
+        (batches, stamps, nrow, ncol) (new 4-element form) or
+        (stamps, nrow, ncol) (legacy; treated as batches=1).
     """
     self.aie_iface = args.aie_iface
     self.stamps = {}
     self.impls = {}
-    self.layout = self._get_layout(args.overlay, layout)
-
-    # For larger devices, a 4x4 overlay can be repeated (stamped)
-    stamps, ncol, nrow = self.layout
-    for stamp_id in range(stamps):
-      tiles = []
-      start_col = stamp_id * ncol
-      for col in range(start_col, start_col + ncol):
-        for row in range(nrow + self.aie_iface.AIE_TILE_ROW_OFFSET):
-          tiles.append((col, row))
-      self.stamps[stamp_id] = tiles
+    batches, stamps_per_batch, ncol, nrow = self._get_layout(args.overlay, layout)
+
+    # Materialize tiles for every physical replica so dropped ones can still be quiesced.
+    for b in range(batches):
+      for s in range(stamps_per_batch):
+        replica_id = b * stamps_per_batch + s
+        tiles = []
+        start_col = replica_id * ncol
+        for col in range(start_col, start_col + ncol):
+          for row in range(nrow + self.aie_iface.AIE_TILE_ROW_OFFSET):
+            tiles.append((col, row))
+        self.stamps[replica_id] = tiles
+
+    # Without `multistamp`, collapse to one active replica so LayerInfo/DebugState/
+    # backends size to it; extras stay in self.stamps (see get_inactive_tiles).
+    if args.run_flags.multistamp:
+      self.layout = (batches, stamps_per_batch, ncol, nrow)
+    else:
+      self.layout = (1, 1, ncol, nrow)
 
   def _get_layout(self, args_overlay, layout):
     """
-    Determine the overlay layout parameters (stamps, columns, rows).
+    Determine the overlay layout parameters as (batches, stamps, ncol, nrow).
 
     Args:
-      args_overlay (str): User-specified overlay string (e.g. '2x4x4').
-      layout (tuple/list): Provided layout as (stamps, ncol, nrow).
+      args_overlay (str): User-specified overlay string, 'SxCxR' (e.g. '2x4x4')
+        or 'CxR' (e.g. '4x4', stamps=1). Always treated as batches=1.
+      layout (tuple/list): Layout supplied by LayerInfo. Either
+        (batches, stamps, nrow, ncol) (new 4-element form) or
+        (stamps, nrow, ncol) (legacy).
 
     Returns:
-      tuple: (stamps, ncol, nrow) representing number of stamps, columns, and rows.
+      tuple: (batches, stamps, ncol, nrow).
     """
-    stamps, ncol, nrow = (1, 4, 4)
+    batches, stamps_per_batch, ncol, nrow = (1, 1, 4, 4)
     if args_overlay:
-      layout = [int(x) for x in args_overlay.split("x")]
-      if len(layout) == 3:
-        stamps, ncol, nrow = layout
-      elif len(layout) == 2:
-        ncol, nrow = layout
+      parsed = [int(x) for x in args_overlay.split("x")]
+      if len(parsed) == 3:
+        stamps_per_batch, ncol, nrow = parsed
+      elif len(parsed) == 2:
+        ncol, nrow = parsed
       else:
         print(f"[WARNING] Cannot parse overlay: {args_overlay}.")
     elif layout:
-      # Layout in buffer_info will be reversed
-      stamps, nrow, ncol = layout
-    print("[INFO] Using Layout: ", stamps, ncol, nrow)
-    return stamps, ncol, nrow
+      if len(layout) == 4:
+        # New form from buffer_info: [B, S, R, C]
+        batches, stamps_per_batch, nrow, ncol = layout
+      elif len(layout) == 3:
+        # Legacy form: [stamps, R, C]; batches encoded by caller into stamps
+        stamps_per_batch, nrow, ncol = layout
+
+    print("[INFO] Using Layout: ", batches, stamps_per_batch, ncol, nrow)
+
+    return batches, stamps_per_batch, ncol, nrow
 
   def get_first_relative_core_tile(self, stamp_id=0):
     """
-    Get the (col, row) tuple for the first AIE core tile in the specified stamp,
-    adjusting row by the device-specific tile row offset.
+    Get the (col, row) tuple for the first AIE core tile in the specified
+    replica, adjusting row by the device-specific tile row offset.
 
     Args:
-      stamp_id (int, optional): Stamp index to query. Default is 0.
+      stamp_id (int, optional): Replica index to query. Default is 0.
 
     Returns:
-      tuple: (column, row) of the first core tile within the given stamp.
+      tuple: (column, row) of the first core tile within the given replica.
     """
     t = self.get_tiles(self.aie_iface.AIE_TILE_T, stamp_id)[0]
     return t[0], t[1] - self.aie_iface.AIE_TILE_ROW_OFFSET
@@ -80,17 +109,19 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False):
     Query tile locations for the overlay.
 
     Args:
-      tile_type (str, optional): Tile type identifier for filtering. If None, returns all tile positions.
-      stamp_id (int, optional): Stamp ID to filter tiles by. Defaults to 0.
-      raw (bool, optional): If True, return all tile positions for all stamps, unfiltered.
+      tile_type (str, optional): Tile type identifier for filtering. If None,
+        returns all tile positions.
+      stamp_id (int, optional): Replica id to filter tiles by. Defaults to 0.
+      raw (bool, optional): If True, return tiles for all active replicas (stamp_id is ignored).
 
     Returns:
-      list[tuple]: List of (column, row) tile coordinates corresponding to requested tiles.
+      list[tuple]: List of (column, row) tile coordinates corresponding to
+        requested tiles.
     """
     tile_list = []
     if raw:
-      for stamp in self.stamps.values():
-        tile_list.extend(stamp)
+      for sid in self.get_stampids():
+        tile_list.extend(self.stamps[sid])
     else:
       tile_list = self.stamps[stamp_id]
     if not tile_type:
@@ -99,36 +130,89 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False):
 
   def get_stampids(self):
     """
-    Get a list of all configured stamp IDs in the overlay.
+    Get a list of the active replica ids ([0] in single-stamp mode).
 
     Returns:
-      list[int]: List of integer stamp IDs available in the layout.
+      list[int]: List of integer replica ids (length = batches * stamps).
     """
-    return list(self.stamps.keys())
+    return list(range(self.get_replica_count()))
 
-  def get_stampcount(self):
+  def get_inactive_tiles(self):
     """
-    Return the number of stamps present in the overlay.
+    Tiles for physical replicas that exist in the design but fall outside the
+    active view (every replica beyond replica 0 when multistamp is disabled).
 
     Returns:
-      int: The stamp count (N from NxCxR).
+      list[tuple]: (column, row) tiles to be quiesced. Empty in multistamp mode.
+    """
+    tiles = []
+    for sid in range(self.get_replica_count(), len(self.stamps)):
+      tiles.extend(self.stamps[sid])
+    return tiles
+
+  def get_replica_count(self):
+    """
+    Total number of replicas in the overlay (batches * stamps_per_batch).
+    """
+    return self.layout[0] * self.layout[1]
+
+  def get_stampcount(self):
+    """
+    Total number of replicas (alias for get_replica_count, kept for
+    backward compatibility with existing callers).
+    """
+    return self.get_replica_count()
+
+  def get_batch_count(self):
+    """
+    Number of batches (B from BxSxCxR).
     """
     return self.layout[0]
 
+  def get_stamps_per_batch(self):
+    """
+    Number of stamps within a single batch (S from BxSxCxR).
+    """
+    return self.layout[1]
+
+  def replica_to_batch(self, sid):
+    """
+    Map a flat replica id to its batch index.
+    """
+    return sid // self.layout[1]
+
+  def replica_to_stamp(self, sid):
+    """
+    Map a flat replica id to its per-batch stamp index.
+    """
+    return sid % self.layout[1]
+
+  def is_leftmost_in_batch(self, sid):
+    """
+    True if this replica is the leftmost stamp of its batch (per-batch stamp
+    index == 0). The leftmost-in-batch replica is always scheduled at every
+    layer; the others may skip layers.
+    """
+    return sid % self.layout[1] == 0
+
   def get_stampwidth(self):
     """
-    Get the width (number of columns) for a single stamp within the overlay.
+    Get the width (number of columns) for a single stamp/replica.
 
     Returns:
-      int: The number of columns in the overlay (C from NxCxR).
+      int: The number of columns per replica (C from BxSxCxR).
     """
-    return self.layout[1]
+    return self.layout[2]
 
   def get_repr(self):
     """
-    Return the string representation of the overlay layout (e.g., '2x4x4').
+    Return the string representation of the overlay layout (e.g. '2x1x4x4'
+    or '1x4x4' when only one batch).
 
     Returns:
-      str: Overlay configuration as 'N x C x R' string.
+      str: Overlay configuration as a 'B x S x C x R' (or 'S x C x R') string.
     """
-    return "x".join([str(x) for x in self.layout])
+    batches, stamps, ncol, nrow = self.layout
+    if batches == 1:
+      return f"{stamps}x{ncol}x{nrow}"
+    return f"{batches}x{stamps}x{ncol}x{nrow}"
diff --git a/src/mldebug/aie_util.py b/src/mldebug/aie_util.py
index 9a78257..4210a7c 100644
--- a/src/mldebug/aie_util.py
+++ b/src/mldebug/aie_util.py
@@ -5,9 +5,7 @@
 Manages high level interaction with AIE
 """
 
-import time
-
-from mldebug.utils import LOGGER
+from mldebug.utils import LOGGER, wait_until
 
 
 class AIEUtil:
@@ -159,20 +157,21 @@ def skip_iterations(self, count, sid):
     write(reg_map["DEBUG_CONTROL1"], perf_cntr_event << 16)
     self.impl.continue_aie()
     # Step3: Poll all tiles until every PERF_CNTR_1 reaches the specified count.
-    timeout = 10
-    start_time = time.time()
     perf_cntr_1 = reg_map["PERF_CNTR_1"]
-    while True:
-      time.sleep(0.1)
-      values = self.read_aie_regs(perf_cntr_1)
-      if all(v == count for v in values.values()):
-        break
-      if time.time() - start_time > timeout:
-        LOGGER.log(
-          f"{sid}: Timeout waiting for skip {count} iterations across tiles! "
-          f"Design might be hung. Values={values}"
-        )
-        return False
+    last = {}
+
+    def reached():
+      last["values"] = self.read_aie_regs(perf_cntr_1)
+      return all(v == count for v in last["values"].values())
+
+    def on_timeout():
+      LOGGER.log(
+        f"{sid}: Timeout waiting for skip {count} iterations across tiles! "
+        f"Design might be hung. Values={last['values']}"
+      )
+
+    if not wait_until(reached, on_timeout=on_timeout):
+      return False
 
     # Step6: Reset debug control to stop at program counter event
     pc_event = self._get_eventid("PC_0_CORE")
@@ -188,12 +187,7 @@ def skip_iterations_to_lock_acq(self, lock_acq_pc, count, sid):
 
     self.impl.set_pc_breakpoint(lock_acq_pc)
     self.impl.continue_aie()
-    timeout = 10
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-      time.sleep(0.1)
-      if self.impl.poll_core_status():
-        break
+    wait_until(self.impl.poll_core_status)
 
     pcs = self.impl.read_core_pc(True)
     is_valid =  self.pcs_match_target(pcs, lock_acq_pc)
@@ -283,12 +277,16 @@ def read_control_instr(self):
       for c, r in self._filter_tiles(self.aie_iface.MEM_TILE_T)
     }
 
-  def initialize_stamp(self):
+  def initialize_stamp(self, tiles=None):
     """
-    Initialize and clear DEBUG_CONTROL1 and DEBUG_CONTROL0 registers for all AIE tiles
-    belonging to the overlay instance (usually at the start of execution for multi-stamp).
+    Clear DEBUG_CONTROL1 and DEBUG_CONTROL0 to unhalt the specified tiles.
+
+    Args:
+      tiles (list[tuple], optional): (col, row) tiles to clear. Default: this stamp's tiles.
     """
-    for c, r in self._filter_tiles(self.aie_iface.AIE_TILE_T):
+    if tiles is None:
+      tiles = self.tiles
+    for c, r in self.aie_iface.filter_tiles(self.aie_iface.AIE_TILE_T, tiles):
       self.impl.write_register(c, r, self.aie_iface.Core_registers["DEBUG_CONTROL1"], 0)
       self.impl.write_register(c, r, self.aie_iface.Core_registers["DEBUG_CONTROL0"], 0)
 
diff --git a/src/mldebug/arch/aie2p_defs.py b/src/mldebug/arch/aie2p_defs.py
index 855c25b..07801d9 100644
--- a/src/mldebug/arch/aie2p_defs.py
+++ b/src/mldebug/arch/aie2p_defs.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
 
 """
 AIE2/AIE2P Specific Defs
@@ -12,6 +12,8 @@
 MEM_TILE_T = "mem_tile"
 TILE_TYPES = [AIE_TILE_T, SHIM_TILE_T, MEM_TILE_T]
 
+ARCH_NAME = "aie2p"
+
 AIE_TILE_ROW_OFFSET = 2
 MEM_TILE_SZ = 0x80000
 HAS_UC_MODULE = False
diff --git a/src/mldebug/arch/aie2ps_defs.py b/src/mldebug/arch/aie2ps_defs.py
index 02db650..f2171eb 100644
--- a/src/mldebug/arch/aie2ps_defs.py
+++ b/src/mldebug/arch/aie2ps_defs.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
-# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved.
 
 """
-AIE2/AIE2P Specific Defs
+AIE2PS Specific Defs
 """
 
 import json
@@ -12,6 +12,8 @@
 MEM_TILE_T = "mem_tile"
 TILE_TYPES = [AIE_TILE_T, SHIM_TILE_T, MEM_TILE_T]
 
+ARCH_NAME = "aie2ps"
+
 AIE_TILE_ROW_OFFSET = 3
 MEM_TILE_SZ = 0x80000
 HAS_UC_MODULE = True
diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py
index 8463e92..2bba4e1 100644
--- a/src/mldebug/batch_runner.py
+++ b/src/mldebug/batch_runner.py
@@ -17,7 +17,7 @@
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-from mldebug.utils import LOGGER, cleanup_and_exit, timeit
+from mldebug.utils import LOGGER, cleanup_and_exit, timeit, wait_until
 
 
 class BatchRunner:
@@ -57,23 +57,9 @@ def __init__(self, args, state, design_info, impls, aie_utls,
 
   def common_init(self):
     """
-    Common initialization for batch and interactive modes.
-
-    Collapses to single-stamp mode if multistamp flag is not set,
-    enables PC halt for all stamps, and initializes skip-iteration support.
+    Enable PC halt and skip-iteration support for each active replica. The
+    single-stamp collapse is handled up front by the Overlay.
     """
-    if not self.args.run_flags.multistamp and self.design_info.overlay.get_stampcount() > 1:
-      for layer in self.design_info.layers:
-        layer.stamps[:] = layer.stamps[:1]
-      for u in self.aie_utls[1:]:
-        u.initialize_stamp()
-      # In-place list modification so all holders of these references see the change
-      del self.aie_utls[1:]
-      del self.impls[1:]
-      self.design_info.overlay.layout = (1,) + self.design_info.overlay.layout[1:]
-      self.design_info.overlay.stamps = {0: self.design_info.overlay.stamps[0]}
-      LOGGER.log("[INFO] Using single stamp control. Please use multistamp flag for more data.")
-
     for sid in self.design_info.overlay.get_stampids():
       self.impls[sid].enable_pc_halt()
       if self.args.run_flags.skip_iter:
@@ -117,7 +103,7 @@ def _set_layer_breakpoint(self, layer, skip_end_pc, sid, pm_reload_expected):
     start_pc_slot = 0
     end_pc_slot = 1
 
-    stamp = layer.stamps[sid]
+    stamp = layer.get_stamp(sid)
     start_pc = stamp.start_pc
     if not start_pc:
       print(f"Invalid configuration on stamp {sid} layer {layer.layer_order}.")
@@ -135,26 +121,30 @@ def _set_layer_breakpoint(self, layer, skip_end_pc, sid, pm_reload_expected):
 
   def check_pm_reload(self, stamp_id=0):
     """
-    Check if the next ELF will be loaded (PM Reload).
-
+    Check if the next ELF will be loaded (PM Reload) for the given replica.
     Args:
-      stamp_id: Stamp index to check for reload (default 0).
-
+      stamp_id: Replica id to check for reload (default 0).
     Returns:
       True if program memory reload will occur at the next layer, False otherwise.
     """
     layer = self.state.layers[self.state.current_layer]
-    if not self.design_info.work_dir.pm_reload_en[stamp_id] or self.state.current_layer + 1 >= len(self.state.layers):
+    # PM reload is not enabled for this stamp, or this is the last layer
+    if not self.design_info.work_dir.stamp(stamp_id).pm_reload_en or self.state.current_layer + 1 >= len(self.state.layers):
       return False
-
-    if stamp_id > 0 and not self.design_info.is_batched():
-      next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1)
-    else:
+    # Stamp id doesn't run for this layer
+    if not layer.runs_replica(stamp_id):
+      return False
+    # Find next layer that runs this stamp
+    if self.design_info.overlay.is_leftmost_in_batch(stamp_id):
       next_layer = self.state.layers[self.state.current_layer + 1]
+    else:
+      next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1)
+    if next_layer is None or not next_layer.runs_replica(stamp_id):
+      return False
 
-    if next_layer and stamp_id < len(layer.stamps) and stamp_id < len(next_layer.stamps):
-      return layer.stamps[stamp_id].elf_name != next_layer.stamps[stamp_id].elf_name
-    return False
+    cur_stamp = layer.get_stamp(stamp_id)
+    next_stamp = next_layer.get_stamp(stamp_id)
+    return cur_stamp.elf_name != next_stamp.elf_name
 
   def hit_next_breakpoint(self, sid=0):
     """
@@ -184,100 +174,90 @@ def schedule_layer_start(self, next_layer):
     Args:
       next_layer: Next Layer object to start.
     """
-    stamp_target_layers = {0: next_layer}
-
-    for sid in range(1, len(self.state.pm_reload)):
-      stamp_target_layers[sid] = self.state.get_next_layer_for_stamp(sid)
+    overlay = self.design_info.overlay
+    stamp_target_layers = {}
+    for sid in range(len(self.state.pm_reload)):
+      if overlay.is_leftmost_in_batch(sid):
+        # Leftmost replica of every batch always participates in next_layer.
+        stamp_target_layers[sid] = next_layer
+      else:
+        stamp_target_layers[sid] = self.state.get_next_layer_for_stamp(sid)
 
     for utl in self.aie_utls:
       utl.disable_ecc_event()
 
     bes_to_poll = []
     bes_to_run = []
-    # Stamp0 breakpoint always scheduled
-    # Stamp1+ breakpoint only scheduled at end of 2 stamps or at beginning
+    active_stamps_all_batches = []
+    # Per-batch leftmost stamps (index s = 0 within each batch) always have their
+    # breakpoint scheduled on next_layer. The remaining stamps may early-arm
+    # a breakpoint for a *future* layer they actually participate in.
     #
-    # NOTE ON "EARLY" PM-RELOAD ARMING:
-    # `target_layer` for stamp N may be a layer *later* than `next_layer`
-    # (the outer-loop layer currently being scheduled). This happens when a
-    # non-participating stamp skips one or more layers - `get_next_layer_for_stamp`
-    # walks forward to the next layer that actually contains this stamp.
-    #
-    # When that future target layer uses a different ELF for this stamp, we
-    # must arm the start-PC breakpoint AND the combo event (via break_combo
-    # inside _set_layer_breakpoint) *before* the stamp is released with
-    # continue_aie below. If we defer arming until we reach the outer-loop
-    # iteration for the stamp's real target layer, the stamp would have
-    # already been released without a valid breakpoint (or without combo
-    # event coverage across the PM reload) and would either free-run past
-    # its target start PC or stall indefinitely at the end of its previous
-    # layer - blocking progress of the other stamps that depend on it.
-    #
-    # Consequence: the "PM RELOAD" log may appear while scheduling an outer
-    # layer that this stamp does not participate in. That is intentional -
-    # it marks when the breakpoint is *armed*, not when the reload
-    # physically occurs. The post-poll block below finalizes the combo
-    # event (enable_pc_halt + clear pm_reload[sid]) only once the outer
-    # loop actually reaches that stamp's target layer, guarded by
-    # `break_on_stamp_scheduled[sid]` so we do not re-arm on the way there.
+    # Example for "EARLY" PM-RELOAD ARMING:
+    #   Layer 0  stamp0 stamp1 stamp2
+    #   Layer 1  stamp0 stamp1
+    #                          <PM Reload Stamp2>
+    #   Layer 3  stamp0 stamp1 stamp2
+    # Step to layer 0 : step to all 3 stamps
+    # Step to layer 1 : run stamp0 and stamp1; arm stamp2 via combo and continue it
+    #                   PM Reload message appears early for stamp 2
+    # Step to layer 3 : step to all 3 stamps
     for sid, pml in enumerate(self.state.pm_reload):
       target_layer = stamp_target_layers.get(sid)
-      if not target_layer or (sid > 0 and self.state.break_on_stamp_scheduled[sid]):
+      if not target_layer:
         continue
-      self.state.break_on_stamp_scheduled[sid] = True
-      if pml:
-        if target_layer.layer_order != next_layer.layer_order:
-          LOGGER.log(
-            f"\nArming PM RELOAD on stamp {sid} for Layer_{target_layer.layer_order} "
-          )
-        else:
-          LOGGER.log(f"\nPM RELOAD on stamp: {sid}")
-      stamp = target_layer.stamps[sid]
-      skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc)
-      self._set_layer_breakpoint(target_layer, skip_end_pc, sid, pml)
-      bes_to_run.append(self.impls[sid])
-      if target_layer.layer_order == next_layer.layer_order:
+      is_leftmost = overlay.is_leftmost_in_batch(sid)
+      reaches_now = target_layer.layer_order == next_layer.layer_order
+      already_armed = not is_leftmost and self.state.break_on_stamp_scheduled[sid]
+      stamp = target_layer.get_stamp(sid)
+
+      if not already_armed:
+        self.state.break_on_stamp_scheduled[sid] = True
+        if pml:
+          if not reaches_now:
+            LOGGER.log(
+              f"\nArming PM RELOAD on stamp {sid} for Layer_{target_layer.layer_order} "
+            )
+          else:
+            LOGGER.log(f"\nPM RELOAD on stamp: {sid}")
+        skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc)
+        self._set_layer_breakpoint(target_layer, skip_end_pc, sid, pml)
+        bes_to_run.append(self.impls[sid])
+
+      # Target is next_layer: poll this stamp and verify its breakpoint below
+      if reaches_now:
         bes_to_poll.append(self.impls[sid])
+        active_stamps_all_batches.append((sid, pml, stamp))
 
     # Run stamps at exact same time
     for be in bes_to_run:
       be.continue_aie()
 
     # Poll stamps until breakpoint is hit
-    timeout = 10
-    start_time = time.time()
-    while time.time() - start_time < timeout:
-      if self.args.backend == "test":
-        break
-      time.sleep(0.1)
-      if all(be.poll_core_status() for be in bes_to_poll):
-        break
-
-    # When combo events are used, it takes a few cycles to
-    # hit the breakpoint, so pc might have moved
-    for sid, pml in enumerate(self.state.pm_reload):
-      ta_layer = stamp_target_layers.get(sid)
-      if ta_layer is not None and next_layer.layer_order == ta_layer.layer_order:
-        stamp = next_layer.stamps[sid]
-        pcs = self.impls[sid].read_core_pc(True)
-
-        # combo event trigger has one cycle delay
-        is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml)
+    if self.args.backend != "test":
+      wait_until(lambda: all(be.poll_core_status() for be in bes_to_poll))
+
+    # Now check that breakpoints were hit at the right PC for each stamp
+    # that actually targets next_layer. When combo events are used the PC
+    # may have moved by a few cycles past the start_pc.
+    for sid, pml, stamp in active_stamps_all_batches:
+      pcs = self.impls[sid].read_core_pc(True)
+      utl = self.aie_utls[sid]
+      is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml)
+
+      if is_correct_pc:
+        self._process_start_breakpoint(next_layer, 1, sid=sid)
+      else:
+        print(f"[ERROR] Step to start of Layer_{next_layer.layer_order} failed on Stamp_{sid}")
+        self._process_err()
+      if pml:
+        self.impls[sid].enable_pc_halt()
+        self.state.pm_reload[sid] = False
+      # Breakpoint has now been observed for this stamp; clear the armed guard.
+      self.state.break_on_stamp_scheduled[sid] = False
 
-        if is_correct_pc:
-          self._process_start_breakpoint(next_layer, 1, sid=sid)
-        else:
-          print(f"[ERROR] Step to start of Layer_{next_layer.layer_order} failed on Stamp_{sid}")
-          self._process_err()
-        if pml:
-          self.impls[sid].enable_pc_halt()
-          self.state.pm_reload[sid] = False
-        # Breakpoint has now been observed for this stamp; clear the
-        # "already scheduled" guard so the next outer-loop layer can
-        # arm it normally. For stamps whose target_layer is *not* yet
-        # this next_layer (early-armed for a future target), the flag
-        # stays True - preventing re-arm/continue while we walk past.
-        self.state.break_on_stamp_scheduled[sid] = False
+    # Save for run_layer to consume.
+    self.state.active_stamps_all_batches = active_stamps_all_batches
 
   # ------------------------------------------------------------------ #
   # Core execution primitives (shared by batch and interactive)
@@ -392,9 +372,9 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1):
       cur_it: Starting iteration number (default 1).
 
     Returns:
-      True on success, False on error.
+      Truthy on success, falsy on error.
     """
-    stamp = layer.stamps[sid]
+    stamp = layer.get_stamp(sid)
     utl = self.aie_utls[sid]
 
     skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc)
@@ -405,7 +385,7 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1):
       self.state.error = not utl.skip_iterations(target_itr - cur_it, sid)
     elif self.args.run_flags.skip_iter2:
       self.state.error = not utl.skip_iterations_to_lock_acq(
-         self.design_info.work_dir.post_layer_lock_acq_pcs[sid], target_itr - cur_it, sid)
+         self.design_info.work_dir.stamp(sid).post_layer_lock_acq_pc, target_itr - cur_it, sid)
     else:
       while cur_it < target_itr:
         self.hit_next_breakpoint(sid)
@@ -435,21 +415,29 @@ def run_layer(self, layer, target_itr=None, cur_it=None):
       target_itr: Target iteration (default None = last).
       cur_it: Initial iteration number (default None = 1).
     """
-    n_stamp = len(layer.stamps)
     if not cur_it:
       cur_it = 1
 
-    with ThreadPoolExecutor(max_workers=n_stamp) as executor:
-      futures = [executor.submit(self._run_stamp, layer, sid, target_itr, cur_it) for sid in range(n_stamp)]
+    # active_stamps_all_batches is determined by schedule_layer_start
+    stamps = self.state.active_stamps_all_batches
+
+    with ThreadPoolExecutor(max_workers=len(stamps)) as executor:
+      futures = [
+        executor.submit(self._run_stamp, layer, sid, target_itr, cur_it)
+        for sid, _pml, _stamp in stamps
+      ]
       for f in as_completed(futures):
         res = f.result()
         if not res:
           self.state.error = True
 
-    # At final iteration of a multistamp layer, drain stamps that have no
-    # remaining future layer so they don't sit halted at their last breakpoint.
-    if n_stamp > 1 and (target_itr is None or target_itr == layer.lcp.num_iter):
-      for sid in range(1, n_stamp):
+    # Unhalt non-leftmost replicas that have no remaining future layer
+    overlay = self.design_info.overlay
+    total_replicas = len(self.state.pm_reload)
+    if total_replicas > 1 and (target_itr is None or target_itr == layer.lcp.num_iter):
+      for sid in range(total_replicas):
+        if overlay.is_leftmost_in_batch(sid):
+          continue
         if not self.state.get_next_layer_for_stamp(sid, idx=1):
           self.impls[sid].continue_aie()
 
@@ -464,7 +452,6 @@ def run_layer(self, layer, target_itr=None, cur_it=None):
   def execute_and_dump(self):
     """
     Execute all layers in batch mode, dumping buffers as required.
-
     Primary entry point for batch mode execution in MLDebugger.
     """
     self.common_init()
@@ -475,12 +462,15 @@ def execute_and_dump(self):
                  f" stamps: {len(layer.stamps)}, iters {layer.lcp.num_iter}")
       self.schedule_layer_start(layer)
       self.run_layer(layer)
-      for sid in range(len(layer.stamps)):
-        self.state.pm_reload[sid] = self.check_pm_reload(sid)
+
+      # Only recompute reload state for replicas that run THIS layer
+      for sid, _ in enumerate(self.state.pm_reload):
+        if layer.runs_replica(sid):
+          self.state.pm_reload[sid] = self.check_pm_reload(sid)
 
     for sid in overlay.get_stampids():
       self.aie_utls[sid].initialize_stamp()
-      self.impls[sid].continue_aie()
+
     LOGGER.log("\nFinished Execution")
     self._handle_fsp()
     self._write_run_summary("SUCCESS")
diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py
index cbee01b..5e15f42 100644
--- a/src/mldebug/client_debug.py
+++ b/src/mldebug/client_debug.py
@@ -60,7 +60,11 @@ def __init__(self, args, ctx_id, pid, output_dir):
 
     try:
       self.design_info = LayerInfo(args)
-      self.state = DebugState(self.design_info.layers, self.design_info.overlay.get_stampcount())
+      self.state = DebugState(
+        self.design_info.layers,
+        self.design_info.overlay.get_stampcount(),
+        stamps_per_batch=self.design_info.overlay.get_stamps_per_batch(),
+      )
     except Exception as err:
       if debug_server:
         print("[INFO] closing debug server.")
@@ -82,10 +86,12 @@ def __init__(self, args, ctx_id, pid, output_dir):
       self.impls.append(impl)
       self.aie_utls.append(
         AIEUtil(
-          args.aie_iface, impl, self.design_info.overlay.get_tiles(stamp_id=i), self.design_info.work_dir.globals[i]
+          args.aie_iface, impl, self.design_info.overlay.get_tiles(stamp_id=i), self.design_info.work_dir.stamp(i).globals
         )
       )
 
+    self._quiesce_inactive_stamps()
+
     self.impl = self.impls[0]
     self.status_handle = AIEStatus(
       self.impl, self.design_info.overlay.get_tiles, args.aie_iface, self.design_info.overlay.get_repr()
@@ -109,6 +115,17 @@ def __init__(self, args, ctx_id, pid, output_dir):
       self.dumper.debug_server.close()
       sys.exit(0)
 
+  def _quiesce_inactive_stamps(self):
+    """
+    Clear debug-control registers on physical replicas excluded from the active
+    view so they run freely.
+    """
+    inactive_tiles = self.design_info.overlay.get_inactive_tiles()
+    if not inactive_tiles:
+      return
+    self.aie_utls[0].initialize_stamp(inactive_tiles)
+    LOGGER.log("[INFO] Using single stamp control. Please use multistamp flag for more data.")
+
   # --- Batch mode delegation ---
 
   def execute_and_dump(self):
@@ -291,10 +308,7 @@ def init_leftmost_stamp(self):
     For stamps with index > 1, initialize the stamp and continue execution.
     """
     self.impls[0].enable_pc_halt()
-    for sid, impl in enumerate(self.impls):
-      if sid > 0:
-        self.aie_utls[sid].initialize_stamp()
-        impl.continue_aie()
+    self._quiesce_inactive_stamps()
 
   def wreg_stamp(self, offset, val, sid=0):
     """
diff --git a/src/mldebug/debug_state.py b/src/mldebug/debug_state.py
index 0a54125..9ffc2dd 100644
--- a/src/mldebug/debug_state.py
+++ b/src/mldebug/debug_state.py
@@ -11,24 +11,28 @@ class DebugState:
   Keep Track of debug state
   """
 
-  def __init__(self, layers, stampcount) -> None:
+  def __init__(self, layers, stampcount, stamps_per_batch=1) -> None:
     """
     Initialize the DebugState object.
 
     Args:
-      layers (list): The list of layer objects that define the AIE execution steps.
-      stampcount (int): The number of stamps.
+      layers (list): Ordered list of BE (backend) layer objects.
+      stampcount (int): Number of replicas (batches * stamps_per_batch).
+      stamps_per_batch (int): Number of stamps within a single batch (S from BxSxCxR).
     """
     self.current_layer = -1
     self.cur_it = 1
     self.ofm_ping = True
     self.layers = layers
+    self.stamps_per_batch = stamps_per_batch
     self.manual_breakpoints = []
     # Run AIE to finish without invoking breakpoints
     self.continue_to_finish = False
     self.error = False
     self.break_on_stamp_scheduled = [False for _ in range(stampcount)]
     self.pm_reload = [False for _ in range(stampcount)]
+    # (sid, pm_reload, stamp) tuples for replicas in the current layer; set by schedule_layer_start
+    self.active_stamps_all_batches = None
 
   def update_layer(self):
     """
@@ -45,11 +49,17 @@ def update_layer(self):
 
   def get_next_layer_for_stamp(self, stamp_id, idx=0):
     """
-    Find the next layer that includes the specified stamp_id.
+    Find the next layer in which the given replica participates.
+
+    A layer's per-batch stamp count (`stamps_per_batch`) may be smaller than
+    the overlay's S, meaning higher-indexed stamps (within a batch) skip
+    that layer. We map the flat replica id to its per-batch stamp index
+    `s = stamp_id % S` and require `s < layer.stamps_per_batch`.
     """
+    s = stamp_id % self.stamps_per_batch
     for i in range(self.current_layer + idx, len(self.layers)):
       layer = self.layers[i]
-      if stamp_id < len(layer.stamps):
+      if s < getattr(layer, "stamps_per_batch", len(layer.stamps)):
         return layer
     return None
 
diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py
index 1c45cc7..11e9777 100644
--- a/src/mldebug/layer_info.py
+++ b/src/mldebug/layer_info.py
@@ -111,7 +111,7 @@ def __init__(self, entry, buf_type, size_shift, aie_iface, ifm=False, ofm=False,
       ping = entry["l1_ping"]
       pong = entry["l1_pong"]
       self.l1 = L1Buffer(int(ping[0], 16), ping[1] * size_shift, int(pong[0], 16), pong[1] * size_shift)
-    
+
     # Handle both "l2" format and "l2_ping/l2_pong" format
     l2_bufs_list = []
     if "l2_ping" in entry:
@@ -228,16 +228,19 @@ class Layer:
   Contains all buffer, iteration, and kernel (stamp) mapping information.
   """
 
-  def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_report):
+  def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_report, num_batches=1):
     """
     Initialize a Layer object using given metadata, populating buffer and kernel/stamp lists.
 
     Args:
-        info (dict): Layer metadata.
-        size_shift (int): Size shift parameter.
-        version: Software version object.
-        aie_iface: AIE interface object.
-        num_stamps (int): Number of stamps in overlay.
+      info (dict): Layer entry in buffer_info
+      size_shift (int): Size shift parameter.
+      version: Software version object.
+      aie_iface: AIE interface object.
+      num_stamps (int): Number of stamps per batch (S from BxSxCxR overlay).
+      mladf_report: Optional MladfReport for templated-graph layers.
+      num_batches (int): Number of batches (B from BxSxCxR overlay). Each
+        batch is a data-parallel copy of the per-batch stamps; defaults to 1.
     """
     self.flexml_ids = []
     self.l3_ifm_buffers = []
@@ -250,7 +253,11 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor
     self.lcp = Lcp()
     self.pm_work_dir = info.get("pm", None)
     self.is_unsupported = False
-    self.is_concat = False
+    self.num_batches = num_batches
+    # Global stamps-per-batch (S from BxSxCxR), captured before the per-layer
+    # reduction below. Flat replica ids map to a per-batch stamp via `sid % S`,
+    # so this is the correct modulus even when this layer runs fewer stamps.
+    self.overlay_stamps_per_batch = num_stamps
 
     self.lcp.is_tg = "templated_graph" in info
     kname = [i.lower() for i in info["kernel_name"]][0]
@@ -262,8 +269,17 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor
     n_stamps = info.get("no_of_stamps")
     if n_stamps and n_stamps < num_stamps:
       num_stamps = n_stamps
+
     self.stamps = [Stamp(name=kname) for _ in range(num_stamps)]
 
+    # 1. Layers without any kernel should be skipped
+    # 2. Unsupported superkernel should be skipped
+    if info.get("is_concat") or not kname or any(k in kname for k in unsupported_superkernels):
+      LOGGER.verbose_print(f"[WARNING] unsupported kernel {kname} at Layer {self.layer_order} will be skipped.")
+      self.is_unsupported = True
+      return
+
+    # Fill missing TG metadata from mladf report
     if self.lcp.is_tg:
       for sid, stamp in enumerate(self.stamps):
         stamp.name = mladf_report.get_skname_for_bilo(self.layer_order, sid)
@@ -275,14 +291,8 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor
       self.lcp.num_iter = mladf_report._get_iters_for_bilo(self.layer_order)
 
     self._initialize_l3_buffers(info, version)
-    # 1. Layers without any kernel should be skipped
-    # 2. Unsupported superkernel should be skipped
-    self.is_concat = info.get("is_concat") or not kname
-    if self.is_concat:
-      LOGGER.verbose_print(f"[WARNING] unsupported kernel {kname} at Layer {self.layer_order} will be skipped.")
-      self.is_unsupported = True
-      return
 
+    # No L2 support for templated graph layers
     if self.lcp.is_tg:
       return
 
@@ -291,6 +301,19 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor
     self._initialize_iters(info, version)
     LOGGER.verbose_print(f"{self.layer_order}: {kname} {self.lcp.num_iter}")
 
+  def runs_replica(self, sid):
+    """
+    Return True if replica `sid` runs in this layer; a layer may run fewer stamps than the overlay maximum.
+    """
+    return (sid % self.overlay_stamps_per_batch) < len(self.stamps)
+
+  def get_stamp(self, sid):
+    """
+    Per-batch stamp metadata for flat replica `sid`, indexed by `sid % S`.
+    Caller must ensure participation (see `runs_replica`).
+    """
+    return self.stamps[int(sid % self.overlay_stamps_per_batch)]
+
   def _initialize_flexml_ids(self, info):
     """
     Populate self.flexml_ids for this layer from its metadata.
@@ -457,12 +480,13 @@ def __init__(self, args):
         args: Namespace of configuration and input files (from argparser or similar).
     """
     self.layers = []
-    self.layout = [1, 4, 4]
+    # Layout for Overlay: (batches, stamps_per_batch, nrow, ncol). Updated by
+    # _read_buffer_info when a buffer_info.json is supplied.
+    self.layout = [1, 1, 4, 4]
     self.aie_iface = args.aie_iface
     self.x2 = False
     self.x2_work_dirs = {}
     self.layer_workdir_map = {}
-    self.device_batch_size = 1
     self.mladf_report = None
 
     has_bi = args.buffer_info and Path(args.buffer_info).is_file()
@@ -473,25 +497,28 @@ def __init__(self, args):
       data = self._read_buffer_info(args.buffer_info)
     # 2. Initialize Overlay from Layout
     self.overlay = Overlay(args, self.layout)
+    # Re-sync local view in case Overlay applied -o overrides.
+    num_batches = self.overlay.get_batch_count()
+    num_stamps = self.overlay.get_stamps_per_batch()
     # 3. Parse mladf report.
     # TBD: memory optimize this as this json can be large
     if not args.aie_only and has_bi and use_mladf:
       self.mladf_report = MladfReport(args.buffer_info, args.mladf_report, self.overlay.get_stampwidth())
     # 4. Initialize Layers
     if not args.aie_only:
-      num_stamps = len(self.overlay.get_stampids())
-      self._init_layers(data, args.aie_iface, num_stamps)
+      self._init_layers(data, args.aie_iface, num_stamps, num_batches)
     # 5: Parse work dir
     if self.x2:
       for layer in self.layers:
         if layer.pm_work_dir:
           path = os.path.join(args.aie_dir, layer.pm_work_dir)
           if layer.pm_work_dir not in self.x2_work_dirs:
-            self.x2_work_dirs[layer.pm_work_dir] = WorkDir(path, args.peano, self.overlay)
+            self.x2_work_dirs[layer.pm_work_dir] = WorkDir(path, args.peano, self.overlay, self.aie_iface.ARCH_NAME)
           self.layer_workdir_map[layer.layer_order] = self.x2_work_dirs[layer.pm_work_dir]
       self.work_dir = next(iter(self.layer_workdir_map.values()))
     else:
-      self.work_dir = WorkDir(args.aie_dir, args.peano, self.overlay, args.run_flags.dump_temps)
+      self.work_dir = WorkDir(args.aie_dir, args.peano, self.overlay,
+                              self.aie_iface.ARCH_NAME, args.run_flags.dump_temps)
 
     if not args.aie_only:
       # Set PC Value for layers
@@ -528,12 +555,12 @@ def print_aie_functions(self, elf_id=None):
 
   def is_stamped(self):
     """
-    Check if design is a multi-stamp (multi-superkernel) program.
+    Check if the design has more than one stamp per batch.
 
     Returns:
-        bool: True if stamped/multi-stamp, False otherwise.
+        bool: True if stamps_per_batch > 1, False otherwise.
     """
-    return len(self.overlay.get_stampids()) > 1
+    return self.overlay.get_stamps_per_batch() > 1
 
   def is_batched(self):
     """
@@ -542,7 +569,7 @@ def is_batched(self):
     Returns:
         bool: True if more than one batch, False otherwise.
     """
-    return self.device_batch_size > 1
+    return self.overlay.get_batch_count() > 1
 
   def _create_info(self):
     """
@@ -554,18 +581,24 @@ def _create_info(self):
     info = {}
     for n in range(len(self.overlay.get_stampids())):
       info[n] = {}
+    s_per_batch = self.overlay.get_stamps_per_batch()
     for layer in self.layers:
       order = layer.layer_order
-      for sid, stamp in enumerate(layer.stamps):
-        imap = info[sid]
-        elf = stamp.elf_name
-        if not elf:
-          continue
-        if elf not in imap:
-          imap[elf] = [order, order]
-        else:
-          imap[elf][0] = min(imap[elf][0], order)
-          imap[elf][1] = max(imap[elf][1], order)
+      # Map each per-batch stamp to its flat replica id in every batch:
+      # sid = b * S + s. Reduced layers (len(layer.stamps) < S) simply omit
+      # the higher per-batch stamps, mirroring participation across batches.
+      for b in range(self.overlay.get_batch_count()):
+        for s, stamp in enumerate(layer.stamps):
+          sid = b * s_per_batch + s
+          imap = info[sid]
+          elf = stamp.elf_name
+          if not elf:
+            continue
+          if elf not in imap:
+            imap[elf] = [order, order]
+          else:
+            imap[elf][0] = min(imap[elf][0], order)
+            imap[elf][1] = max(imap[elf][1], order)
     return info
 
   def print_info(self):
@@ -576,7 +609,7 @@ def print_info(self):
     sep = "--------------------------------------------"
     m = "Design info (Excluding TG Layer IDs)\n"
     m += f"{sep}\nFlexml Layer Count: {len(self.layers)}\n{sep}"
-    if not self.work_dir.elf_flxmlid_maps or not self.layers:
+    if not self.work_dir.stamps or not self.layers:
       return
     for sid, imap in info.items():
       m += f"\nStamp {sid}: "
@@ -649,13 +682,15 @@ def initialize_l3_layer_mapping(self, flexmlrt_hsi, external_buffer_id):
     for layer in self.layers:
       layer.l3_buffers = layer.l3_ofm_buffers if self.x2 else layer.l3_ifm_buffers
 
-      # Duplicate L3 buffers for multi-stamp designs (batched designs)
+      # Duplicate L3 buffers per additional batch (data-parallel copies).
+      # Stamps within a batch share the same L3 IFM/OFM region, so we only
+      # replicate across batches, not across per-batch stamps.
       if self.is_batched():
         original_buffers = list(layer.l3_buffers)
-        for stamp_idx in range(1, self.device_batch_size):
+        for b in range(1, self.overlay.get_batch_count()):
           for orig_buffer in original_buffers:
             stamped_buffer = L3Buffer(
-              name=f"{orig_buffer.name}_stamp_{stamp_idx}",
+              name=f"{orig_buffer.name}_stamp_{b}",
               tensor_name=orig_buffer.tensor_name,
               size=orig_buffer.size,
               offset=None
@@ -733,7 +768,15 @@ def _overlap(self, buf1, buf2):
 
   def _read_buffer_info(self, buffer_info_file):
     """
-    Load and parse the buffer_info JSON, extracting layout and batch size.
+    Load and parse the buffer_info JSON, extracting the (B, S, R, C) layout.
+
+    buffer_info encodes the 4D overlay shape across three fields:
+      .meta.layout            -> [stamps_in_overlay, R, C]
+      .meta.device_batch_size -> B (number of data-parallel batch copies)
+      .meta.max_stamps_used   -> S (per-batch stamps actually used; may be
+                                   smaller than stamps_in_overlay). Falls
+                                   back to max(no_of_stamps) across layers,
+                                   then to the layout's stamp count.
 
     Args:
         buffer_info_file (str): Path to buffer_info JSON.
@@ -741,43 +784,42 @@ def _read_buffer_info(self, buffer_info_file):
     Returns:
         dict: Parsed JSON object from file.
     Side Effects:
-        - Sets self.layout, self.device_batch_size, self.x2.
+        - Sets self.layout to (B, S, R, C), self.x2.
     """
     print("Initializing Buffer Info ...")
     with open(buffer_info_file, encoding="utf-8") as fd:
       data = json.load(fd)
-    self.layout = data[".meta"].get("layout")
-    self.device_batch_size = data[".meta"].get("device_batch_size", 1)
-
-    # Layout now represents Full overlay but design can choose
-    # to use only a part of it
-    stampcount = data[".meta"].get("max_stamps_used")
-    if stampcount:
-      self.layout[0] = stampcount
-    elif data.get("layers"):
-      self.layout[0] = max(lyr.get("no_of_stamps", 1) for _, lyr in data["layers"].items() )
-    # Else use old style
-
-    # Treat mBnS as 1BnS
-    if self.device_batch_size > 1:
-      if self.layout[0] > 1:
-        LOGGER.log("[WARNING] Currently mBatch x nStamp is unsupported. Setting batchcount to 1.")
-        self.device_batch_size = 1
+
+    raw_layout = data[".meta"].get("layout") or [1, 4, 4]
+    overlay_stamps, nrow, ncol = raw_layout
+
+    # B (batches) comes from device_batch_size.
+    batches = data[".meta"].get("device_batch_size", 1)
+
+    # S (per-batch stamps used) comes from max_stamps_used, with sensible
+    # fallbacks: layer hints, then the overlay's nominal stamp count.
+    stamps = data[".meta"].get("max_stamps_used")
+    if not stamps:
+      if data.get("layers"):
+        stamps = max(lyr.get("no_of_stamps", 1) for _, lyr in data["layers"].items())
       else:
-        self.layout[0] = self.device_batch_size
-        LOGGER.log("Batched design detected")
+        stamps = overlay_stamps
 
+    self.layout = (batches, stamps, nrow, ncol)
+    if batches > 1:
+      LOGGER.log("Batched design detected")
     self.x2 = data[".meta"].get("flow") == "x2"
     return data
 
-  def _init_layers(self, raw_info, aie_iface, num_stamps):
+  def _init_layers(self, raw_info, aie_iface, num_stamps, num_batches=1):
     """
     Parse all layer entries from metadata and populate self.layers.
 
     Args:
         raw_info (dict): Parsed buffer_info JSON metadata.
         aie_iface: AIE interface object.
-        num_stamps (int): Number of stamps identified from overlay.
+        num_stamps (int): Stamps per batch (S from BxSxCxR).
+        num_batches (int): Number of batches (B from BxSxCxR).
     """
     version = Version.from_string(raw_info[".meta"]["version"])
     size_shift = raw_info[".meta"].get("size_shift")
@@ -794,7 +836,8 @@ def _init_layers(self, raw_info, aie_iface, num_stamps):
     raw_layers = sorted(raw_layers.items(), key=lambda item: item[1]["layer_order"])
     for entry in raw_layers:
       info = entry[1]
-      layer = Layer(info, size_shift, version, aie_iface, num_stamps, self.mladf_report)
+      layer = Layer(info, size_shift, version, aie_iface, num_stamps, self.mladf_report,
+                    num_batches=num_batches)
       self.layers.append(layer)
 
   def _initialize_layers_from_workdir_x2(self, args):
@@ -816,10 +859,11 @@ def _initialize_layers_from_workdir_x2(self, args):
     self.layers = [layer for layer in self.layers if not layer.lcp.is_tg]
     if not self.layers:
       raise RuntimeError("No layers found in the design.")
-    for sid in self.overlay.get_stampids():
+    # Resolve PCs once per stamp
+    for sid in range(self.overlay.get_stamps_per_batch()):
       for layer in self.layers:
-        flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[sid].values())[0]
-        self.layer_workdir_map[layer.layer_order].pm_reload_en[sid] = True
+        flist = list(self.layer_workdir_map[layer.layer_order].stamps[sid].aie_functions.values())[0]
+        self.layer_workdir_map[layer.layer_order].stamps[sid].pm_reload_en = True
         for f in flist:
           if _strip_template(layer.stamps[sid].name.lower()) == _strip_template(f.name.lower()):
             stamp = layer.stamps[sid]
@@ -856,33 +900,47 @@ def _initialize_layers_from_workdir(self, args):
 
     # Hierarchy of Data:
     # Stamp <- Elf <- Layers
-    # AIECompiler only knows flexmlIDs so we use that to match with correct layer
-    for sid in self.overlay.get_stampids():
-      has_pm_reload = self.work_dir.pm_reload_en[sid]
-      for elf_name, flist in self.work_dir.aie_functions[sid].items():
-        LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}")
-        elf_id = elf_name.split("reloadable")[-1]
-        for f, l in itertools.product(flist, self.layers):
-          if sid > len(l.stamps) - 1:
-            continue
-          if _strip_template(l.stamps[sid].name.lower()) == _strip_template(f.name.lower()):
-            stamp = l.stamps[sid]
-            if l.lcp.is_tg and stamp.elf_name == elf_id:
-              stamp.start_pc = f.start_pc
-              if f.name.lower() not in skip_end_pc_kernels:
-                stamp.end_pc = f.final_lock_release_pc
-              continue
-            # Check if this layer is present in the elf
-            # In buffer_info the flexml_ids might not be in order of stamps
-            if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[sid][elf_id] for i in l.flexml_ids):
-              continue
-            LOGGER.verbose_print("Layer found:", l.layer_order, stamp.name)
-            stamp.elf_name = elf_id
-            stamp.start_pc = f.start_pc
-            if f.name.lower() not in skip_end_pc_kernels:
-              stamp.end_pc = f.final_lock_release_pc
+    # AIECompiler only knows flexmlIDs so we use that to match with correct layer.
+    # For each layer we pick the ELF its kernel lives in, then fill in the PCs.
+    for sid in range(self.overlay.get_stamps_per_batch()):
+      aiec_info = self.work_dir.stamp(sid)
+      # Index functions by elf_id and stripped name for direct lookup.
+      funcs_by_elf = {
+        elf_name.split("reloadable")[-1]:
+          {_strip_template(f.name.lower()): f for f in flist}
+        for elf_name, flist in aiec_info.aie_functions.items()
+      }
+      for layer in self.layers:
+        if sid >= len(layer.stamps):
+          continue
+        stamp = layer.stamps[sid]
+        key = _strip_template(stamp.name.lower())
+        # Pick the ELF this layer's kernel comes from.
+        if layer.lcp.is_tg:
+          # TG layers already carry their elf_name.
+          elf_id = stamp.elf_name
+        elif aiec_info.pm_reload_en:
+          # In buffer_info the flexml_ids might not be in order of stamps, so
+          # match on flexml-id membership and name within the same ELF.
+          elf_id = next((e for e, fns in funcs_by_elf.items()
+                         if key in fns
+                         and any(i in aiec_info.elf_flxmlid_maps[e] for i in layer.flexml_ids)),
+                        None)
+        else:
+          elf_id = next((e for e, fns in funcs_by_elf.items() if key in fns), None)
+
+        f = funcs_by_elf.get(elf_id, {}).get(key) if elf_id is not None else None
+        if f is None:
+          continue
+        LOGGER.verbose_print("Layer found:", layer.layer_order, stamp.name)
+        if not layer.lcp.is_tg:
+          stamp.elf_name = elf_id
+        stamp.start_pc = f.start_pc
+        if f.name.lower() not in skip_end_pc_kernels:
+          stamp.end_pc = f.final_lock_release_pc
 
     # Under right conditions, we don't even go through iterations
+    # This is an optional enhancement for stability.
     if args.run_flags.skip_iter:
       for idx, layer in enumerate(self.layers):
         if idx >= len(self.layers) - 1:
diff --git a/src/mldebug/memory_dumper.py b/src/mldebug/memory_dumper.py
index c2da679..10ad0c0 100644
--- a/src/mldebug/memory_dumper.py
+++ b/src/mldebug/memory_dumper.py
@@ -119,13 +119,8 @@ def dump_memory_l2(self, buffers, it, layer_order=None, use_l2_names=False, sid=
       return
 
     overlay = self.design_info.overlay
-    # batch + stamp combination doesn't exist
-    if self.design_info.is_batched():
-      batch = str(sid)
-      suffix = "stamp0"
-    else:
-      batch = "0"
-      suffix = f"stamp{sid}"
+    batch = str(overlay.replica_to_batch(sid))
+    suffix = f"stamp{overlay.replica_to_stamp(sid)}"
 
     for buffer in buffers:
       if buffer.ofm:
@@ -161,9 +156,7 @@ def dump_memory_l1(self, buffers, it, is_ping=None, sid=0):
     if self.args.run_flags.skip_dump or self.args.run_flags.l2_dump_only:
       return
 
-    batch = "0"
-    if self.design_info.is_batched():
-      batch = str(sid)
+    batch = str(self.design_info.overlay.replica_to_batch(sid))
 
     for buffer in buffers:
       if not buffer.l1:
diff --git a/src/mldebug/utils.py b/src/mldebug/utils.py
index 16eaa5d..92db172 100644
--- a/src/mldebug/utils.py
+++ b/src/mldebug/utils.py
@@ -248,6 +248,33 @@ def wrapper(*args, **kwargs):
   return wrapper
 
 
+def wait_until(predicate, *, timeout=10.0, interval=0.1, on_timeout=None):
+  """
+  Poll ``predicate`` until it returns truthy or ``timeout`` seconds elapse.
+
+  Uses ``time.monotonic`` so it is immune to wall-clock jumps.
+
+  Args:
+    predicate (callable): Zero-arg callable returning truthy when done.
+    timeout (float): Max seconds to wait.
+    interval (float): Seconds to sleep before each poll (including the first).
+    on_timeout (callable, optional): Called once if the timeout fires
+      (e.g. to log a diagnostic).
+
+  Returns:
+    bool: True if ``predicate`` became truthy, False on timeout.
+  """
+  start = time.monotonic()
+  while True:
+    time.sleep(interval)
+    if predicate():
+      return True
+    if time.monotonic() - start > timeout:
+      if on_timeout is not None:
+        on_timeout()
+      return False
+
+
 def print_tile_grid(title, tiles, register_values=None, format_type="hex"):
   """
   Prints a grid visualization of tile information and optional register values.
diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py
index 0f9bb3e..9ddb2f1 100644
--- a/src/mldebug/work_dir.py
+++ b/src/mldebug/work_dir.py
@@ -9,7 +9,7 @@
 import re
 import subprocess
 from importlib import resources
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 
 from mldebug.extra.calltree import AIECallTree
@@ -53,6 +53,30 @@ class GlobalVar:
   size: int
 
 
+@dataclass
+class StampInfo:
+  """
+  Per-stamp data parsed from the work directory.
+
+  One instance exists per per-batch stamp (S of them). Batch copies run the
+  same ELFs, so callers map a flat replica id (b*S + s) back to its stamp via
+  WorkDir.stamp(sid) rather than storing B*S duplicates.
+  """
+
+  # True when the stamp has reloadable ELFs (program-memory reload).
+  pm_reload_en: bool = False
+  # elf_name -> list[AIEFunction]
+  aie_functions: dict = field(default_factory=dict)
+  # elf partition -> list of flexml layer ids (only set when pm reload).
+  elf_flxmlid_maps: dict = field(default_factory=dict)
+  # list[GlobalVar] for lcpPing/lcpPong (empty until the first var is found).
+  globals: list = field(default_factory=list)
+  # Lock acquire instruction PC after layer execution (used for skip_iter).
+  post_layer_lock_acq_pc: int = 0
+  # list[(elf_name, lst_text)] captured during LLVM parsing.
+  lst_map: list = field(default_factory=list)
+
+
 def _parse_flexml_layer_id(objstr):
   """
   Parse a layer index from an object string using the current BE naming convention.
@@ -78,7 +102,7 @@ class WorkDir:
   Abstraction for AIE Work Directory
   """
 
-  def __init__(self, aie_dir, peano, overlay, dump_lst=False):
+  def __init__(self, aie_dir, peano, overlay, arch_name,  dump_lst=False):
     """
     Initialize the AIE Work Directory abstraction. Sets up internal state and parses functions.
     Args:
@@ -87,31 +111,31 @@ def __init__(self, aie_dir, peano, overlay, dump_lst=False):
         overlay: Overlay object with get_stampids() and get_first_relative_core_tile().
         dump_lst (bool): Whether to dump LST files.
     """
-    num_stamps = len(overlay.get_stampids())
-
-    self.pm_reload_en = [False] * num_stamps
-    self.aie_functions = [None] * num_stamps
-    self.elf_flxmlid_maps = [None] * num_stamps
-    self.globals = [None] * num_stamps
     self.peano = peano
     self.aie_dir = aie_dir
-    self.dump_lst = dump_lst
-    # Lock acquire instruction PC after layer execution
-    # This pc can be used for skip_iter
-    self.post_layer_lock_acq_pcs = [0] * num_stamps
+    self.stamps_per_batch = overlay.get_stamps_per_batch()
+    self.stamps = [StampInfo() for _ in range(self.stamps_per_batch)]
 
-    self._stamp_lst_map = {}
-    for sid in range(num_stamps):
-      self._stamp_lst_map[sid] = []
+    self._initialize_functions(aie_dir, overlay, arch_name, dump_lst)
 
-    self._initialize_functions(aie_dir, overlay)
+  def stamp(self, sid):
+    """
+    Map a flat replica id (b*S + s) back to its per-batch StampInfo. Batch
+    copies have the same ELFs, so all replicas of stamp s see the same data.
+
+    Args:
+        sid (int): Flat replica id.
+    Returns:
+        StampInfo: The per-batch stamp info for this replica.
+    """
+    return self.stamps[sid % self.stamps_per_batch]
 
   def _check_for_lock_acq(self, line, sid, llvm):
     """
     find lock acq in base lst
     """
     if "acq" in line.lower():
-      self.post_layer_lock_acq_pcs[sid] = self._get_pc(line, llvm)
+      self.stamps[sid].post_layer_lock_acq_pc = self._get_pc(line, llvm)
 
   def _demangle(self, fstring):
     """
@@ -141,7 +165,7 @@ def _parse_aie_runtime_control(self, work_dir, col, row, stampid):
         row (int): Row of the target core tile.
         stampid (int): Stamp index into overlay.
     Side effects:
-        Updates self.elf_flxmlid_maps[stampid] to map ELF partitions to layers.
+        Updates self.stamps[stampid].elf_flxmlid_maps to map ELF partitions to layers.
     """
     elf_layer_map = {}
     # Elfs for different columns can be reloaded in same line so we have to create multiple groups
@@ -163,36 +187,36 @@ def _parse_aie_runtime_control(self, work_dir, col, row, stampid):
             if layeridx in elf_layer_map[par]:
               break
             elf_layer_map[par].append(layeridx)
-    self.elf_flxmlid_maps[stampid] = elf_layer_map
+    self.stamps[stampid].elf_flxmlid_maps = elf_layer_map
 
-  def _get_lst(self, elf_path, elf_name):
+  def _get_lst(self, elf_path, elf_name, arch_name, dump_lst):
     """
     Generate and fetch a disassembly listing (lst) for an ELF file using llvm-objdump.
 
     Args:
         elf_path (str): Path to the ELF binary.
         elf_name (str): Base ELF file name (stem).
+        arch_name (str): Target architecture name passed to llvm-objdump.
+        dump_lst (bool): Whether to write the output listing to disk.
 
     Returns:
         str: Decoded assembly listing as text.
     Side effects:
-        If self.dump_lst is True, writes the output listing to disk.
+        If dump_lst is True, writes the output listing to disk.
     """
     lst_data = ""
     exe = "llvm-objdump.elf"
-    archname = "aie2p"
     if is_windows():
       exe = "llvm-objdump.exe"
     elif is_aarch64():
       exe = "llvm-objdump.aarch64"
-      archname = "aie2ps"
     with resources.as_file(resources.files("mldebug") / "bin" / exe) as objdump_path:
       lst = subprocess.check_output(
-        [str(objdump_path), "-d", "-z", "--no-show-raw-insn", f"--arch-name={archname}", "-C", elf_path]
+        [str(objdump_path), "-d", "-z", "--no-show-raw-insn", f"--arch-name={arch_name}", "-C", elf_path]
       )
       lst_data = lst.decode("utf-8")
 
-    if self.dump_lst:
+    if dump_lst:
       fname = elf_name + ".lst"
       print("Writing assembly listing to:", fname)
       with open(fname, "w", encoding="utf8") as fd:
@@ -259,52 +283,58 @@ def _breakpoint_allowed(self, lines, i):
         return False
     return True
 
-  def _initialize_functions(self, work_dir, overlay):
+  def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst):
     """
     Parse work directory and its ELF files to extract function ranges, tail calls,
     global variables, and layer/partition info.
 
+    For batched + stamped designs we only parse one batch's worth of stamps
+    (S replicas). The same ELF binaries are loaded into the additional batch
+    columns, so the PCs and global addresses are identical; callers reach the
+    batch copies through self.stamp(sid) (sid % S).
+
     Args:
         work_dir (str): Path to the AIE work directory.
         overlay: Overlay object for tile mapping.
+        arch_name (str): Target architecture name passed to llvm-objdump.
+        dump_lst (bool): Whether to write disassembly listings to disk.
     Side effects:
-        Populates aie_functions, pm_reload_en, globals, elf_flxmlid_maps.
+        Populates self.stamps[s] for each per-batch stamp.
     """
     print("[INFO] Try to detect Work Directory ...")
     full_path = Path(work_dir + "/aie/")
     if not Path.exists(full_path):
       LOGGER.log(f"[INFO] Work directory {full_path} does not exist.")
       return
-    for stampid in overlay.get_stampids():
-      col, row = overlay.get_first_relative_core_tile(stampid)
+    for s in range(self.stamps_per_batch):
+      col, row = overlay.get_first_relative_core_tile(s)
       core_name = f"{col}_{row}"
       print(f"Core: {core_name}")
       plist = []
       for elf in full_path.glob(f"{core_name}*"):
         plist.append(elf)
       if len(plist) > 1:
-        self.pm_reload_en[stampid] = True
-        self._parse_aie_runtime_control(work_dir, col, row, stampid)
-      self.aie_functions[stampid] = {}
+        self.stamps[s].pm_reload_en = True
+        self._parse_aie_runtime_control(work_dir, col, row, s)
 
       # Parse LST
       for p in plist:
         LOGGER.verbose_print(f"[INFO] Process: {p}")
         if not self.peano:
-          success = self._parse_lst_chess(p, stampid)
+          success = self._parse_lst_chess(p, s)
           if not success:
             print(f"[WARNING] Failed to parse LST for {p}. Assuming peano compiler.")
             self.peano = True
         if self.peano:
-          self._parse_lst_llvm(p, stampid)
+          self._parse_lst_llvm(p, s, arch_name, dump_lst)
 
       # Parse map file to find LCP
       # Only base map file has global variables
       first_elf = full_path / core_name
       if self.peano:
-        self._extract_globals_llvm(first_elf, stampid)
+        self._extract_globals_llvm(first_elf, s)
       else:
-        self._extract_globals_chess(first_elf, stampid)
+        self._extract_globals_chess(first_elf, s)
 
   def _parse_lst_chess(self, elf, stampid):
     """
@@ -318,7 +348,7 @@ def _parse_lst_chess(self, elf, stampid):
     Returns:
         bool: True if parsed successfully, False if the LST file doesn't exist.
     Side effects:
-        Populates self.aie_functions[stampid][elf_name] with AIEFunction objects.
+        Populates self.stamps[stampid].aie_functions[elf_name] with AIEFunction objects.
     """
     elf_name = elf.stem
     lst_file = f"{elf}/Release/{elf_name}.lst"
@@ -327,7 +357,7 @@ def _parse_lst_chess(self, elf, stampid):
 
     is_base = "reloadable" not in elf_name
 
-    self.aie_functions[stampid][elf_name] = []
+    self.stamps[stampid].aie_functions[elf_name] = []
     with open(lst_file, encoding="utf-8") as fd:
       lines = fd.read().split("\n")
     count = len(lines)
@@ -377,7 +407,7 @@ def _parse_lst_chess(self, elf, stampid):
             i -= 1
             break
           i += 1
-        self.aie_functions[stampid][elf_name].append(
+        self.stamps[stampid].aie_functions[elf_name].append(
           AIEFunction(demangled, start_pc, end_pc, final_lock_release_pc, tail_call)
         )
       i += 1
@@ -427,9 +457,9 @@ def _extract_globals_llvm(self, elf, sid):
 
     Args:
         elf (Path): Path object of the ELF directory.
-        sid (int): Index into self.globals for this stamp.
+        sid (int): Index into self.stamps for this stamp.
     Side effects:
-        Appends GlobalVar objects to self.globals[sid] for lcpPing/lcpPong if present.
+        Appends GlobalVar objects to self.stamps[sid].globals for lcpPing/lcpPong if present.
     """
     mapfile_path = f"{elf}/Release/{elf.stem}.map"
     if not Path(mapfile_path).exists():
@@ -438,21 +468,21 @@ def _extract_globals_llvm(self, elf, sid):
 
     def _extract_var(lines, var_name):
       """
-      Find and add a global variable by name from the given lines to self.globals[sid].
+      Find and add a global variable by name from the given lines to self.stamps[sid].globals.
       Args:
           lines (List[str]): Lines of map file.
           var_name (str): Variable name to search for.
       Side effects:
-          Updates self.globals[sid].
+          Updates self.stamps[sid].globals.
       """
-      if not self.globals[sid]:
-        self.globals[sid] = []
+      if not self.stamps[sid].globals:
+        self.stamps[sid].globals = []
       for line in lines:
         if var_name in line:
           tokens = line.split()
           if len(tokens) >= 3:
             try:
-              self.globals[sid].append(GlobalVar(var_name, int(tokens[0], base=16), int(tokens[2], base=16)))
+              self.stamps[sid].globals.append(GlobalVar(var_name, int(tokens[0], base=16), int(tokens[2], base=16)))
               LOGGER.verbose_print(f"[INFO] Found global variable: {var_name} at {tokens[0]} size {tokens[2]}")
             except ValueError:
               pass  # Ignore lines that cannot be parsed
@@ -470,9 +500,9 @@ def _extract_globals_chess(self, elf, sid):
 
     Args:
         elf (Path): Path object of the ELF directory.
-        sid (int): Index into self.globals for this stamp.
+        sid (int): Index into self.stamps for this stamp.
     Side effects:
-        Appends GlobalVar objects to self.globals[sid] for lcpPing/lcpPong if present.
+        Appends GlobalVar objects to self.stamps[sid].globals for lcpPing/lcpPong if present.
     """
     mapfile_path = f"{elf}/Release/{elf.stem}.map"
     if not Path(mapfile_path).exists():
@@ -481,15 +511,15 @@ def _extract_globals_chess(self, elf, sid):
 
     def _extract_var(lines, var_name):
       """
-      Find and add a global variable by name from the given lines to self.globals[sid].
+      Find and add a global variable by name from the given lines to self.stamps[sid].globals.
       Args:
           lines (List[str]): Lines of map file.
           var_name (str): Variable name to search for.
       Side effects:
-          Updates self.globals[sid].
+          Updates self.stamps[sid].globals.
       """
-      if not self.globals[sid]:
-        self.globals[sid] = []
+      if not self.stamps[sid].globals:
+        self.stamps[sid].globals = []
       for line in lines:
         if var_name in line:
           tokens = line.split()[0].split("..")
@@ -498,7 +528,7 @@ def _extract_var(lines, var_name):
               start_addr = int(tokens[0], base=16)
               end_addr = int(tokens[1], base=16)
               size = end_addr - start_addr + 1
-              self.globals[sid].append(GlobalVar(var_name, start_addr, size))
+              self.stamps[sid].globals.append(GlobalVar(var_name, start_addr, size))
               LOGGER.verbose_print(f"[INFO] Found global variable: {var_name} at {start_addr} size {size}")
             except ValueError:
               pass  # Ignore lines that cannot be parsed
@@ -509,7 +539,7 @@ def _extract_var(lines, var_name):
       _extract_var(lines, "lcpPing")
       _extract_var(lines, "lcpPong")
 
-  def _parse_lst_llvm(self, elf, stampid):
+  def _parse_lst_llvm(self, elf, stampid, arch_name, dump_lst):
     """
     Parse LLVM-based LST disassembly to extract functions, boundaries,
     final lock release instructions, and tail call status.
@@ -517,19 +547,21 @@ def _parse_lst_llvm(self, elf, stampid):
     Args:
         elf (Path): Path object for the ELF file directory.
         stampid (int): Index into aie_functions.
+        arch_name (str): Target architecture name passed to llvm-objdump.
+        dump_lst (bool): Whether to write disassembly listings to disk.
     Side effects:
-        Populates self.aie_functions[stampid][elf_name] with AIEFunction objects.
+        Populates self.stamps[stampid].aie_functions[elf_name] with AIEFunction objects.
     """
     elf_name = elf.stem
     elf_path = f"{elf}/Release/{elf.stem}"
-    data = self._get_lst(elf_path, elf_name)
-    self._stamp_lst_map[stampid].append((elf_name, self._get_lst(elf_path, elf_name)))
+    data = self._get_lst(elf_path, elf_name, arch_name, dump_lst)
+    self.stamps[stampid].lst_map.append((elf_name, data))
     lines = data.split("\n")
 
     is_base = "reloadable" not in elf_name
 
-    self.aie_functions[stampid][elf_name] = []
-    flist = self.aie_functions[stampid][elf_name]
+    self.stamps[stampid].aie_functions[elf_name] = []
+    flist = self.stamps[stampid].aie_functions[elf_name]
     in_func = None
     for i, line in enumerate(lines):
       # function call
@@ -578,7 +610,7 @@ def find_functions_by_pc(self, pc):
         List[str]: List of "<elf>:<funcname>" strings whose PC range covers the input.
     """
     funclist = []
-    fmap = self.aie_functions[0]
+    fmap = self.stamps[0].aie_functions
     if fmap:
       for elf, flist in fmap.items():
         for func in flist:
@@ -596,21 +628,23 @@ def print_aie_functions(self, elf_id=None):
     Side effects:
         Prints formatted function info to stdout.
     """
-    if all(x is None for x in self.aie_functions):
+    if all(not si.aie_functions for si in self.stamps):
       print("No functions found in design. Please specify aiedir option.")
       return
 
     sep = "--------------------------------------------"
 
     if elf_id:
-      for fmap in self.aie_functions:
+      for si in self.stamps:
+        fmap = si.aie_functions
         if elf_id in fmap:
           print(f"{sep}\nFunctions in {elf_id}\n{sep}")
           for f in fmap[elf_id]:
             print(f)
           return
 
-    for stamp, fmap in enumerate(self.aie_functions):
+    for stamp, si in enumerate(self.stamps):
+      fmap = si.aie_functions
       if not fmap:
         continue
       print(f"{sep}\nElfs in Stamp: {stamp}\n{sep}")
@@ -628,10 +662,10 @@ def print_calltree(self, sid=0):
     Args:
         sid (int): Stamp index.
     """
-    if sid not in self._stamp_lst_map:
-      LOGGER.log(f"[ERROR] Stamp {sid} not found in _stamp_lst_map.")
+    if not 0 <= sid < self.stamps_per_batch:
+      LOGGER.log(f"[ERROR] Stamp {sid} out of range.")
       return
-    for elf_id, lst_content in self._stamp_lst_map[sid]:
+    for elf_id, lst_content in self.stamps[sid].lst_map:
       LOGGER.log(f"[INFO] Printing calltree for {elf_id}\n")
       tree = AIECallTree.from_string(lst_content)
       tree.print_calltree()
@@ -644,10 +678,10 @@ def dump_lst_to_file(self, sid=0):
     Args:
         sid (int): Stamp index.
     """
-    if sid not in self._stamp_lst_map:
-      LOGGER.log(f"[ERROR] Stamp {sid} not found in _stamp_lst_map.")
+    if not 0 <= sid < self.stamps_per_batch:
+      LOGGER.log(f"[ERROR] Stamp {sid} out of range.")
       return
-    for elf_id, lst_content in self._stamp_lst_map[sid]:
+    for elf_id, lst_content in self.stamps[sid].lst_map:
       with open(f"{elf_id}.lst", "w", encoding="utf-8") as fd:
         fd.write(lst_content)
       LOGGER.log(f"[INFO] LST file dumped to {elf_id}.lst")