From c4d49a2d162edd5175fa31e3a270620716ad5bdb Mon Sep 17 00:00:00 2001 From: anurag Date: Wed, 27 May 2026 04:35:45 -0600 Subject: [PATCH 01/17] add time helper util Signed-off-by: anurag --- src/mldebug/aie_util.py | 38 ++++++++++++++++--------------------- src/mldebug/batch_runner.py | 12 +++--------- src/mldebug/utils.py | 27 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/mldebug/aie_util.py b/src/mldebug/aie_util.py index 9a78257..1487a20 100644 --- a/src/mldebug/aie_util.py +++ b/src/mldebug/aie_util.py @@ -5,9 +5,7 @@ Manages high level interaction with AIE """ -import time - -from mldebug.utils import LOGGER +from mldebug.utils import LOGGER, wait_until class AIEUtil: @@ -159,20 +157,21 @@ def skip_iterations(self, count, sid): write(reg_map["DEBUG_CONTROL1"], perf_cntr_event << 16) self.impl.continue_aie() # Step3: Poll all tiles until every PERF_CNTR_1 reaches the specified count. - timeout = 10 - start_time = time.time() perf_cntr_1 = reg_map["PERF_CNTR_1"] - while True: - time.sleep(0.1) - values = self.read_aie_regs(perf_cntr_1) - if all(v == count for v in values.values()): - break - if time.time() - start_time > timeout: - LOGGER.log( - f"{sid}: Timeout waiting for skip {count} iterations across tiles! " - f"Design might be hung. Values={values}" - ) - return False + last = {} + + def reached(): + last["values"] = self.read_aie_regs(perf_cntr_1) + return all(v == count for v in last["values"].values()) + + def on_timeout(): + LOGGER.log( + f"{sid}: Timeout waiting for skip {count} iterations across tiles! " + f"Design might be hung. 
Values={last['values']}" + ) + + if not wait_until(reached, on_timeout=on_timeout): + return False # Step6: Reset debug control to stop at program counter event pc_event = self._get_eventid("PC_0_CORE") @@ -188,12 +187,7 @@ def skip_iterations_to_lock_acq(self, lock_acq_pc, count, sid): self.impl.set_pc_breakpoint(lock_acq_pc) self.impl.continue_aie() - timeout = 10 - start_time = time.time() - while time.time() - start_time < timeout: - time.sleep(0.1) - if self.impl.poll_core_status(): - break + wait_until(self.impl.poll_core_status) pcs = self.impl.read_core_pc(True) is_valid = self.pcs_match_target(pcs, lock_acq_pc) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 8463e92..3ee46a0 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -17,7 +17,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed -from mldebug.utils import LOGGER, cleanup_and_exit, timeit +from mldebug.utils import LOGGER, cleanup_and_exit, timeit, wait_until class BatchRunner: @@ -244,14 +244,8 @@ def schedule_layer_start(self, next_layer): be.continue_aie() # Poll stamps until breakpoint is hit - timeout = 10 - start_time = time.time() - while time.time() - start_time < timeout: - if self.args.backend == "test": - break - time.sleep(0.1) - if all(be.poll_core_status() for be in bes_to_poll): - break + if self.args.backend != "test": + wait_until(lambda: all(be.poll_core_status() for be in bes_to_poll)) # When combo events are used, it takes a few cycles to # hit the breakpoint, so pc might have moved diff --git a/src/mldebug/utils.py b/src/mldebug/utils.py index 16eaa5d..92db172 100644 --- a/src/mldebug/utils.py +++ b/src/mldebug/utils.py @@ -248,6 +248,33 @@ def wrapper(*args, **kwargs): return wrapper +def wait_until(predicate, *, timeout=10.0, interval=0.1, on_timeout=None): + """ + Poll ``predicate`` until it returns truthy or ``timeout`` seconds elapse. + + Uses ``time.monotonic`` so it is immune to wall-clock jumps. 
+ + Args: + predicate (callable): Zero-arg callable returning truthy when done. + timeout (float): Max seconds to wait. + interval (float): Sleep between polls. + on_timeout (callable, optional): Called once if the timeout fires + (e.g. to log a diagnostic). + + Returns: + bool: True if ``predicate`` became truthy, False on timeout. + """ + start = time.monotonic() + while True: + time.sleep(interval) + if predicate(): + return True + if time.monotonic() - start > timeout: + if on_timeout is not None: + on_timeout() + return False + + def print_tile_grid(title, tiles, register_values=None, format_type="hex"): """ Prints a grid visualization of tile information and optional register values. From 7e93feee93c4a10ddf50aee0f5d26e8e310ba33d Mon Sep 17 00:00:00 2001 From: anurag Date: Wed, 27 May 2026 05:14:18 -0600 Subject: [PATCH 02/17] merge from PR Signed-off-by: anurag --- src/mldebug/aie_overlay.py | 155 ++++++++++++++++++++--------- src/mldebug/batch_runner.py | 182 ++++++++++++++++++++-------------- src/mldebug/client_debug.py | 6 +- src/mldebug/debug_state.py | 21 +++- src/mldebug/layer_info.py | 187 ++++++++++++++++++++++------------- src/mldebug/memory_dumper.py | 13 +-- src/mldebug/work_dir.py | 39 ++++++-- 7 files changed, 389 insertions(+), 214 deletions(-) diff --git a/src/mldebug/aie_overlay.py b/src/mldebug/aie_overlay.py index 138b711..78fcaef 100644 --- a/src/mldebug/aie_overlay.py +++ b/src/mldebug/aie_overlay.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. """ Manages overlays and stamps @@ -9,7 +9,16 @@ class Overlay: """ Abstraction for AIE Overlay. - NxCxR: Stamps/Batches x Cols x Rows. 
+ + Layout is BxSxCxR where: + B = number of batches (data-parallel copies of the design) + S = number of stamps per batch (spatial replicas inside one batch) + C = columns per stamp + R = rows per stamp + + Replicas are packed stamp-inner along columns: replica i = b*S + s occupies + columns [i*C, (i+1)*C). The flat replica id is what the rest of the system + refers to as "sid" (stamp id). """ def __init__(self, args, layout): @@ -17,60 +26,71 @@ def __init__(self, args, layout): Initialize the Overlay with layout and tile information. Args: - args: Argument object containing configuration options, including aie_iface and overlay string. - layout: Tuple representing (stamps, ncol, nrow) as default or externally supplied layout. + args: Argument object containing configuration options, including + aie_iface and overlay string. + layout: Tuple representing the layout from buffer_info. Either + (batches, stamps, nrow, ncol) (new 4-element form) or + (stamps, nrow, ncol) (legacy; treated as batches=1). """ self.aie_iface = args.aie_iface self.stamps = {} self.impls = {} self.layout = self._get_layout(args.overlay, layout) - # For larger devices, a 4x4 overlay can be repeated (stamped) - stamps, ncol, nrow = self.layout - for stamp_id in range(stamps): - tiles = [] - start_col = stamp_id * ncol - for col in range(start_col, start_col + ncol): - for row in range(nrow + self.aie_iface.AIE_TILE_ROW_OFFSET): - tiles.append((col, row)) - self.stamps[stamp_id] = tiles + batches, stamps_per_batch, ncol, nrow = self.layout + for b in range(batches): + for s in range(stamps_per_batch): + replica_id = b * stamps_per_batch + s + tiles = [] + start_col = replica_id * ncol + for col in range(start_col, start_col + ncol): + for row in range(nrow + self.aie_iface.AIE_TILE_ROW_OFFSET): + tiles.append((col, row)) + self.stamps[replica_id] = tiles def _get_layout(self, args_overlay, layout): """ - Determine the overlay layout parameters (stamps, columns, rows). 
+ Determine the overlay layout parameters as (batches, stamps, ncol, nrow). Args: - args_overlay (str): User-specified overlay string (e.g. '2x4x4'). - layout (tuple/list): Provided layout as (stamps, ncol, nrow). + args_overlay (str): User-specified overlay string (e.g. '2x4x4' or + '4x4'). Parsed as N x C x R; treated as batches=1, stamps=N. + layout (tuple/list): Layout supplied by LayerInfo. Either + (batches, stamps, nrow, ncol) (new 4-element form) or + (stamps, nrow, ncol) (legacy). Returns: - tuple: (stamps, ncol, nrow) representing number of stamps, columns, and rows. + tuple: (batches, stamps, ncol, nrow). """ - stamps, ncol, nrow = (1, 4, 4) + batches, stamps_per_batch, ncol, nrow = (1, 1, 4, 4) if args_overlay: - layout = [int(x) for x in args_overlay.split("x")] - if len(layout) == 3: - stamps, ncol, nrow = layout - elif len(layout) == 2: - ncol, nrow = layout + parsed = [int(x) for x in args_overlay.split("x")] + if len(parsed) == 3: + stamps_per_batch, ncol, nrow = parsed + elif len(parsed) == 2: + ncol, nrow = parsed else: print(f"[WARNING] Cannot parse overlay: {args_overlay}.") elif layout: - # Layout in buffer_info will be reversed - stamps, nrow, ncol = layout - print("[INFO] Using Layout: ", stamps, ncol, nrow) - return stamps, ncol, nrow + if len(layout) == 4: + # New form from buffer_info: [B, S, R, C] + batches, stamps_per_batch, nrow, ncol = layout + elif len(layout) == 3: + # Legacy form: [stamps, R, C]; batches encoded by caller into stamps + stamps_per_batch, nrow, ncol = layout + print("[INFO] Using Layout: ", batches, stamps_per_batch, ncol, nrow) + return batches, stamps_per_batch, ncol, nrow def get_first_relative_core_tile(self, stamp_id=0): """ - Get the (col, row) tuple for the first AIE core tile in the specified stamp, - adjusting row by the device-specific tile row offset. + Get the (col, row) tuple for the first AIE core tile in the specified + replica, adjusting row by the device-specific tile row offset. 
Args: - stamp_id (int, optional): Stamp index to query. Default is 0. + stamp_id (int, optional): Replica index to query. Default is 0. Returns: - tuple: (column, row) of the first core tile within the given stamp. + tuple: (column, row) of the first core tile within the given replica. """ t = self.get_tiles(self.aie_iface.AIE_TILE_T, stamp_id)[0] return t[0], t[1] - self.aie_iface.AIE_TILE_ROW_OFFSET @@ -80,12 +100,15 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False): Query tile locations for the overlay. Args: - tile_type (str, optional): Tile type identifier for filtering. If None, returns all tile positions. - stamp_id (int, optional): Stamp ID to filter tiles by. Defaults to 0. - raw (bool, optional): If True, return all tile positions for all stamps, unfiltered. + tile_type (str, optional): Tile type identifier for filtering. If None, + returns all tile positions. + stamp_id (int, optional): Replica id to filter tiles by. Defaults to 0. + raw (bool, optional): If True, return all tile positions for all + replicas, unfiltered. Returns: - list[tuple]: List of (column, row) tile coordinates corresponding to requested tiles. + list[tuple]: List of (column, row) tile coordinates corresponding to + requested tiles. """ tile_list = [] if raw: @@ -99,36 +122,76 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False): def get_stampids(self): """ - Get a list of all configured stamp IDs in the overlay. + Get a list of all configured replica ids in the overlay. Returns: - list[int]: List of integer stamp IDs available in the layout. + list[int]: List of integer replica ids (length = batches * stamps). """ return list(self.stamps.keys()) + def get_replica_count(self): + """ + Total number of replicas in the overlay (batches * stamps_per_batch). + """ + return self.layout[0] * self.layout[1] + def get_stampcount(self): """ - Return the number of stamps present in the overlay. 
+ Total number of replicas (alias for get_replica_count, kept for + backward compatibility with existing callers). + """ + return self.get_replica_count() - Returns: - int: The stamp count (N from NxCxR). + def get_batch_count(self): + """ + Number of batches (B from BxSxCxR). """ return self.layout[0] + def get_stamps_per_batch(self): + """ + Number of stamps within a single batch (S from BxSxCxR). + """ + return self.layout[1] + + def replica_to_batch(self, sid): + """ + Map a flat replica id to its batch index. + """ + return sid // self.layout[1] + + def replica_to_stamp(self, sid): + """ + Map a flat replica id to its per-batch stamp index. + """ + return sid % self.layout[1] + + def is_leftmost_in_batch(self, sid): + """ + True if this replica is the leftmost stamp of its batch (per-batch stamp + index == 0). The leftmost-in-batch replica is always scheduled at every + layer; the others may skip layers. + """ + return sid % self.layout[1] == 0 + def get_stampwidth(self): """ - Get the width (number of columns) for a single stamp within the overlay. + Get the width (number of columns) for a single stamp/replica. Returns: - int: The number of columns in the overlay (C from NxCxR). + int: The number of columns per replica (C from BxSxCxR). """ - return self.layout[1] + return self.layout[2] def get_repr(self): """ - Return the string representation of the overlay layout (e.g., '2x4x4'). + Return the string representation of the overlay layout (e.g. '2x1x4x4' + or '1x4x4' when only one batch). Returns: - str: Overlay configuration as 'N x C x R' string. + str: Overlay configuration as a 'B x S x C x R' (or 'S x C x R') string. 
""" - return "x".join([str(x) for x in self.layout]) + batches, stamps, ncol, nrow = self.layout + if batches == 1: + return f"{stamps}x{ncol}x{nrow}" + return f"{batches}x{stamps}x{ncol}x{nrow}" diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 3ee46a0..be88432 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -59,19 +59,29 @@ def common_init(self): """ Common initialization for batch and interactive modes. - Collapses to single-stamp mode if multistamp flag is not set, - enables PC halt for all stamps, and initializes skip-iteration support. + Collapses to a single replica (batch 0, stamp 0) when the `multistamp` + flag is not set, enables PC halt for all replicas, and initializes + skip-iteration support. """ if not self.args.run_flags.multistamp and self.design_info.overlay.get_stampcount() > 1: for layer in self.design_info.layers: layer.stamps[:] = layer.stamps[:1] + layer.stamps_per_batch = 1 + layer.num_batches = 1 for u in self.aie_utls[1:]: u.initialize_stamp() # In-place list modification so all holders of these references see the change del self.aie_utls[1:] del self.impls[1:] - self.design_info.overlay.layout = (1,) + self.design_info.overlay.layout[1:] + # Drop both batch and stamp dims to 1, keeping (C, R) intact. + _, _, ncol, nrow = self.design_info.overlay.layout + self.design_info.overlay.layout = (1, 1, ncol, nrow) self.design_info.overlay.stamps = {0: self.design_info.overlay.stamps[0]} + # Keep DebugState in sync with the collapsed view so per-batch helpers + # behave correctly (S=1 makes every replica leftmost-in-batch). + self.state.stamps_per_batch = 1 + self.design_info.num_batches = 1 + self.design_info.num_stamps = 1 LOGGER.log("[INFO] Using single stamp control. 
Please use multistamp flag for more data.") for sid in self.design_info.overlay.get_stampids(): @@ -117,7 +127,7 @@ def _set_layer_breakpoint(self, layer, skip_end_pc, sid, pm_reload_expected): start_pc_slot = 0 end_pc_slot = 1 - stamp = layer.stamps[sid] + stamp = layer.get_stamp(sid) start_pc = stamp.start_pc if not start_pc: print(f"Invalid configuration on stamp {sid} layer {layer.layer_order}.") @@ -135,10 +145,15 @@ def _set_layer_breakpoint(self, layer, skip_end_pc, sid, pm_reload_expected): def check_pm_reload(self, stamp_id=0): """ - Check if the next ELF will be loaded (PM Reload). + Check if the next ELF will be loaded (PM Reload) for the given replica. + + For each batch, the leftmost replica (per-batch stamp index 0) always + participates in every layer, so we look at `current_layer + 1` directly. + Other replicas may skip layers, so we walk forward to the next layer + they actually run via `get_next_layer_for_stamp`. Args: - stamp_id: Stamp index to check for reload (default 0). + stamp_id: Replica id to check for reload (default 0). Returns: True if program memory reload will occur at the next layer, False otherwise. 
@@ -147,14 +162,16 @@ def check_pm_reload(self, stamp_id=0): if not self.design_info.work_dir.pm_reload_en[stamp_id] or self.state.current_layer + 1 >= len(self.state.layers): return False - if stamp_id > 0 and not self.design_info.is_batched(): - next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1) - else: + if self.design_info.overlay.is_leftmost_in_batch(stamp_id): next_layer = self.state.layers[self.state.current_layer + 1] + else: + next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1) - if next_layer and stamp_id < len(layer.stamps) and stamp_id < len(next_layer.stamps): - return layer.stamps[stamp_id].elf_name != next_layer.stamps[stamp_id].elf_name - return False + if next_layer is None: + return False + cur_stamp = layer.get_stamp(stamp_id) + next_stamp = next_layer.get_stamp(stamp_id) + return cur_stamp.elf_name != next_stamp.elf_name def hit_next_breakpoint(self, sid=0): """ @@ -184,45 +201,38 @@ def schedule_layer_start(self, next_layer): Args: next_layer: Next Layer object to start. """ - stamp_target_layers = {0: next_layer} - - for sid in range(1, len(self.state.pm_reload)): - stamp_target_layers[sid] = self.state.get_next_layer_for_stamp(sid) + overlay = self.design_info.overlay + stamp_target_layers = {} + for sid in range(len(self.state.pm_reload)): + if overlay.is_leftmost_in_batch(sid): + # Leftmost replica of every batch always participates in next_layer. + stamp_target_layers[sid] = next_layer + else: + stamp_target_layers[sid] = self.state.get_next_layer_for_stamp(sid) for utl in self.aie_utls: utl.disable_ecc_event() bes_to_poll = [] bes_to_run = [] - # Stamp0 breakpoint always scheduled - # Stamp1+ breakpoint only scheduled at end of 2 stamps or at beginning - # - # NOTE ON "EARLY" PM-RELOAD ARMING: - # `target_layer` for stamp N may be a layer *later* than `next_layer` - # (the outer-loop layer currently being scheduled). 
This happens when a - # non-participating stamp skips one or more layers - `get_next_layer_for_stamp` - # walks forward to the next layer that actually contains this stamp. + stamps_to_run = [] + # Per-batch leftmost stamps (sid 0 within each batch) always have their + # breakpoint scheduled on next_layer. The remaining stamps may early-arm + # a breakpoint for a *future* layer they actually participate in. # - # When that future target layer uses a different ELF for this stamp, we - # must arm the start-PC breakpoint AND the combo event (via break_combo - # inside _set_layer_breakpoint) *before* the stamp is released with - # continue_aie below. If we defer arming until we reach the outer-loop - # iteration for the stamp's real target layer, the stamp would have - # already been released without a valid breakpoint (or without combo - # event coverage across the PM reload) and would either free-run past - # its target start PC or stall indefinitely at the end of its previous - # layer - blocking progress of the other stamps that depend on it. - # - # Consequence: the "PM RELOAD" log may appear while scheduling an outer - # layer that this stamp does not participate in. That is intentional - - # it marks when the breakpoint is *armed*, not when the reload - # physically occurs. The post-poll block below finalizes the combo - # event (enable_pc_halt + clear pm_reload[sid]) only once the outer - # loop actually reaches that stamp's target layer, guarded by - # `break_on_stamp_scheduled[sid]` so we do not re-arm on the way there. 
+ # Example for "EARLY" PM-RELOAD ARMING: + # Layer 0 stamp0 stamp1 stamp2 + # Layer 1 stamp0 stamp1 + # + # Layer 3 stamp0 stamp1 stamp2 + # Step to layer 0 : step to all 3 stamps + # Step to layer 1 : run stamp0,1 Arm Stamp2 via combo and continue it + # PM Reload message appears early for stamp 2 + # Step to layer 3 : step to all 3 stamps for sid, pml in enumerate(self.state.pm_reload): target_layer = stamp_target_layers.get(sid) - if not target_layer or (sid > 0 and self.state.break_on_stamp_scheduled[sid]): + is_leftmost = overlay.is_leftmost_in_batch(sid) + if not target_layer or (not is_leftmost and self.state.break_on_stamp_scheduled[sid]): continue self.state.break_on_stamp_scheduled[sid] = True if pml: @@ -232,12 +242,13 @@ def schedule_layer_start(self, next_layer): ) else: LOGGER.log(f"\nPM RELOAD on stamp: {sid}") - stamp = target_layer.stamps[sid] + stamp = target_layer.get_stamp(sid) skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc) self._set_layer_breakpoint(target_layer, skip_end_pc, sid, pml) bes_to_run.append(self.impls[sid]) if target_layer.layer_order == next_layer.layer_order: bes_to_poll.append(self.impls[sid]) + stamps_to_run.append((sid, pml, stamp)) # Run stamps at exact same time for be in bes_to_run: @@ -247,31 +258,31 @@ def schedule_layer_start(self, next_layer): if self.args.backend != "test": wait_until(lambda: all(be.poll_core_status() for be in bes_to_poll)) - # When combo events are used, it takes a few cycles to - # hit the breakpoint, so pc might have moved - for sid, pml in enumerate(self.state.pm_reload): - ta_layer = stamp_target_layers.get(sid) - if ta_layer is not None and next_layer.layer_order == ta_layer.layer_order: - stamp = next_layer.stamps[sid] - pcs = self.impls[sid].read_core_pc(True) - - # combo event trigger has one cycle delay - is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml) - - if is_correct_pc: - self._process_start_breakpoint(next_layer, 1, sid=sid) - 
else: - print(f"[ERROR] Step to start of Layer_{next_layer.layer_order} failed on Stamp_{sid}") - self._process_err() - if pml: - self.impls[sid].enable_pc_halt() - self.state.pm_reload[sid] = False - # Breakpoint has now been observed for this stamp; clear the - # "already scheduled" guard so the next outer-loop layer can - # arm it normally. For stamps whose target_layer is *not* yet - # this next_layer (early-armed for a future target), the flag - # stays True - preventing re-arm/continue while we walk past. - self.state.break_on_stamp_scheduled[sid] = False + # Now check that breakpoints were hit at the right PC for each stamp + # that actually targets next_layer. When combo events are used the PC + # may have moved by a few cycles past the start_pc. + for sid, pml, stamp in stamps_to_run: + pcs = self.impls[sid].read_core_pc(True) + utl = self.aie_utls[sid] + is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml) + + if is_correct_pc: + self._process_start_breakpoint(next_layer, 1, sid=sid) + else: + print(f"[ERROR] Step to start of Layer_{next_layer.layer_order} failed on Stamp_{sid}") + self._process_err() + if pml: + self.impls[sid].enable_pc_halt() + self.state.pm_reload[sid] = False + # Breakpoint has now been observed for this stamp; clear the + # "already scheduled" guard so the next outer-loop layer can arm + # it normally. For stamps whose target_layer is *not* yet this + # next_layer (early-armed for a future target), the flag stays + # True - preventing re-arm/continue while we walk past. + self.state.break_on_stamp_scheduled[sid] = False + + # Save for run_layer to consume. + self.state.stamps_to_run = stamps_to_run # ------------------------------------------------------------------ # # Core execution primitives (shared by batch and interactive) @@ -388,7 +399,7 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1): Returns: True on success, False on error. 
""" - stamp = layer.stamps[sid] + stamp = layer.get_stamp(sid) utl = self.aie_utls[sid] skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc) @@ -429,21 +440,40 @@ def run_layer(self, layer, target_itr=None, cur_it=None): target_itr: Target iteration (default None = last). cur_it: Initial iteration number (default None = 1). """ - n_stamp = len(layer.stamps) if not cur_it: cur_it = 1 + # stamps_to_run is set by schedule_layer_start. When run_layer is + # called from contexts that did not go through schedule_layer_start + # (e.g. interactive step within a layer), fall back to every replica + # that participates in the layer. + stamps_to_run = self.state.stamps_to_run + if not stamps_to_run: + stamps_to_run = [(sid, False, layer.get_stamp(sid)) for sid in range(len(self.impls)) + if sid < layer.num_batches * layer.stamps_per_batch] + n_stamp = max(len(stamps_to_run), 1) + with ThreadPoolExecutor(max_workers=n_stamp) as executor: - futures = [executor.submit(self._run_stamp, layer, sid, target_itr, cur_it) for sid in range(n_stamp)] + futures = [ + executor.submit(self._run_stamp, layer, sid, target_itr, cur_it) + for sid, _pml, _stamp in stamps_to_run + ] for f in as_completed(futures): res = f.result() if not res: self.state.error = True - # At final iteration of a multistamp layer, drain stamps that have no - # remaining future layer so they don't sit halted at their last breakpoint. - if n_stamp > 1 and (target_itr is None or target_itr == layer.lcp.num_iter): - for sid in range(1, n_stamp): + # At final iteration of a multistamp layer, drain replicas that have + # no remaining future layer so they don't sit halted at their last + # breakpoint. Leftmost-of-each-batch replicas always participate in + # the next layer (handled by schedule_layer_start), so only release + # the non-leftmost ones when they have no further layer to run. 
+ overlay = self.design_info.overlay + total_replicas = len(self.state.pm_reload) + if total_replicas > 1 and (target_itr is None or target_itr == layer.lcp.num_iter): + for sid in range(total_replicas): + if overlay.is_leftmost_in_batch(sid): + continue if not self.state.get_next_layer_for_stamp(sid, idx=1): self.impls[sid].continue_aie() @@ -469,7 +499,7 @@ def execute_and_dump(self): f" stamps: {len(layer.stamps)}, iters {layer.lcp.num_iter}") self.schedule_layer_start(layer) self.run_layer(layer) - for sid in range(len(layer.stamps)): + for sid in range(len(self.state.pm_reload)): self.state.pm_reload[sid] = self.check_pm_reload(sid) for sid in overlay.get_stampids(): diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py index cbee01b..8397e46 100644 --- a/src/mldebug/client_debug.py +++ b/src/mldebug/client_debug.py @@ -60,7 +60,11 @@ def __init__(self, args, ctx_id, pid, output_dir): try: self.design_info = LayerInfo(args) - self.state = DebugState(self.design_info.layers, self.design_info.overlay.get_stampcount()) + self.state = DebugState( + self.design_info.layers, + self.design_info.overlay.get_stampcount(), + stamps_per_batch=self.design_info.overlay.get_stamps_per_batch(), + ) except Exception as err: if debug_server: print("[INFO] closing debug server.") diff --git a/src/mldebug/debug_state.py b/src/mldebug/debug_state.py index 0a54125..1489baa 100644 --- a/src/mldebug/debug_state.py +++ b/src/mldebug/debug_state.py @@ -11,24 +11,31 @@ class DebugState: Keep Track of debug state """ - def __init__(self, layers, stampcount) -> None: + def __init__(self, layers, stampcount, stamps_per_batch=1) -> None: """ Initialize the DebugState object. Args: layers (list): The list of layer objects that define the AIE execution steps. - stampcount (int): The number of stamps. + stampcount (int): The total number of replicas (batches * stamps_per_batch). + stamps_per_batch (int): Number of stamps within a single batch (S from + the BxSxCxR overlay). 
Used by get_next_layer_for_stamp to convert a + flat replica id into a per-batch stamp index when checking layer + participation. """ self.current_layer = -1 self.cur_it = 1 self.ofm_ping = True self.layers = layers + self.stamps_per_batch = stamps_per_batch self.manual_breakpoints = [] # Run AIE to finish without invoking breakpoints self.continue_to_finish = False self.error = False self.break_on_stamp_scheduled = [False for _ in range(stampcount)] self.pm_reload = [False for _ in range(stampcount)] + # stamps to run in current layer; set at step to layer start + self.stamps_to_run = None def update_layer(self): """ @@ -45,11 +52,17 @@ def update_layer(self): def get_next_layer_for_stamp(self, stamp_id, idx=0): """ - Find the next layer that includes the specified stamp_id. + Find the next layer in which the given replica participates. + + A layer's per-batch stamp count (`stamps_per_batch`) may be smaller than + the overlay's S, meaning higher-indexed stamps (within a batch) skip + that layer. We map the flat replica id to its per-batch stamp index + `s = stamp_id % S` and require `s < layer.stamps_per_batch`. """ + s = stamp_id % self.stamps_per_batch for i in range(self.current_layer + idx, len(self.layers)): layer = self.layers[i] - if stamp_id < len(layer.stamps): + if s < getattr(layer, "stamps_per_batch", len(layer.stamps)): return layer return None diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 1c45cc7..5fe0338 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -228,16 +228,19 @@ class Layer: Contains all buffer, iteration, and kernel (stamp) mapping information. """ - def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_report): + def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_report, num_batches=1): """ Initialize a Layer object using given metadata, populating buffer and kernel/stamp lists. Args: - info (dict): Layer metadata. 
- size_shift (int): Size shift parameter. - version: Software version object. - aie_iface: AIE interface object. - num_stamps (int): Number of stamps in overlay. + info (dict): Layer entry in buffer_info + size_shift (int): Size shift parameter. + version: Software version object. + aie_iface: AIE interface object. + num_stamps (int): Number of stamps per batch (S from BxSxCxR overlay). + mladf_report: Optional MladfReport for templated-graph layers. + num_batches (int): Number of batches (B from BxSxCxR overlay). Each + batch is a data-parallel copy of the per-batch stamps; defaults to 1. """ self.flexml_ids = [] self.l3_ifm_buffers = [] @@ -250,7 +253,7 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor self.lcp = Lcp() self.pm_work_dir = info.get("pm", None) self.is_unsupported = False - self.is_concat = False + self.num_batches = num_batches self.lcp.is_tg = "templated_graph" in info kname = [i.lower() for i in info["kernel_name"]][0] @@ -262,27 +265,36 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor n_stamps = info.get("no_of_stamps") if n_stamps and n_stamps < num_stamps: num_stamps = n_stamps + + # Per-batch stamp count for this layer. Batches share the same stamp + # metadata; the per-batch stamp list is mirrored across batches at call + # sites via get_stamp / get_stamps_for_all_batches. + self.stamps_per_batch = num_stamps self.stamps = [Stamp(name=kname) for _ in range(num_stamps)] + # 1. Layers without any kernel should be skipped + # 2. 
Unsupported superkernel should be skipped + if info.get("is_concat") or not kname or any(k in kname for k in unsupported_superkernels): + LOGGER.verbose_print(f"[WARNING] unsupported kernel {kname} at Layer {self.layer_order} will be skipped.") + self.is_unsupported = True + return + + # Fill missing TG metadata from mladf report if self.lcp.is_tg: - for sid, stamp in enumerate(self.stamps): - stamp.name = mladf_report.get_skname_for_bilo(self.layer_order, sid) - stamp.elf_name = mladf_report.get_elfid_for_bilo(self.layer_order, sid) - if not stamp.name or stamp.elf_name == -1 or any(k in stamp.name for k in unsupported_superkernels): - LOGGER.verbose_print(f"[WARNING] unsupported kernel {stamp.name} at Layer {self.layer_order} will be skipped.") + for s, stamp in enumerate(self.stamps): + sk_name = mladf_report.get_skname_for_bilo(self.layer_order, s) + elf_name = mladf_report.get_elfid_for_bilo(self.layer_order, s) + if not sk_name or elf_name == -1 or any(k in sk_name for k in unsupported_superkernels): + LOGGER.verbose_print(f"[WARNING] unsupported kernel {sk_name} at Layer {self.layer_order} will be skipped.") self.is_unsupported = True return + stamp.name = sk_name + stamp.elf_name = elf_name self.lcp.num_iter = mladf_report._get_iters_for_bilo(self.layer_order) self._initialize_l3_buffers(info, version) - # 1. Layers without any kernel should be skipped - # 2. 
Unsupported superkernel should be skipped - self.is_concat = info.get("is_concat") or not kname - if self.is_concat: - LOGGER.verbose_print(f"[WARNING] unsupported kernel {kname} at Layer {self.layer_order} will be skipped.") - self.is_unsupported = True - return + # No L2 support for templated graph layers if self.lcp.is_tg: return @@ -291,6 +303,23 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor self._initialize_iters(info, version) LOGGER.verbose_print(f"{self.layer_order}: {kname} {self.lcp.num_iter}") + def get_stamp(self, sid): + """ + Return the per-batch stamp metadata for a flat replica id. + + Batches are data-parallel copies that share the same kernel/PCs, so the + canonical per-batch stamp list (length `stamps_per_batch`) is indexed by + `sid % stamps_per_batch`. + """ + return self.stamps[int(sid % self.stamps_per_batch)] + + def get_stamps_for_all_batches(self): + """ + Return a list of stamps expanded across all batches (length B * S). Used + by callers that want to iterate over flat replica ids. + """ + return self.stamps * self.num_batches + def _initialize_flexml_ids(self, info): """ Populate self.flexml_ids for this layer from its metadata. @@ -457,12 +486,15 @@ def __init__(self, args): args: Namespace of configuration and input files (from argparser or similar). """ self.layers = [] - self.layout = [1, 4, 4] + # Layout for Overlay: (batches, stamps_per_batch, nrow, ncol). Updated by + # _read_buffer_info when a buffer_info.json is supplied. + self.layout = [1, 1, 4, 4] self.aie_iface = args.aie_iface self.x2 = False self.x2_work_dirs = {} self.layer_workdir_map = {} - self.device_batch_size = 1 + self.num_batches = 1 + self.num_stamps = 1 self.mladf_report = None has_bi = args.buffer_info and Path(args.buffer_info).is_file() @@ -473,14 +505,16 @@ def __init__(self, args): data = self._read_buffer_info(args.buffer_info) # 2. 
Initialize Overlay from Layout self.overlay = Overlay(args, self.layout) + # Re-sync local view in case Overlay applied -o overrides. + self.num_batches = self.overlay.get_batch_count() + self.num_stamps = self.overlay.get_stamps_per_batch() # 3. Parse mladf report. # TBD: memory optimize this as this json can be large if not args.aie_only and has_bi and use_mladf: self.mladf_report = MladfReport(args.buffer_info, args.mladf_report, self.overlay.get_stampwidth()) # 4. Initialize Layers if not args.aie_only: - num_stamps = len(self.overlay.get_stampids()) - self._init_layers(data, args.aie_iface, num_stamps) + self._init_layers(data, args.aie_iface, self.num_stamps, self.num_batches) # 5: Parse work dir if self.x2: for layer in self.layers: @@ -528,12 +562,12 @@ def print_aie_functions(self, elf_id=None): def is_stamped(self): """ - Check if design is a multi-stamp (multi-superkernel) program. + Check if the design has more than one stamp per batch. Returns: - bool: True if stamped/multi-stamp, False otherwise. + bool: True if stamps_per_batch > 1, False otherwise. """ - return len(self.overlay.get_stampids()) > 1 + return self.num_stamps > 1 def is_batched(self): """ @@ -542,7 +576,7 @@ def is_batched(self): Returns: bool: True if more than one batch, False otherwise. """ - return self.device_batch_size > 1 + return self.num_batches > 1 def _create_info(self): """ @@ -556,7 +590,7 @@ def _create_info(self): info[n] = {} for layer in self.layers: order = layer.layer_order - for sid, stamp in enumerate(layer.stamps): + for sid, stamp in enumerate(layer.get_stamps_for_all_batches()): imap = info[sid] elf = stamp.elf_name if not elf: @@ -649,13 +683,15 @@ def initialize_l3_layer_mapping(self, flexmlrt_hsi, external_buffer_id): for layer in self.layers: layer.l3_buffers = layer.l3_ofm_buffers if self.x2 else layer.l3_ifm_buffers - # Duplicate L3 buffers for multi-stamp designs (batched designs) + # Duplicate L3 buffers per additional batch (data-parallel copies). 
+ # Stamps within a batch share the same L3 IFM/OFM region, so we only + # replicate across batches, not across per-batch stamps. if self.is_batched(): original_buffers = list(layer.l3_buffers) - for stamp_idx in range(1, self.device_batch_size): + for b in range(1, self.num_batches): for orig_buffer in original_buffers: stamped_buffer = L3Buffer( - name=f"{orig_buffer.name}_stamp_{stamp_idx}", + name=f"{orig_buffer.name}_stamp_{b}", tensor_name=orig_buffer.tensor_name, size=orig_buffer.size, offset=None @@ -733,7 +769,15 @@ def _overlap(self, buf1, buf2): def _read_buffer_info(self, buffer_info_file): """ - Load and parse the buffer_info JSON, extracting layout and batch size. + Load and parse the buffer_info JSON, extracting the (B, S, R, C) layout. + + buffer_info encodes the 4D overlay shape across two fields: + .meta.layout -> [stamps_in_overlay, R, C] + .meta.device_batch_size -> B (number of data-parallel batch copies) + .meta.max_stamps_used -> S (per-batch stamps actually used; may be + smaller than stamps_in_overlay). Falls + back to max(no_of_stamps) across layers, + then to the layout's stamp count. Args: buffer_info_file (str): Path to buffer_info JSON. @@ -741,43 +785,45 @@ def _read_buffer_info(self, buffer_info_file): Returns: dict: Parsed JSON object from file. Side Effects: - - Sets self.layout, self.device_batch_size, self.x2. + - Sets self.layout to (B, S, R, C), self.num_batches, self.num_stamps, + self.x2. 
""" print("Initializing Buffer Info ...") with open(buffer_info_file, encoding="utf-8") as fd: data = json.load(fd) - self.layout = data[".meta"].get("layout") - self.device_batch_size = data[".meta"].get("device_batch_size", 1) - - # Layout now represents Full overlay but design can choose - # to use only a part of it - stampcount = data[".meta"].get("max_stamps_used") - if stampcount: - self.layout[0] = stampcount - elif data.get("layers"): - self.layout[0] = max(lyr.get("no_of_stamps", 1) for _, lyr in data["layers"].items() ) - # Else use old style - - # Treat mBnS as 1BnS - if self.device_batch_size > 1: - if self.layout[0] > 1: - LOGGER.log("[WARNING] Currently mBatch x nStamp is unsupported. Setting batchcount to 1.") - self.device_batch_size = 1 + + raw_layout = data[".meta"].get("layout") or [1, 4, 4] + overlay_stamps, nrow, ncol = raw_layout + + # B (batches) comes from device_batch_size. + batches = data[".meta"].get("device_batch_size", 1) + + # S (per-batch stamps used) comes from max_stamps_used, with sensible + # fallbacks: layer hints, then the overlay's nominal stamp count. + stamps = data[".meta"].get("max_stamps_used") + if not stamps: + if data.get("layers"): + stamps = max(lyr.get("no_of_stamps", 1) for _, lyr in data["layers"].items()) else: - self.layout[0] = self.device_batch_size - LOGGER.log("Batched design detected") + stamps = overlay_stamps + self.num_batches = batches + self.num_stamps = stamps + self.layout = (batches, stamps, nrow, ncol) + if batches > 1: + LOGGER.log("Batched design detected") self.x2 = data[".meta"].get("flow") == "x2" return data - def _init_layers(self, raw_info, aie_iface, num_stamps): + def _init_layers(self, raw_info, aie_iface, num_stamps, num_batches=1): """ Parse all layer entries from metadata and populate self.layers. Args: raw_info (dict): Parsed buffer_info JSON metadata. aie_iface: AIE interface object. - num_stamps (int): Number of stamps identified from overlay. 
+ num_stamps (int): Stamps per batch (S from BxSxCxR). + num_batches (int): Number of batches (B from BxSxCxR). """ version = Version.from_string(raw_info[".meta"]["version"]) size_shift = raw_info[".meta"].get("size_shift") @@ -794,7 +840,8 @@ def _init_layers(self, raw_info, aie_iface, num_stamps): raw_layers = sorted(raw_layers.items(), key=lambda item: item[1]["layer_order"]) for entry in raw_layers: info = entry[1] - layer = Layer(info, size_shift, version, aie_iface, num_stamps, self.mladf_report) + layer = Layer(info, size_shift, version, aie_iface, num_stamps, self.mladf_report, + num_batches=num_batches) self.layers.append(layer) def _initialize_layers_from_workdir_x2(self, args): @@ -816,13 +863,15 @@ def _initialize_layers_from_workdir_x2(self, args): self.layers = [layer for layer in self.layers if not layer.lcp.is_tg] if not self.layers: raise RuntimeError("No layers found in the design.") - for sid in self.overlay.get_stampids(): + # Resolve PCs once per per-batch stamp index. Batches share the same + # ELF/PCs so no mirroring is needed on the Layer's stamps list. 
+ for s in range(self.num_stamps): for layer in self.layers: - flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[sid].values())[0] - self.layer_workdir_map[layer.layer_order].pm_reload_en[sid] = True + flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[s].values())[0] + self.layer_workdir_map[layer.layer_order].pm_reload_en[s] = True for f in flist: - if _strip_template(layer.stamps[sid].name.lower()) == _strip_template(f.name.lower()): - stamp = layer.stamps[sid] + if _strip_template(layer.stamps[s].name.lower()) == _strip_template(f.name.lower()): + stamp = layer.stamps[s] LOGGER.verbose_print("Layer found:", layer.layer_order, stamp.name, f.start_pc) stamp.elf_name = layer.pm_work_dir stamp.start_pc = f.start_pc @@ -857,16 +906,18 @@ def _initialize_layers_from_workdir(self, args): # Hierarchy of Data: # Stamp <- Elf <- Layers # AIECompiler only knows flexmlIDs so we use that to match with correct layer - for sid in self.overlay.get_stampids(): - has_pm_reload = self.work_dir.pm_reload_en[sid] - for elf_name, flist in self.work_dir.aie_functions[sid].items(): - LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}") + # Resolve PCs once per per-batch stamp index (s). Batch copies share + # the same ELFs and PCs so no extra resolution is needed. 
+ for s in range(self.num_stamps): + has_pm_reload = self.work_dir.pm_reload_en[s] + for elf_name, flist in self.work_dir.aie_functions[s].items(): + LOGGER.verbose_print(f"Initializing layers for stamp {s} ELF: {elf_name}") elf_id = elf_name.split("reloadable")[-1] for f, l in itertools.product(flist, self.layers): - if sid > len(l.stamps) - 1: + if s > len(l.stamps) - 1: continue - if _strip_template(l.stamps[sid].name.lower()) == _strip_template(f.name.lower()): - stamp = l.stamps[sid] + if _strip_template(l.stamps[s].name.lower()) == _strip_template(f.name.lower()): + stamp = l.stamps[s] if l.lcp.is_tg and stamp.elf_name == elf_id: stamp.start_pc = f.start_pc if f.name.lower() not in skip_end_pc_kernels: @@ -874,7 +925,7 @@ def _initialize_layers_from_workdir(self, args): continue # Check if this layer is present in the elf # In buffer_info the flexml_ids might not be in order of stamps - if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[sid][elf_id] for i in l.flexml_ids): + if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[s][elf_id] for i in l.flexml_ids): continue LOGGER.verbose_print("Layer found:", l.layer_order, stamp.name) stamp.elf_name = elf_id diff --git a/src/mldebug/memory_dumper.py b/src/mldebug/memory_dumper.py index c2da679..10ad0c0 100644 --- a/src/mldebug/memory_dumper.py +++ b/src/mldebug/memory_dumper.py @@ -119,13 +119,8 @@ def dump_memory_l2(self, buffers, it, layer_order=None, use_l2_names=False, sid= return overlay = self.design_info.overlay - # batch + stamp combination doesn't exist - if self.design_info.is_batched(): - batch = str(sid) - suffix = "stamp0" - else: - batch = "0" - suffix = f"stamp{sid}" + batch = str(overlay.replica_to_batch(sid)) + suffix = f"stamp{overlay.replica_to_stamp(sid)}" for buffer in buffers: if buffer.ofm: @@ -161,9 +156,7 @@ def dump_memory_l1(self, buffers, it, is_ping=None, sid=0): if self.args.run_flags.skip_dump or self.args.run_flags.l2_dump_only: return - batch = "0" - if 
self.design_info.is_batched(): - batch = str(sid) + batch = str(self.design_info.overlay.replica_to_batch(sid)) for buffer in buffers: if not buffer.l1: diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 0f9bb3e..e44544f 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -264,6 +264,11 @@ def _initialize_functions(self, work_dir, overlay): Parse work directory and its ELF files to extract function ranges, tail calls, global variables, and layer/partition info. + For batched + stamped designs we only parse one batch's worth of stamps + (S replicas) and mirror the parsed data across the remaining batch copies. + The same ELF binaries are loaded into the additional batch columns, so + the PCs and global addresses are identical. + Args: work_dir (str): Path to the AIE work directory. overlay: Overlay object for tile mapping. @@ -275,36 +280,52 @@ def _initialize_functions(self, work_dir, overlay): if not Path.exists(full_path): LOGGER.log(f"[INFO] Work directory {full_path} does not exist.") return - for stampid in overlay.get_stampids(): - col, row = overlay.get_first_relative_core_tile(stampid) + stamps_per_batch = overlay.get_stamps_per_batch() + batches = overlay.get_batch_count() + # Parse per-batch stamps once, then mirror across batches below. 
+ for s in range(stamps_per_batch): + col, row = overlay.get_first_relative_core_tile(s) core_name = f"{col}_{row}" print(f"Core: {core_name}") plist = [] for elf in full_path.glob(f"{core_name}*"): plist.append(elf) if len(plist) > 1: - self.pm_reload_en[stampid] = True - self._parse_aie_runtime_control(work_dir, col, row, stampid) - self.aie_functions[stampid] = {} + self.pm_reload_en[s] = True + self._parse_aie_runtime_control(work_dir, col, row, s) + self.aie_functions[s] = {} # Parse LST for p in plist: LOGGER.verbose_print(f"[INFO] Process: {p}") if not self.peano: - success = self._parse_lst_chess(p, stampid) + success = self._parse_lst_chess(p, s) if not success: print(f"[WARNING] Failed to parse LST for {p}. Assuming peano compiler.") self.peano = True if self.peano: - self._parse_lst_llvm(p, stampid) + self._parse_lst_llvm(p, s) # Parse map file to find LCP # Only base map file has global variables first_elf = full_path / core_name if self.peano: - self._extract_globals_llvm(first_elf, stampid) + self._extract_globals_llvm(first_elf, s) else: - self._extract_globals_chess(first_elf, stampid) + self._extract_globals_chess(first_elf, s) + + # Mirror per-batch data into batch 1..B-1 replica slots so all callers + # that index by flat replica id (b*S + s) see consistent data. 
+ for b in range(1, batches): + for s in range(stamps_per_batch): + sid = b * stamps_per_batch + s + if sid >= len(self.pm_reload_en): + break + self.pm_reload_en[sid] = self.pm_reload_en[s] + self.aie_functions[sid] = self.aie_functions[s] + self.elf_flxmlid_maps[sid] = self.elf_flxmlid_maps[s] + self.globals[sid] = self.globals[s] + self._stamp_lst_map[sid] = self._stamp_lst_map[s] def _parse_lst_chess(self, elf, stampid): """ From e8f84d27a3bf90d9b3988d203003bb763cbbe33e Mon Sep 17 00:00:00 2001 From: anurag Date: Wed, 27 May 2026 05:24:00 -0600 Subject: [PATCH 03/17] update claude.md Signed-off-by: anurag --- CLAUDE.md | 119 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 38 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 601c11d..1d1b0eb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -51,19 +51,37 @@ secondary counters (`depth_iter`, `buffer_iter`, `super_iter`, L2->L3 OFM spills, and weight reloads happen. Together these form the "layer control parameters" (`Lcp`). -**Stamp.** A spatial replica of a small AIE region. A 4x4 design -"stamped" twice (`-o 2x4x4`) runs the same kernels in parallel on two -side-by-side 4x4 regions. Each stamp gets its own backend connection and -its own `AIEUtil` helper. The debugger schedules and breakpoints them -independently. - -**Batch.** Conceptually the same as stamping but used for data-parallel -inference; multiple input samples processed in parallel by replicated -hardware. Detected from `device_batch_size` in `buffer_info.json`. - -**Overlay.** The shape of the AIE region in use, written `NxCxR` -(stamps x columns x rows). Default `1x4x4`. Each stamp i occupies -columns `[i*C, (i+1)*C)`. +**Stamp.** A spatial replica of a small AIE region within one batch. +Different stamps in a batch may run *different* kernels (and may skip +layers entirely - higher-indexed stamps participate in a subset of +layers). Each replica gets its own backend connection and its own +`AIEUtil` helper. 
+ +**Batch.** A data-parallel copy of the whole per-batch stamp set. +Batches run the *same* kernels on different input samples, so they +share PCs/ELFs/buffers and the debugger only resolves metadata once +per per-batch stamp and mirrors it across batches. Taken from +`device_batch_size` in `buffer_info.json`. + +**Overlay (4D).** The shape of the AIE region in use is `BxSxCxR`: +- `B` = number of batches (data-parallel copies) +- `S` = stamps per batch (spatial replicas inside one batch) +- `C` = columns per stamp +- `R` = rows per stamp + +Replicas are packed stamp-inner along columns: the flat replica id +`sid = b*S + s` occupies columns `[sid*C, (sid+1)*C)`. The rest of +the code refers to that flat id as "stamp id" / "sid". `Overlay` +helpers: `get_batch_count()`, `get_stamps_per_batch()`, +`get_stampcount()` (= total replicas = B*S), `replica_to_batch(sid)`, +`replica_to_stamp(sid)`, `is_leftmost_in_batch(sid)` (true when +`sid % S == 0`; these replicas are the per-batch stamp 0 and always +participate in every layer). + +`buffer_info.json` stores this as `.meta.layout = [stamps, R, C]`, +`.meta.device_batch_size = B`, `.meta.max_stamps_used = S` (with +fallback to `max(no_of_stamps)` across layers). The `-o` CLI override +parses `SxCxR` (or `CxR`) and forces `B=1`. **PM reload.** The AIE has limited program memory; large designs split their code across multiple ELFs and reload program memory between layer @@ -138,8 +156,9 @@ Common runtime flags (`-f`): and the file format. - `skip_iter` -- use a perf-counter trick to fast-forward iterations instead of polling per iteration. -- `multistamp` -- actually drive every stamp; default is to collapse to - stamp 0 for sanity. +- `multistamp` -- actually drive every replica (all batches and all + per-batch stamps); default is to collapse to a single replica + (B=S=1) for sanity. ### Testing @@ -235,20 +254,26 @@ This is where the real work happens. 
The two methods to read first are - `common_init()` runs once before any layer. If the user did not pass the `multistamp` flag, the runner collapses the design to a single - stamp here -- it edits the layer/overlay/impls lists in place so the - rest of the system simply sees a 1-stamp design. This is the safest - default because multi-stamp scheduling is intricate. + replica (`B=S=1`) here -- it edits the layer/overlay/impls lists in + place so the rest of the system simply sees a 1-replica design. + This is the safest default because multi-stamp scheduling is + intricate. - `schedule_layer_start()` arms the start (and optionally end) PC - breakpoint on every stamp. When PM reload is expected it also - installs a combo-event that survives the reload, *and* it may arm a - future stamp's breakpoint *early* -- before the outer loop reaches - the layer that stamp actually runs. This is necessary because if a - stamp does not participate in the current layer, releasing it - without a valid breakpoint would let it free-run past its real - target. The "PM RELOAD on stamp X" log line is when arming happens, - not when the reload physically occurs. -- `run_layer()` runs one layer to completion across all stamps using a - thread pool, one worker per stamp. + breakpoint on every stamp. Per-batch leftmost replicas (where + `sid % S == 0`) always participate in `next_layer`; other replicas + may need their breakpoint armed for a *future* layer they actually + run. When PM reload is expected it also installs a combo-event that + survives the reload, *and* it may arm a future stamp's breakpoint + *early* -- before the outer loop reaches the layer that stamp + actually runs. This is necessary because if a stamp does not + participate in the current layer, releasing it without a valid + breakpoint would let it free-run past its real target. The "PM + RELOAD on stamp X" log line is when arming happens, not when the + reload physically occurs. 
The list of replicas that actually + breakpoint on `next_layer` is stashed in `state.stamps_to_run` for + `run_layer` to consume. +- `run_layer()` runs one layer to completion across the replicas in + `state.stamps_to_run` using a thread pool, one worker per replica. - Inside a layer the runner alternates: continue, poll for breakpoint, identify whether we hit start or end PC, dump the appropriate buffers, increment the iteration counter, repeat. @@ -277,17 +302,26 @@ through. A small holder for "where are we now": current layer index, current iteration, ping/pong toggle for OFM dumps, list of pending manual -breakpoints, and per-stamp PM-reload flags. `update_layer()` is the -generator the runner iterates to advance through the design. +breakpoints, per-replica PM-reload flags, and `stamps_per_batch` (S) +so `get_next_layer_for_stamp(sid)` can map a flat replica id to its +per-batch stamp index and gate against each layer's +`stamps_per_batch`. `stamps_to_run` is set by `schedule_layer_start` +and consumed by `run_layer`. `update_layer()` is the generator the +runner iterates to advance through the design. ### `layer_info.py` -- buffer / layer metadata Parses `buffer_info.json` and produces the `Layer` and `Buffer` objects the rest of the system uses. -- A `Layer` knows its kernels (one `Stamp` per AIE replica), its - input/output/weight buffers, its L3 buffers, and its iteration - counts (`Lcp`). +- A `Layer` knows its kernels, its input/output/weight buffers, its + L3 buffers, and its iteration counts (`Lcp`). `layer.stamps` is the + *per-batch* stamp list (length `layer.stamps_per_batch`, which may + be less than the overlay's S - higher-indexed stamps skip this + layer). Batches share kernel/PC metadata, so callers translate a + flat replica id with `layer.get_stamp(sid)` (returns + `stamps[sid % stamps_per_batch]`) or + `layer.get_stamps_for_all_batches()` for the expanded `B*S` view. - A `Buffer` is the user-level concept (one IFM, one OFM, one weight set). 
Internally it holds an `L1Buffer` (ping/pong) and a list of `L2Buffer` chunks. Buffers larger than the memory-tile size are @@ -321,6 +355,10 @@ preceding lines and skips those. Owns the on-disk layout and the actual reads. Files land under `/batch/layer_//_/`. +The batch/stamp coordinates come from +`overlay.replica_to_batch(sid)` and `overlay.replica_to_stamp(sid)`, +so an `sid` always maps to one `batch` directory and one +`stamp` suffix on the L2 dump filename. Binary format: an 8-byte little-endian length header followed by the raw 32-bit words. With `text_dump` the data is written as ASCII hex @@ -374,10 +412,15 @@ Two non-obvious pieces: ### `aie_overlay.py` -- overlay geometry -Parses `NxCxR` from `-o` (or the layout in `buffer_info.json`, which -is stored as `[stamps, nrow, ncol]` rather than `NxCxR`) and builds -the list of `(col, row)` tiles per stamp. Methods like `get_tiles` -filter that list by tile type, with the AIE row offset already added. +Holds the 4D `(B, S, C, R)` layout and builds the list of `(col, row)` +tiles for each flat replica id `sid = b*S + s`. Layout is supplied +either from `buffer_info.json` (`[stamps, R, C]` plus +`device_batch_size` and `max_stamps_used`) or parsed from the `-o` +CLI flag (`SxCxR` or `CxR`, forces `B=1`). `get_tiles` filters by +tile type with the AIE row offset already added. Methods used by the +rest of the system: `get_stampcount` (total replicas, B*S), +`get_batch_count`, `get_stamps_per_batch`, `get_stampwidth` (C), +`replica_to_batch`, `replica_to_stamp`, `is_leftmost_in_batch`. ### `arch/` -- per-device definitions @@ -536,7 +579,7 @@ Before implementing: - No "flexibility" or "configurability" that wasn't requested. - No error handling for impossible scenarios. - If you write 200 lines and it could be 50, rewrite it. -- Try to keep docstrings short to medium length. +- Keep docstrings short - ideally 3-4 lines. Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. 
From 5bb7abcdfa4ebfe95a47672f44f7cf067b3d0a07 Mon Sep 17 00:00:00 2001 From: anurag Date: Thu, 28 May 2026 15:55:44 -0600 Subject: [PATCH 04/17] single stamp fix Signed-off-by: anurag --- src/mldebug/aie_overlay.py | 9 ++++++-- src/mldebug/batch_runner.py | 41 ++++++++++++------------------------- src/mldebug/debug_state.py | 2 +- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/src/mldebug/aie_overlay.py b/src/mldebug/aie_overlay.py index 78fcaef..95f98eb 100644 --- a/src/mldebug/aie_overlay.py +++ b/src/mldebug/aie_overlay.py @@ -35,7 +35,7 @@ def __init__(self, args, layout): self.aie_iface = args.aie_iface self.stamps = {} self.impls = {} - self.layout = self._get_layout(args.overlay, layout) + self.layout = self._get_layout(args.overlay, layout, args.run_flags.multistamp) batches, stamps_per_batch, ncol, nrow = self.layout for b in range(batches): @@ -48,7 +48,7 @@ def __init__(self, args, layout): tiles.append((col, row)) self.stamps[replica_id] = tiles - def _get_layout(self, args_overlay, layout): + def _get_layout(self, args_overlay, layout, is_multistamp): """ Determine the overlay layout parameters as (batches, stamps, ncol, nrow). 
@@ -78,7 +78,12 @@ def _get_layout(self, args_overlay, layout): elif len(layout) == 3: # Legacy form: [stamps, R, C]; batches encoded by caller into stamps stamps_per_batch, nrow, ncol = layout + + if not is_multistamp: + batches, stamps_per_batch = (1,1) + print("[INFO] Using Layout: ", batches, stamps_per_batch, ncol, nrow) + return batches, stamps_per_batch, ncol, nrow def get_first_relative_core_tile(self, stamp_id=0): diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index be88432..dd9e8a7 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -215,7 +215,7 @@ def schedule_layer_start(self, next_layer): bes_to_poll = [] bes_to_run = [] - stamps_to_run = [] + active_stamps_all_batches = [] # Per-batch leftmost stamps (sid 0 within each batch) always have their # breakpoint scheduled on next_layer. The remaining stamps may early-arm # a breakpoint for a *future* layer they actually participate in. @@ -248,7 +248,7 @@ def schedule_layer_start(self, next_layer): bes_to_run.append(self.impls[sid]) if target_layer.layer_order == next_layer.layer_order: bes_to_poll.append(self.impls[sid]) - stamps_to_run.append((sid, pml, stamp)) + active_stamps_all_batches.append((sid, pml, stamp)) # Run stamps at exact same time for be in bes_to_run: @@ -261,7 +261,7 @@ def schedule_layer_start(self, next_layer): # Now check that breakpoints were hit at the right PC for each stamp # that actually targets next_layer. When combo events are used the PC # may have moved by a few cycles past the start_pc. 
- for sid, pml, stamp in stamps_to_run: + for sid, pml, stamp in active_stamps_all_batches: pcs = self.impls[sid].read_core_pc(True) utl = self.aie_utls[sid] is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml) @@ -274,15 +274,11 @@ def schedule_layer_start(self, next_layer): if pml: self.impls[sid].enable_pc_halt() self.state.pm_reload[sid] = False - # Breakpoint has now been observed for this stamp; clear the - # "already scheduled" guard so the next outer-loop layer can arm - # it normally. For stamps whose target_layer is *not* yet this - # next_layer (early-armed for a future target), the flag stays - # True - preventing re-arm/continue while we walk past. + # Breakpoint has now been observed for this stamp; self.state.break_on_stamp_scheduled[sid] = False # Save for run_layer to consume. - self.state.stamps_to_run = stamps_to_run + self.state.active_stamps_all_batches = active_stamps_all_batches # ------------------------------------------------------------------ # # Core execution primitives (shared by batch and interactive) @@ -397,7 +393,7 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1): cur_it: Starting iteration number (default 1). Returns: - True on success, False on error. + Success or error. """ stamp = layer.get_stamp(sid) utl = self.aie_utls[sid] @@ -443,31 +439,20 @@ def run_layer(self, layer, target_itr=None, cur_it=None): if not cur_it: cur_it = 1 - # stamps_to_run is set by schedule_layer_start. When run_layer is - # called from contexts that did not go through schedule_layer_start - # (e.g. interactive step within a layer), fall back to every replica - # that participates in the layer. 
- stamps_to_run = self.state.stamps_to_run - if not stamps_to_run: - stamps_to_run = [(sid, False, layer.get_stamp(sid)) for sid in range(len(self.impls)) - if sid < layer.num_batches * layer.stamps_per_batch] - n_stamp = max(len(stamps_to_run), 1) - - with ThreadPoolExecutor(max_workers=n_stamp) as executor: + # active_stamps_all_batches is determined by schedule_layer_start + stamps = self.state.active_stamps_all_batches + + with ThreadPoolExecutor(max_workers=len(stamps)) as executor: futures = [ executor.submit(self._run_stamp, layer, sid, target_itr, cur_it) - for sid, _pml, _stamp in stamps_to_run + for sid, _pml, _stamp in stamps ] for f in as_completed(futures): res = f.result() if not res: self.state.error = True - # At final iteration of a multistamp layer, drain replicas that have - # no remaining future layer so they don't sit halted at their last - # breakpoint. Leftmost-of-each-batch replicas always participate in - # the next layer (handled by schedule_layer_start), so only release - # the non-leftmost ones when they have no further layer to run. 
+ # Unhalt non-leftmost replicas that have no remaining future layer overlay = self.design_info.overlay total_replicas = len(self.state.pm_reload) if total_replicas > 1 and (target_itr is None or target_itr == layer.lcp.num_iter): @@ -499,7 +484,7 @@ def execute_and_dump(self): f" stamps: {len(layer.stamps)}, iters {layer.lcp.num_iter}") self.schedule_layer_start(layer) self.run_layer(layer) - for sid in range(len(self.state.pm_reload)): + for sid, _ in enumerate(self.state.pm_reload): self.state.pm_reload[sid] = self.check_pm_reload(sid) for sid in overlay.get_stampids(): diff --git a/src/mldebug/debug_state.py b/src/mldebug/debug_state.py index 1489baa..decaed1 100644 --- a/src/mldebug/debug_state.py +++ b/src/mldebug/debug_state.py @@ -35,7 +35,7 @@ def __init__(self, layers, stampcount, stamps_per_batch=1) -> None: self.break_on_stamp_scheduled = [False for _ in range(stampcount)] self.pm_reload = [False for _ in range(stampcount)] # stamps to run in current layer; set at step to layer start - self.stamps_to_run = None + self.active_stamps_all_batches = None def update_layer(self): """ From e2b2f9f48e269b45cf1599f19461ba080ced52f4 Mon Sep 17 00:00:00 2001 From: anurag Date: Fri, 29 May 2026 05:00:44 -0600 Subject: [PATCH 05/17] use early quiesce --- src/mldebug/aie_overlay.py | 40 ++++++++++++++++++++++++++----------- src/mldebug/batch_runner.py | 28 ++------------------------ src/mldebug/client_debug.py | 15 ++++++++++++++ 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/src/mldebug/aie_overlay.py b/src/mldebug/aie_overlay.py index 95f98eb..9cb0444 100644 --- a/src/mldebug/aie_overlay.py +++ b/src/mldebug/aie_overlay.py @@ -35,9 +35,9 @@ def __init__(self, args, layout): self.aie_iface = args.aie_iface self.stamps = {} self.impls = {} - self.layout = self._get_layout(args.overlay, layout, args.run_flags.multistamp) + batches, stamps_per_batch, ncol, nrow = self._get_layout(args.overlay, layout) - batches, stamps_per_batch, ncol, nrow = self.layout + # 
Materialize tiles for every physical replica so dropped ones stay quiescible. for b in range(batches): for s in range(stamps_per_batch): replica_id = b * stamps_per_batch + s @@ -48,7 +48,14 @@ def __init__(self, args, layout): tiles.append((col, row)) self.stamps[replica_id] = tiles - def _get_layout(self, args_overlay, layout, is_multistamp): + # Without `multistamp`, collapse to one active replica so LayerInfo/DebugState/ + # backends size to it; extras stay in self.stamps (see get_inactive_tiles). + if args.run_flags.multistamp: + self.layout = (batches, stamps_per_batch, ncol, nrow) + else: + self.layout = (1, 1, ncol, nrow) + + def _get_layout(self, args_overlay, layout): """ Determine the overlay layout parameters as (batches, stamps, ncol, nrow). @@ -79,9 +86,6 @@ def _get_layout(self, args_overlay, layout, is_multistamp): # Legacy form: [stamps, R, C]; batches encoded by caller into stamps stamps_per_batch, nrow, ncol = layout - if not is_multistamp: - batches, stamps_per_batch = (1,1) - print("[INFO] Using Layout: ", batches, stamps_per_batch, ncol, nrow) return batches, stamps_per_batch, ncol, nrow @@ -108,8 +112,7 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False): tile_type (str, optional): Tile type identifier for filtering. If None, returns all tile positions. stamp_id (int, optional): Replica id to filter tiles by. Defaults to 0. - raw (bool, optional): If True, return all tile positions for all - replicas, unfiltered. 
+ raw (bool, optional): If True, return all tiles for all active replicas. Returns: list[tuple]: List of (column, row) tile coordinates corresponding to @@ -117,8 +120,8 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False): """ tile_list = [] if raw: - for stamp in self.stamps.values(): - tile_list.extend(stamp) + for sid in self.get_stampids(): + tile_list.extend(self.stamps[sid]) else: tile_list = self.stamps[stamp_id] if not tile_type: @@ -127,12 +130,25 @@ def get_tiles(self, tile_type=None, stamp_id=0, raw=False): def get_stampids(self): """ - Get a list of all configured replica ids in the overlay. + Get a list of the active replica ids. In single-stamp mode this is [0]. Returns: list[int]: List of integer replica ids (length = batches * stamps). """ - return list(self.stamps.keys()) + return list(range(self.get_replica_count())) + + def get_inactive_tiles(self): + """ + Tiles for physical replicas that exist in the design but fall outside the + active view (every replica beyond replica 0 when multistamp is disabled). + + Returns: + list[tuple]: (column, row) tiles to be quiesced. Empty in multistamp mode. + """ + tiles = [] + for sid in range(self.get_replica_count(), len(self.stamps)): + tiles.extend(self.stamps[sid]) + return tiles def get_replica_count(self): """ diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index dd9e8a7..90c4040 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -57,33 +57,9 @@ def __init__(self, args, state, design_info, impls, aie_utls, def common_init(self): """ - Common initialization for batch and interactive modes. - - Collapses to a single replica (batch 0, stamp 0) when the `multistamp` - flag is not set, enables PC halt for all replicas, and initializes - skip-iteration support. + Enable PC halt and skip-iteration support for each active replica. The + single-stamp collapse is handled up front by the Overlay. 
""" - if not self.args.run_flags.multistamp and self.design_info.overlay.get_stampcount() > 1: - for layer in self.design_info.layers: - layer.stamps[:] = layer.stamps[:1] - layer.stamps_per_batch = 1 - layer.num_batches = 1 - for u in self.aie_utls[1:]: - u.initialize_stamp() - # In-place list modification so all holders of these references see the change - del self.aie_utls[1:] - del self.impls[1:] - # Drop both batch and stamp dims to 1, keeping (C, R) intact. - _, _, ncol, nrow = self.design_info.overlay.layout - self.design_info.overlay.layout = (1, 1, ncol, nrow) - self.design_info.overlay.stamps = {0: self.design_info.overlay.stamps[0]} - # Keep DebugState in sync with the collapsed view so per-batch helpers - # behave correctly (S=1 makes every replica leftmost-in-batch). - self.state.stamps_per_batch = 1 - self.design_info.num_batches = 1 - self.design_info.num_stamps = 1 - LOGGER.log("[INFO] Using single stamp control. Please use multistamp flag for more data.") - for sid in self.design_info.overlay.get_stampids(): self.impls[sid].enable_pc_halt() if self.args.run_flags.skip_iter: diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py index 8397e46..cb4f055 100644 --- a/src/mldebug/client_debug.py +++ b/src/mldebug/client_debug.py @@ -90,6 +90,8 @@ def __init__(self, args, ctx_id, pid, output_dir): ) ) + self._quiesce_inactive_stamps() + self.impl = self.impls[0] self.status_handle = AIEStatus( self.impl, self.design_info.overlay.get_tiles, args.aie_iface, self.design_info.overlay.get_repr() @@ -113,6 +115,19 @@ def __init__(self, args, ctx_id, pid, output_dir): self.dumper.debug_server.close() sys.exit(0) + def _quiesce_inactive_stamps(self): + """ + Clear debug-control registers on physical replicas excluded from the active + view so they run freely + """ + inactive_tiles = self.design_info.overlay.get_inactive_tiles() + if not inactive_tiles: + return + AIEUtil( + self.args.aie_iface, self.impls[0], inactive_tiles, 
self.design_info.work_dir.globals[0] + ).initialize_stamp() + LOGGER.log("[INFO] Using single stamp control. Please use multistamp flag for more data.") + # --- Batch mode delegation --- def execute_and_dump(self): From 1be8a0ea622dfdcf15711dc23a1d8f83cae6ccb7 Mon Sep 17 00:00:00 2001 From: anurag Date: Fri, 29 May 2026 10:11:50 -0600 Subject: [PATCH 06/17] simplify quiesce --- src/mldebug/aie_util.py | 12 ++++++++---- src/mldebug/batch_runner.py | 2 +- src/mldebug/client_debug.py | 9 ++------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/mldebug/aie_util.py b/src/mldebug/aie_util.py index 1487a20..4210a7c 100644 --- a/src/mldebug/aie_util.py +++ b/src/mldebug/aie_util.py @@ -277,12 +277,16 @@ def read_control_instr(self): for c, r in self._filter_tiles(self.aie_iface.MEM_TILE_T) } - def initialize_stamp(self): + def initialize_stamp(self, tiles=None): """ - Initialize and clear DEBUG_CONTROL1 and DEBUG_CONTROL0 registers for all AIE tiles - belonging to the overlay instance (usually at the start of execution for multi-stamp). + Clear DEBUG_CONTROL1 and DEBUG_CONTROL0 to unhalt the specified tiles. + + Args: + tiles (list[tuple], optional): (col, row) tiles to clear. Default: this stamp's tiles. 
""" - for c, r in self._filter_tiles(self.aie_iface.AIE_TILE_T): + if tiles is None: + tiles = self.tiles + for c, r in self.aie_iface.filter_tiles(self.aie_iface.AIE_TILE_T, tiles): self.impl.write_register(c, r, self.aie_iface.Core_registers["DEBUG_CONTROL1"], 0) self.impl.write_register(c, r, self.aie_iface.Core_registers["DEBUG_CONTROL0"], 0) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 90c4040..9ebf7f1 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -465,7 +465,7 @@ def execute_and_dump(self): for sid in overlay.get_stampids(): self.aie_utls[sid].initialize_stamp() - self.impls[sid].continue_aie() + LOGGER.log("\nFinished Execution") self._handle_fsp() self._write_run_summary("SUCCESS") diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py index cb4f055..5cc01f8 100644 --- a/src/mldebug/client_debug.py +++ b/src/mldebug/client_debug.py @@ -123,9 +123,7 @@ def _quiesce_inactive_stamps(self): inactive_tiles = self.design_info.overlay.get_inactive_tiles() if not inactive_tiles: return - AIEUtil( - self.args.aie_iface, self.impls[0], inactive_tiles, self.design_info.work_dir.globals[0] - ).initialize_stamp() + self.aie_utls[0].initialize_stamp(inactive_tiles) LOGGER.log("[INFO] Using single stamp control. Please use multistamp flag for more data.") # --- Batch mode delegation --- @@ -310,10 +308,7 @@ def init_leftmost_stamp(self): For stamps with index > 1, initialize the stamp and continue execution. 
""" self.impls[0].enable_pc_halt() - for sid, impl in enumerate(self.impls): - if sid > 0: - self.aie_utls[sid].initialize_stamp() - impl.continue_aie() + self._quiesce_inactive_stamps() def wreg_stamp(self, offset, val, sid=0): """ From ce1ead792e5a02d6c8b782213fb423bc2c8f4b31 Mon Sep 17 00:00:00 2001 From: anurag Date: Fri, 29 May 2026 13:20:51 -0600 Subject: [PATCH 07/17] reset state Signed-off-by: anurag --- src/mldebug/batch_runner.py | 2 ++ src/mldebug/debug_state.py | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 9ebf7f1..4b69ec4 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -432,6 +432,8 @@ def run_layer(self, layer, target_itr=None, cur_it=None): overlay = self.design_info.overlay total_replicas = len(self.state.pm_reload) if total_replicas > 1 and (target_itr is None or target_itr == layer.lcp.num_iter): + for sid, _pml, _stamp in stamps: + self.state.break_on_stamp_scheduled[sid] = False for sid in range(total_replicas): if overlay.is_leftmost_in_batch(sid): continue diff --git a/src/mldebug/debug_state.py b/src/mldebug/debug_state.py index decaed1..9ffc2dd 100644 --- a/src/mldebug/debug_state.py +++ b/src/mldebug/debug_state.py @@ -16,12 +16,9 @@ def __init__(self, layers, stampcount, stamps_per_batch=1) -> None: Initialize the DebugState object. Args: - layers (list): The list of layer objects that define the AIE execution steps. - stampcount (int): The total number of replicas (batches * stamps_per_batch). - stamps_per_batch (int): Number of stamps within a single batch (S from - the BxSxCxR overlay). Used by get_next_layer_for_stamp to convert a - flat replica id into a per-batch stamp index when checking layer - participation. + layers (list): In order BE layer list + stampcount (int): Number of replicas (batches * stamps_per_batch). + stamps_per_batch (int): Number of stamps within a single batch (S from BxSxCxR). 
""" self.current_layer = -1 self.cur_it = 1 From 336f29f61aceb4997bd36e34e2ac14d1bcd90125 Mon Sep 17 00:00:00 2001 From: anurag Date: Fri, 29 May 2026 13:52:54 -0600 Subject: [PATCH 08/17] reset state Signed-off-by: anurag --- src/mldebug/batch_runner.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 4b69ec4..32a42e9 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -207,22 +207,28 @@ def schedule_layer_start(self, next_layer): # Step to layer 3 : step to all 3 stamps for sid, pml in enumerate(self.state.pm_reload): target_layer = stamp_target_layers.get(sid) - is_leftmost = overlay.is_leftmost_in_batch(sid) - if not target_layer or (not is_leftmost and self.state.break_on_stamp_scheduled[sid]): + if not target_layer: continue - self.state.break_on_stamp_scheduled[sid] = True - if pml: - if target_layer.layer_order != next_layer.layer_order: - LOGGER.log( - f"\nArming PM RELOAD on stamp {sid} for Layer_{target_layer.layer_order} " - ) - else: - LOGGER.log(f"\nPM RELOAD on stamp: {sid}") + is_leftmost = overlay.is_leftmost_in_batch(sid) + reaches_now = target_layer.layer_order == next_layer.layer_order + already_armed = not is_leftmost and self.state.break_on_stamp_scheduled[sid] stamp = target_layer.get_stamp(sid) - skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc) - self._set_layer_breakpoint(target_layer, skip_end_pc, sid, pml) - bes_to_run.append(self.impls[sid]) - if target_layer.layer_order == next_layer.layer_order: + + if not already_armed: + self.state.break_on_stamp_scheduled[sid] = True + if pml: + if not reaches_now: + LOGGER.log( + f"\nArming PM RELOAD on stamp {sid} for Layer_{target_layer.layer_order} " + ) + else: + LOGGER.log(f"\nPM RELOAD on stamp: {sid}") + skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc) + self._set_layer_breakpoint(target_layer, skip_end_pc, 
sid, pml) + bes_to_run.append(self.impls[sid]) + + # We have reached previously scheduled breakpoint + if reaches_now: bes_to_poll.append(self.impls[sid]) active_stamps_all_batches.append((sid, pml, stamp)) @@ -432,8 +438,6 @@ def run_layer(self, layer, target_itr=None, cur_it=None): overlay = self.design_info.overlay total_replicas = len(self.state.pm_reload) if total_replicas > 1 and (target_itr is None or target_itr == layer.lcp.num_iter): - for sid, _pml, _stamp in stamps: - self.state.break_on_stamp_scheduled[sid] = False for sid in range(total_replicas): if overlay.is_leftmost_in_batch(sid): continue From e6c010ec4ac6e3be359fd5899c11ee91cd219787 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 10:04:20 -0600 Subject: [PATCH 09/17] revert unnecessary change Signed-off-by: anurag --- src/mldebug/layer_info.py | 44 ++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 5fe0338..7eff6a0 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -281,15 +281,13 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor # Fill missing TG metadata from mladf report if self.lcp.is_tg: - for s, stamp in enumerate(self.stamps): - sk_name = mladf_report.get_skname_for_bilo(self.layer_order, s) - elf_name = mladf_report.get_elfid_for_bilo(self.layer_order, s) - if not sk_name or elf_name == -1 or any(k in sk_name for k in unsupported_superkernels): - LOGGER.verbose_print(f"[WARNING] unsupported kernel {sk_name} at Layer {self.layer_order} will be skipped.") + for sid, stamp in enumerate(self.stamps): + stamp.name = mladf_report.get_skname_for_bilo(self.layer_order, sid) + stamp.elf_name = mladf_report.get_elfid_for_bilo(self.layer_order, sid) + if not stamp.name or stamp.elf_name == -1 or any(k in stamp.name for k in unsupported_superkernels): + LOGGER.verbose_print(f"[WARNING] unsupported kernel {stamp.name} at 
Layer {self.layer_order} will be skipped.") self.is_unsupported = True return - stamp.name = sk_name - stamp.elf_name = elf_name self.lcp.num_iter = mladf_report._get_iters_for_bilo(self.layer_order) self._initialize_l3_buffers(info, version) @@ -863,15 +861,14 @@ def _initialize_layers_from_workdir_x2(self, args): self.layers = [layer for layer in self.layers if not layer.lcp.is_tg] if not self.layers: raise RuntimeError("No layers found in the design.") - # Resolve PCs once per per-batch stamp index. Batches share the same - # ELF/PCs so no mirroring is needed on the Layer's stamps list. - for s in range(self.num_stamps): + # Resolve PCs once per stamp + for sid in range(self.num_stamps): for layer in self.layers: - flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[s].values())[0] - self.layer_workdir_map[layer.layer_order].pm_reload_en[s] = True + flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[sid].values())[0] + self.layer_workdir_map[layer.layer_order].pm_reload_en[sid] = True for f in flist: - if _strip_template(layer.stamps[s].name.lower()) == _strip_template(f.name.lower()): - stamp = layer.stamps[s] + if _strip_template(layer.stamps[sid].name.lower()) == _strip_template(f.name.lower()): + stamp = layer.stamps[sid] LOGGER.verbose_print("Layer found:", layer.layer_order, stamp.name, f.start_pc) stamp.elf_name = layer.pm_work_dir stamp.start_pc = f.start_pc @@ -906,18 +903,17 @@ def _initialize_layers_from_workdir(self, args): # Hierarchy of Data: # Stamp <- Elf <- Layers # AIECompiler only knows flexmlIDs so we use that to match with correct layer - # Resolve PCs once per per-batch stamp index (s). Batch copies share - # the same ELFs and PCs so no extra resolution is needed. 
- for s in range(self.num_stamps): - has_pm_reload = self.work_dir.pm_reload_en[s] - for elf_name, flist in self.work_dir.aie_functions[s].items(): - LOGGER.verbose_print(f"Initializing layers for stamp {s} ELF: {elf_name}") + # Resolve PCs once per stamp index (sid). + for sid in range(self.num_stamps): + has_pm_reload = self.work_dir.pm_reload_en[sid] + for elf_name, flist in self.work_dir.aie_functions[sid].items(): + LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}") elf_id = elf_name.split("reloadable")[-1] for f, l in itertools.product(flist, self.layers): - if s > len(l.stamps) - 1: + if sid > len(l.stamps) - 1: continue - if _strip_template(l.stamps[s].name.lower()) == _strip_template(f.name.lower()): - stamp = l.stamps[s] + if _strip_template(l.stamps[sid].name.lower()) == _strip_template(f.name.lower()): + stamp = l.stamps[sid] if l.lcp.is_tg and stamp.elf_name == elf_id: stamp.start_pc = f.start_pc if f.name.lower() not in skip_end_pc_kernels: @@ -925,7 +921,7 @@ def _initialize_layers_from_workdir(self, args): continue # Check if this layer is present in the elf # In buffer_info the flexml_ids might not be in order of stamps - if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[s][elf_id] for i in l.flexml_ids): + if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[sid][elf_id] for i in l.flexml_ids): continue LOGGER.verbose_print("Layer found:", l.layer_order, stamp.name) stamp.elf_name = elf_id From a5867848cf68cece658849090a5042d3d07f4e73 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 10:19:19 -0600 Subject: [PATCH 10/17] simplify more Signed-off-by: anurag --- src/mldebug/layer_info.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 7eff6a0..e05771a 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -491,8 +491,6 @@ def __init__(self, args): self.x2 = False 
self.x2_work_dirs = {} self.layer_workdir_map = {} - self.num_batches = 1 - self.num_stamps = 1 self.mladf_report = None has_bi = args.buffer_info and Path(args.buffer_info).is_file() @@ -504,15 +502,15 @@ def __init__(self, args): # 2. Initialize Overlay from Layout self.overlay = Overlay(args, self.layout) # Re-sync local view in case Overlay applied -o overrides. - self.num_batches = self.overlay.get_batch_count() - self.num_stamps = self.overlay.get_stamps_per_batch() + num_batches = self.overlay.get_batch_count() + num_stamps = self.overlay.get_stamps_per_batch() # 3. Parse mladf report. # TBD: memory optimize this as this json can be large if not args.aie_only and has_bi and use_mladf: self.mladf_report = MladfReport(args.buffer_info, args.mladf_report, self.overlay.get_stampwidth()) # 4. Initialize Layers if not args.aie_only: - self._init_layers(data, args.aie_iface, self.num_stamps, self.num_batches) + self._init_layers(data, args.aie_iface, num_stamps, num_batches) # 5: Parse work dir if self.x2: for layer in self.layers: @@ -565,7 +563,7 @@ def is_stamped(self): Returns: bool: True if stamps_per_batch > 1, False otherwise. """ - return self.num_stamps > 1 + return self.overlay.get_stamps_per_batch() > 1 def is_batched(self): """ @@ -574,7 +572,7 @@ def is_batched(self): Returns: bool: True if more than one batch, False otherwise. """ - return self.num_batches > 1 + return self.overlay.get_batch_count() > 1 def _create_info(self): """ @@ -686,7 +684,7 @@ def initialize_l3_layer_mapping(self, flexmlrt_hsi, external_buffer_id): # replicate across batches, not across per-batch stamps. if self.is_batched(): original_buffers = list(layer.l3_buffers) - for b in range(1, self.num_batches): + for b in range(1, self.overlay.get_batch_count()): for orig_buffer in original_buffers: stamped_buffer = L3Buffer( name=f"{orig_buffer.name}_stamp_{b}", @@ -783,8 +781,7 @@ def _read_buffer_info(self, buffer_info_file): Returns: dict: Parsed JSON object from file. 
Side Effects: - - Sets self.layout to (B, S, R, C), self.num_batches, self.num_stamps, - self.x2. + - Sets self.layout to (B, S, R, C), self.x2. """ print("Initializing Buffer Info ...") with open(buffer_info_file, encoding="utf-8") as fd: @@ -805,8 +802,6 @@ def _read_buffer_info(self, buffer_info_file): else: stamps = overlay_stamps - self.num_batches = batches - self.num_stamps = stamps self.layout = (batches, stamps, nrow, ncol) if batches > 1: LOGGER.log("Batched design detected") @@ -862,7 +857,7 @@ def _initialize_layers_from_workdir_x2(self, args): if not self.layers: raise RuntimeError("No layers found in the design.") # Resolve PCs once per stamp - for sid in range(self.num_stamps): + for sid in range(self.overlay.get_stamps_per_batch()): for layer in self.layers: flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[sid].values())[0] self.layer_workdir_map[layer.layer_order].pm_reload_en[sid] = True @@ -903,8 +898,8 @@ def _initialize_layers_from_workdir(self, args): # Hierarchy of Data: # Stamp <- Elf <- Layers # AIECompiler only knows flexmlIDs so we use that to match with correct layer - # Resolve PCs once per stamp index (sid). - for sid in range(self.num_stamps): + # Resolve PCs once per stamp. 
+ for sid in range(self.overlay.get_stamps_per_batch()): has_pm_reload = self.work_dir.pm_reload_en[sid] for elf_name, flist in self.work_dir.aie_functions[sid].items(): LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}") From 39c4b4f5e20dcdfc4357e8e876e499ea0479ea38 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 10:40:14 -0600 Subject: [PATCH 11/17] pass arch instead of guessing Signed-off-by: anurag --- src/mldebug/arch/aie2p_defs.py | 4 +++- src/mldebug/arch/aie2ps_defs.py | 6 ++++-- src/mldebug/layer_info.py | 7 ++++--- src/mldebug/work_dir.py | 31 +++++++++++++++++-------------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/mldebug/arch/aie2p_defs.py b/src/mldebug/arch/aie2p_defs.py index 855c25b..07801d9 100644 --- a/src/mldebug/arch/aie2p_defs.py +++ b/src/mldebug/arch/aie2p_defs.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. """ AIE2/AIE2P Specific Defs @@ -12,6 +12,8 @@ MEM_TILE_T = "mem_tile" TILE_TYPES = [AIE_TILE_T, SHIM_TILE_T, MEM_TILE_T] +ARCH_NAME = "aie2p" + AIE_TILE_ROW_OFFSET = 2 MEM_TILE_SZ = 0x80000 HAS_UC_MODULE = False diff --git a/src/mldebug/arch/aie2ps_defs.py b/src/mldebug/arch/aie2ps_defs.py index 02db650..f2171eb 100644 --- a/src/mldebug/arch/aie2ps_defs.py +++ b/src/mldebug/arch/aie2ps_defs.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. 
""" -AIE2/AIE2P Specific Defs +AIE2PS Specific Defs """ import json @@ -12,6 +12,8 @@ MEM_TILE_T = "mem_tile" TILE_TYPES = [AIE_TILE_T, SHIM_TILE_T, MEM_TILE_T] +ARCH_NAME = "aie2ps" + AIE_TILE_ROW_OFFSET = 3 MEM_TILE_SZ = 0x80000 HAS_UC_MODULE = True diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index e05771a..c23045b 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -111,7 +111,7 @@ def __init__(self, entry, buf_type, size_shift, aie_iface, ifm=False, ofm=False, ping = entry["l1_ping"] pong = entry["l1_pong"] self.l1 = L1Buffer(int(ping[0], 16), ping[1] * size_shift, int(pong[0], 16), pong[1] * size_shift) - + # Handle both "l2" format and "l2_ping/l2_pong" format l2_bufs_list = [] if "l2_ping" in entry: @@ -517,11 +517,12 @@ def __init__(self, args): if layer.pm_work_dir: path = os.path.join(args.aie_dir, layer.pm_work_dir) if layer.pm_work_dir not in self.x2_work_dirs: - self.x2_work_dirs[layer.pm_work_dir] = WorkDir(path, args.peano, self.overlay) + self.x2_work_dirs[layer.pm_work_dir] = WorkDir(path, args.peano, self.overlay, self.aie_iface.ARCH_NAME) self.layer_workdir_map[layer.layer_order] = self.x2_work_dirs[layer.pm_work_dir] self.work_dir = next(iter(self.layer_workdir_map.values())) else: - self.work_dir = WorkDir(args.aie_dir, args.peano, self.overlay, args.run_flags.dump_temps) + self.work_dir = WorkDir(args.aie_dir, args.peano, self.overlay, + self.aie_iface.ARCH_NAME, args.run_flags.dump_temps) if not args.aie_only: # Set PC Value for layers diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index e44544f..87c41b9 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -78,7 +78,7 @@ class WorkDir: Abstraction for AIE Work Directory """ - def __init__(self, aie_dir, peano, overlay, dump_lst=False): + def __init__(self, aie_dir, peano, overlay, arch_name, dump_lst=False): """ Initialize the AIE Work Directory abstraction. Sets up internal state and parses functions. 
Args: @@ -95,7 +95,6 @@ def __init__(self, aie_dir, peano, overlay, dump_lst=False): self.globals = [None] * num_stamps self.peano = peano self.aie_dir = aie_dir - self.dump_lst = dump_lst # Lock acquire instruction PC after layer execution # This pc can be used for skip_iter self.post_layer_lock_acq_pcs = [0] * num_stamps @@ -104,7 +103,7 @@ def __init__(self, aie_dir, peano, overlay, dump_lst=False): for sid in range(num_stamps): self._stamp_lst_map[sid] = [] - self._initialize_functions(aie_dir, overlay) + self._initialize_functions(aie_dir, overlay, arch_name, dump_lst) def _check_for_lock_acq(self, line, sid, llvm): """ @@ -165,34 +164,34 @@ def _parse_aie_runtime_control(self, work_dir, col, row, stampid): elf_layer_map[par].append(layeridx) self.elf_flxmlid_maps[stampid] = elf_layer_map - def _get_lst(self, elf_path, elf_name): + def _get_lst(self, elf_path, elf_name, arch_name, dump_lst): """ Generate and fetch a disassembly listing (lst) for an ELF file using llvm-objdump. Args: elf_path (str): Path to the ELF binary. elf_name (str): Base ELF file name (stem). + arch_name (str): Target architecture name passed to llvm-objdump. + dump_lst (bool): Whether to write the output listing to disk. Returns: str: Decoded assembly listing as text. Side effects: - If self.dump_lst is True, writes the output listing to disk. + If dump_lst is True, writes the output listing to disk. 
""" lst_data = "" exe = "llvm-objdump.elf" - archname = "aie2p" if is_windows(): exe = "llvm-objdump.exe" elif is_aarch64(): exe = "llvm-objdump.aarch64" - archname = "aie2ps" with resources.as_file(resources.files("mldebug") / "bin" / exe) as objdump_path: lst = subprocess.check_output( - [str(objdump_path), "-d", "-z", "--no-show-raw-insn", f"--arch-name={archname}", "-C", elf_path] + [str(objdump_path), "-d", "-z", "--no-show-raw-insn", f"--arch-name={arch_name}", "-C", elf_path] ) lst_data = lst.decode("utf-8") - if self.dump_lst: + if dump_lst: fname = elf_name + ".lst" print("Writing assembly listing to:", fname) with open(fname, "w", encoding="utf8") as fd: @@ -259,7 +258,7 @@ def _breakpoint_allowed(self, lines, i): return False return True - def _initialize_functions(self, work_dir, overlay): + def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst): """ Parse work directory and its ELF files to extract function ranges, tail calls, global variables, and layer/partition info. @@ -272,6 +271,8 @@ def _initialize_functions(self, work_dir, overlay): Args: work_dir (str): Path to the AIE work directory. overlay: Overlay object for tile mapping. + arch_name (str): Target architecture name passed to llvm-objdump. + dump_lst (bool): Whether to write disassembly listings to disk. Side effects: Populates aie_functions, pm_reload_en, globals, elf_flxmlid_maps. """ @@ -304,7 +305,7 @@ def _initialize_functions(self, work_dir, overlay): print(f"[WARNING] Failed to parse LST for {p}. 
Assuming peano compiler.") self.peano = True if self.peano: - self._parse_lst_llvm(p, s) + self._parse_lst_llvm(p, s, arch_name, dump_lst) # Parse map file to find LCP # Only base map file has global variables @@ -530,7 +531,7 @@ def _extract_var(lines, var_name): _extract_var(lines, "lcpPing") _extract_var(lines, "lcpPong") - def _parse_lst_llvm(self, elf, stampid): + def _parse_lst_llvm(self, elf, stampid, arch_name, dump_lst): """ Parse LLVM-based LST disassembly to extract functions, boundaries, final lock release instructions, and tail call status. @@ -538,13 +539,15 @@ def _parse_lst_llvm(self, elf, stampid): Args: elf (Path): Path object for the ELF file directory. stampid (int): Index into aie_functions. + arch_name (str): Target architecture name passed to llvm-objdump. + dump_lst (bool): Whether to write disassembly listings to disk. Side effects: Populates self.aie_functions[stampid][elf_name] with AIEFunction objects. """ elf_name = elf.stem elf_path = f"{elf}/Release/{elf.stem}" - data = self._get_lst(elf_path, elf_name) - self._stamp_lst_map[stampid].append((elf_name, self._get_lst(elf_path, elf_name))) + data = self._get_lst(elf_path, elf_name, arch_name, dump_lst) + self._stamp_lst_map[stampid].append((elf_name, data)) lines = data.split("\n") is_base = "reloadable" not in elf_name From 39b0446a69e5f824c7d85589490432c910f61c79 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 12:43:36 -0600 Subject: [PATCH 12/17] simplify batchwise access for stamps Signed-off-by: anurag --- src/mldebug/batch_runner.py | 4 +- src/mldebug/client_debug.py | 2 +- src/mldebug/layer_info.py | 12 +-- src/mldebug/work_dir.py | 152 +++++++++++++++++++----------------- 4 files changed, 90 insertions(+), 80 deletions(-) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 32a42e9..1ae117a 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -135,7 +135,7 @@ def check_pm_reload(self, stamp_id=0): True if program memory 
reload will occur at the next layer, False otherwise. """ layer = self.state.layers[self.state.current_layer] - if not self.design_info.work_dir.pm_reload_en[stamp_id] or self.state.current_layer + 1 >= len(self.state.layers): + if not self.design_info.work_dir.stamp(stamp_id).pm_reload_en or self.state.current_layer + 1 >= len(self.state.layers): return False if self.design_info.overlay.is_leftmost_in_batch(stamp_id): @@ -388,7 +388,7 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1): self.state.error = not utl.skip_iterations(target_itr - cur_it, sid) elif self.args.run_flags.skip_iter2: self.state.error = not utl.skip_iterations_to_lock_acq( - self.design_info.work_dir.post_layer_lock_acq_pcs[sid], target_itr - cur_it, sid) + self.design_info.work_dir.stamp(sid).post_layer_lock_acq_pc, target_itr - cur_it, sid) else: while cur_it < target_itr: self.hit_next_breakpoint(sid) diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py index 5cc01f8..5e15f42 100644 --- a/src/mldebug/client_debug.py +++ b/src/mldebug/client_debug.py @@ -86,7 +86,7 @@ def __init__(self, args, ctx_id, pid, output_dir): self.impls.append(impl) self.aie_utls.append( AIEUtil( - args.aie_iface, impl, self.design_info.overlay.get_tiles(stamp_id=i), self.design_info.work_dir.globals[i] + args.aie_iface, impl, self.design_info.overlay.get_tiles(stamp_id=i), self.design_info.work_dir.stamp(i).globals ) ) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index c23045b..7177e3c 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -607,7 +607,7 @@ def print_info(self): sep = "--------------------------------------------" m = "Design info (Excluding TG Layer IDs)\n" m += f"{sep}\nFlexml Layer Count: {len(self.layers)}\n{sep}" - if not self.work_dir.elf_flxmlid_maps or not self.layers: + if not self.work_dir.stamps or not self.layers: return for sid, imap in info.items(): m += f"\nStamp {sid}: " @@ -860,8 +860,8 @@ def 
_initialize_layers_from_workdir_x2(self, args): # Resolve PCs once per stamp for sid in range(self.overlay.get_stamps_per_batch()): for layer in self.layers: - flist = list(self.layer_workdir_map[layer.layer_order].aie_functions[sid].values())[0] - self.layer_workdir_map[layer.layer_order].pm_reload_en[sid] = True + flist = list(self.layer_workdir_map[layer.layer_order].stamps[sid].aie_functions.values())[0] + self.layer_workdir_map[layer.layer_order].stamps[sid].pm_reload_en = True for f in flist: if _strip_template(layer.stamps[sid].name.lower()) == _strip_template(f.name.lower()): stamp = layer.stamps[sid] @@ -901,8 +901,8 @@ def _initialize_layers_from_workdir(self, args): # AIECompiler only knows flexmlIDs so we use that to match with correct layer # Resolve PCs once per stamp. for sid in range(self.overlay.get_stamps_per_batch()): - has_pm_reload = self.work_dir.pm_reload_en[sid] - for elf_name, flist in self.work_dir.aie_functions[sid].items(): + has_pm_reload = self.work_dir.stamps[sid].pm_reload_en + for elf_name, flist in self.work_dir.stamps[sid].aie_functions.items(): LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}") elf_id = elf_name.split("reloadable")[-1] for f, l in itertools.product(flist, self.layers): @@ -917,7 +917,7 @@ def _initialize_layers_from_workdir(self, args): continue # Check if this layer is present in the elf # In buffer_info the flexml_ids might not be in order of stamps - if has_pm_reload and not any(i in self.work_dir.elf_flxmlid_maps[sid][elf_id] for i in l.flexml_ids): + if has_pm_reload and not any(i in self.work_dir.stamps[sid].elf_flxmlid_maps[elf_id] for i in l.flexml_ids): continue LOGGER.verbose_print("Layer found:", l.layer_order, stamp.name) stamp.elf_name = elf_id diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 87c41b9..69fb47b 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -9,7 +9,7 @@ import re import subprocess from importlib import resources -from 
dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from mldebug.extra.calltree import AIECallTree @@ -53,6 +53,30 @@ class GlobalVar: size: int +@dataclass +class StampInfo: + """ + Per-stamp data parsed from the work directory. + + One instance exists per per-batch stamp (S of them). Batch copies run the + same ELFs, so callers map a flat replica id (b*S + s) back to its stamp via + WorkDir.stamp(sid) rather than storing B*S duplicates. + """ + + # True when the stamp has reloadable ELFs (program-memory reload). + pm_reload_en: bool = False + # elf_name -> list[AIEFunction] + aie_functions: dict = field(default_factory=dict) + # elf partition -> list of flexml layer ids (only set when pm reload). + elf_flxmlid_maps: dict = {} + # list[GlobalVar] for lcpPing/lcpPong (None until first var is found). + globals: list = [] + # Lock acquire instruction PC after layer execution (used for skip_iter). + post_layer_lock_acq_pc: int = 0 + # list[(elf_name, lst_text)] captured during LLVM parsing. + lst_map: list = field(default_factory=list) + + def _parse_flexml_layer_id(objstr): """ Parse a layer index from an object string using the current BE naming convention. @@ -87,30 +111,31 @@ def __init__(self, aie_dir, peano, overlay, arch_name, dump_lst=False): overlay: Overlay object with get_stampids() and get_first_relative_core_tile(). dump_lst (bool): Whether to dump LST files. 
""" - num_stamps = len(overlay.get_stampids()) - - self.pm_reload_en = [False] * num_stamps - self.aie_functions = [None] * num_stamps - self.elf_flxmlid_maps = [None] * num_stamps - self.globals = [None] * num_stamps self.peano = peano self.aie_dir = aie_dir - # Lock acquire instruction PC after layer execution - # This pc can be used for skip_iter - self.post_layer_lock_acq_pcs = [0] * num_stamps - - self._stamp_lst_map = {} - for sid in range(num_stamps): - self._stamp_lst_map[sid] = [] + self.stamps_per_batch = overlay.get_stamps_per_batch() + self.stamps = [StampInfo() for _ in range(self.stamps_per_batch)] self._initialize_functions(aie_dir, overlay, arch_name, dump_lst) + def stamp(self, sid): + """ + Map a flat replica id (b*S + s) back to its per-batch StampInfo. Batch + copies have the same ELFs, so all replicas of stamp s see the same data. + + Args: + sid (int): Flat replica id. + Returns: + StampInfo: The per-batch stamp info for this replica. + """ + return self.stamps[sid % self.stamps_per_batch] + def _check_for_lock_acq(self, line, sid, llvm): """ find lock acq in base lst """ if "acq" in line.lower(): - self.post_layer_lock_acq_pcs[sid] = self._get_pc(line, llvm) + self.stamps[sid].post_layer_lock_acq_pc = self._get_pc(line, llvm) def _demangle(self, fstring): """ @@ -140,7 +165,7 @@ def _parse_aie_runtime_control(self, work_dir, col, row, stampid): row (int): Row of the target core tile. stampid (int): Stamp index into overlay. Side effects: - Updates self.elf_flxmlid_maps[stampid] to map ELF partitions to layers. + Updates self.stamps[stampid].elf_flxmlid_maps to map ELF partitions to layers. 
""" elf_layer_map = {} # Elfs for different columns can be reloaded in same line so we have to create multiple groups @@ -162,7 +187,7 @@ def _parse_aie_runtime_control(self, work_dir, col, row, stampid): if layeridx in elf_layer_map[par]: break elf_layer_map[par].append(layeridx) - self.elf_flxmlid_maps[stampid] = elf_layer_map + self.stamps[stampid].elf_flxmlid_maps = elf_layer_map def _get_lst(self, elf_path, elf_name, arch_name, dump_lst): """ @@ -264,9 +289,9 @@ def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst): global variables, and layer/partition info. For batched + stamped designs we only parse one batch's worth of stamps - (S replicas) and mirror the parsed data across the remaining batch copies. - The same ELF binaries are loaded into the additional batch columns, so - the PCs and global addresses are identical. + (S replicas). The same ELF binaries are loaded into the additional batch + columns, so the PCs and global addresses are identical; callers reach the + batch copies through self.stamp(sid) (sid % S). Args: work_dir (str): Path to the AIE work directory. @@ -274,17 +299,14 @@ def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst): arch_name (str): Target architecture name passed to llvm-objdump. dump_lst (bool): Whether to write disassembly listings to disk. Side effects: - Populates aie_functions, pm_reload_en, globals, elf_flxmlid_maps. + Populates self.stamps[s] for each per-batch stamp. """ print("[INFO] Try to detect Work Directory ...") full_path = Path(work_dir + "/aie/") if not Path.exists(full_path): LOGGER.log(f"[INFO] Work directory {full_path} does not exist.") return - stamps_per_batch = overlay.get_stamps_per_batch() - batches = overlay.get_batch_count() - # Parse per-batch stamps once, then mirror across batches below. 
- for s in range(stamps_per_batch): + for s in range(self.stamps_per_batch): col, row = overlay.get_first_relative_core_tile(s) core_name = f"{col}_{row}" print(f"Core: {core_name}") @@ -292,9 +314,8 @@ def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst): for elf in full_path.glob(f"{core_name}*"): plist.append(elf) if len(plist) > 1: - self.pm_reload_en[s] = True + self.stamps[s].pm_reload_en = True self._parse_aie_runtime_control(work_dir, col, row, s) - self.aie_functions[s] = {} # Parse LST for p in plist: @@ -315,19 +336,6 @@ def _initialize_functions(self, work_dir, overlay, arch_name, dump_lst): else: self._extract_globals_chess(first_elf, s) - # Mirror per-batch data into batch 1..B-1 replica slots so all callers - # that index by flat replica id (b*S + s) see consistent data. - for b in range(1, batches): - for s in range(stamps_per_batch): - sid = b * stamps_per_batch + s - if sid >= len(self.pm_reload_en): - break - self.pm_reload_en[sid] = self.pm_reload_en[s] - self.aie_functions[sid] = self.aie_functions[s] - self.elf_flxmlid_maps[sid] = self.elf_flxmlid_maps[s] - self.globals[sid] = self.globals[s] - self._stamp_lst_map[sid] = self._stamp_lst_map[s] - def _parse_lst_chess(self, elf, stampid): """ Extract function boundaries, lock-release PCs, and tail call status from Chess @@ -340,7 +348,7 @@ def _parse_lst_chess(self, elf, stampid): Returns: bool: True if parsed successfully, False if the LST file doesn't exist. Side effects: - Populates self.aie_functions[stampid][elf_name] with AIEFunction objects. + Populates self.stamps[stampid].aie_functions[elf_name] with AIEFunction objects. 
""" elf_name = elf.stem lst_file = f"{elf}/Release/{elf_name}.lst" @@ -349,7 +357,7 @@ def _parse_lst_chess(self, elf, stampid): is_base = "reloadable" not in elf_name - self.aie_functions[stampid][elf_name] = [] + self.stamps[stampid].aie_functions[elf_name] = [] with open(lst_file, encoding="utf-8") as fd: lines = fd.read().split("\n") count = len(lines) @@ -399,7 +407,7 @@ def _parse_lst_chess(self, elf, stampid): i -= 1 break i += 1 - self.aie_functions[stampid][elf_name].append( + self.stamps[stampid].aie_functions[elf_name].append( AIEFunction(demangled, start_pc, end_pc, final_lock_release_pc, tail_call) ) i += 1 @@ -449,9 +457,9 @@ def _extract_globals_llvm(self, elf, sid): Args: elf (Path): Path object of the ELF directory. - sid (int): Index into self.globals for this stamp. + sid (int): Index into self.stamps for this stamp. Side effects: - Appends GlobalVar objects to self.globals[sid] for lcpPing/lcpPong if present. + Appends GlobalVar objects to self.stamps[sid].globals for lcpPing/lcpPong if present. """ mapfile_path = f"{elf}/Release/{elf.stem}.map" if not Path(mapfile_path).exists(): @@ -460,21 +468,21 @@ def _extract_globals_llvm(self, elf, sid): def _extract_var(lines, var_name): """ - Find and add a global variable by name from the given lines to self.globals[sid]. + Find and add a global variable by name from the given lines to self.stamps[sid].globals. Args: lines (List[str]): Lines of map file. var_name (str): Variable name to search for. Side effects: - Updates self.globals[sid]. + Updates self.stamps[sid].globals. 
""" - if not self.globals[sid]: - self.globals[sid] = [] + if not self.stamps[sid].globals: + self.stamps[sid].globals = [] for line in lines: if var_name in line: tokens = line.split() if len(tokens) >= 3: try: - self.globals[sid].append(GlobalVar(var_name, int(tokens[0], base=16), int(tokens[2], base=16))) + self.stamps[sid].globals.append(GlobalVar(var_name, int(tokens[0], base=16), int(tokens[2], base=16))) LOGGER.verbose_print(f"[INFO] Found global variable: {var_name} at {tokens[0]} size {tokens[2]}") except ValueError: pass # Ignore lines that cannot be parsed @@ -492,9 +500,9 @@ def _extract_globals_chess(self, elf, sid): Args: elf (Path): Path object of the ELF directory. - sid (int): Index into self.globals for this stamp. + sid (int): Index into self.stamps for this stamp. Side effects: - Appends GlobalVar objects to self.globals[sid] for lcpPing/lcpPong if present. + Appends GlobalVar objects to self.stamps[sid].globals for lcpPing/lcpPong if present. """ mapfile_path = f"{elf}/Release/{elf.stem}.map" if not Path(mapfile_path).exists(): @@ -503,15 +511,15 @@ def _extract_globals_chess(self, elf, sid): def _extract_var(lines, var_name): """ - Find and add a global variable by name from the given lines to self.globals[sid]. + Find and add a global variable by name from the given lines to self.stamps[sid].globals. Args: lines (List[str]): Lines of map file. var_name (str): Variable name to search for. Side effects: - Updates self.globals[sid]. + Updates self.stamps[sid].globals. 
""" - if not self.globals[sid]: - self.globals[sid] = [] + if not self.stamps[sid].globals: + self.stamps[sid].globals = [] for line in lines: if var_name in line: tokens = line.split()[0].split("..") @@ -520,7 +528,7 @@ def _extract_var(lines, var_name): start_addr = int(tokens[0], base=16) end_addr = int(tokens[1], base=16) size = end_addr - start_addr + 1 - self.globals[sid].append(GlobalVar(var_name, start_addr, size)) + self.stamps[sid].globals.append(GlobalVar(var_name, start_addr, size)) LOGGER.verbose_print(f"[INFO] Found global variable: {var_name} at {start_addr} size {size}") except ValueError: pass # Ignore lines that cannot be parsed @@ -542,18 +550,18 @@ def _parse_lst_llvm(self, elf, stampid, arch_name, dump_lst): arch_name (str): Target architecture name passed to llvm-objdump. dump_lst (bool): Whether to write disassembly listings to disk. Side effects: - Populates self.aie_functions[stampid][elf_name] with AIEFunction objects. + Populates self.stamps[stampid].aie_functions[elf_name] with AIEFunction objects. """ elf_name = elf.stem elf_path = f"{elf}/Release/{elf.stem}" data = self._get_lst(elf_path, elf_name, arch_name, dump_lst) - self._stamp_lst_map[stampid].append((elf_name, data)) + self.stamps[stampid].lst_map.append((elf_name, data)) lines = data.split("\n") is_base = "reloadable" not in elf_name - self.aie_functions[stampid][elf_name] = [] - flist = self.aie_functions[stampid][elf_name] + self.stamps[stampid].aie_functions[elf_name] = [] + flist = self.stamps[stampid].aie_functions[elf_name] in_func = None for i, line in enumerate(lines): # function call @@ -602,7 +610,7 @@ def find_functions_by_pc(self, pc): List[str]: List of ":" strings whose PC range covers the input. """ funclist = [] - fmap = self.aie_functions[0] + fmap = self.stamps[0].aie_functions if fmap: for elf, flist in fmap.items(): for func in flist: @@ -620,21 +628,23 @@ def print_aie_functions(self, elf_id=None): Side effects: Prints formatted function info to stdout. 
""" - if all(x is None for x in self.aie_functions): + if all(not si.aie_functions for si in self.stamps): print("No functions found in design. Please specify aiedir option.") return sep = "--------------------------------------------" if elf_id: - for fmap in self.aie_functions: + for si in self.stamps: + fmap = si.aie_functions if elf_id in fmap: print(f"{sep}\nFunctions in {elf_id}\n{sep}") for f in fmap[elf_id]: print(f) return - for stamp, fmap in enumerate(self.aie_functions): + for stamp, si in enumerate(self.stamps): + fmap = si.aie_functions if not fmap: continue print(f"{sep}\nElfs in Stamp: {stamp}\n{sep}") @@ -652,10 +662,10 @@ def print_calltree(self, sid=0): Args: sid (int): Stamp index. """ - if sid not in self._stamp_lst_map: - LOGGER.log(f"[ERROR] Stamp {sid} not found in _stamp_lst_map.") + if not 0 <= sid < self.stamps_per_batch: + LOGGER.log(f"[ERROR] Stamp {sid} out of range.") return - for elf_id, lst_content in self._stamp_lst_map[sid]: + for elf_id, lst_content in self.stamps[sid].lst_map: LOGGER.log(f"[INFO] Printing calltree for {elf_id}\n") tree = AIECallTree.from_string(lst_content) tree.print_calltree() @@ -668,10 +678,10 @@ def dump_lst_to_file(self, sid=0): Args: sid (int): Stamp index. 
""" - if sid not in self._stamp_lst_map: - LOGGER.log(f"[ERROR] Stamp {sid} not found in _stamp_lst_map.") + if not 0 <= sid < self.stamps_per_batch: + LOGGER.log(f"[ERROR] Stamp {sid} out of range.") return - for elf_id, lst_content in self._stamp_lst_map[sid]: + for elf_id, lst_content in self.stamps[sid].lst_map: with open(f"{elf_id}.lst", "w", encoding="utf-8") as fd: fd.write(lst_content) LOGGER.log(f"[INFO] LST file dumped to {elf_id}.lst") From 2ec310df05b32d6511c7854e9c6b2afa3d56433a Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 12:46:54 -0600 Subject: [PATCH 13/17] add factory Signed-off-by: anurag --- src/mldebug/work_dir.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 69fb47b..9ddb2f1 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -68,9 +68,9 @@ class StampInfo: # elf_name -> list[AIEFunction] aie_functions: dict = field(default_factory=dict) # elf partition -> list of flexml layer ids (only set when pm reload). - elf_flxmlid_maps: dict = {} + elf_flxmlid_maps: dict = field(default_factory=dict) # list[GlobalVar] for lcpPing/lcpPong (None until first var is found). - globals: list = [] + globals: list = field(default_factory=list) # Lock acquire instruction PC after layer execution (used for skip_iter). post_layer_lock_acq_pc: int = 0 # list[(elf_name, lst_text)] captured during LLVM parsing. 
From c62b6535f42d0461b81b3676474d871f89319a88 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 14:55:40 -0600 Subject: [PATCH 14/17] make pc resolving code readable Signed-off-by: anurag --- src/mldebug/layer_info.py | 63 +++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 7177e3c..b88125a 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -898,34 +898,47 @@ def _initialize_layers_from_workdir(self, args): # Hierarchy of Data: # Stamp <- Elf <- Layers - # AIECompiler only knows flexmlIDs so we use that to match with correct layer - # Resolve PCs once per stamp. + # AIECompiler only knows flexmlIDs so we use that to match with correct layer. + # For each layer we pick the ELF its kernel lives in, then fill in the PCs. for sid in range(self.overlay.get_stamps_per_batch()): - has_pm_reload = self.work_dir.stamps[sid].pm_reload_en - for elf_name, flist in self.work_dir.stamps[sid].aie_functions.items(): - LOGGER.verbose_print(f"Initializing layers for stamp {sid} ELF: {elf_name}") - elf_id = elf_name.split("reloadable")[-1] - for f, l in itertools.product(flist, self.layers): - if sid > len(l.stamps) - 1: - continue - if _strip_template(l.stamps[sid].name.lower()) == _strip_template(f.name.lower()): - stamp = l.stamps[sid] - if l.lcp.is_tg and stamp.elf_name == elf_id: - stamp.start_pc = f.start_pc - if f.name.lower() not in skip_end_pc_kernels: - stamp.end_pc = f.final_lock_release_pc - continue - # Check if this layer is present in the elf - # In buffer_info the flexml_ids might not be in order of stamps - if has_pm_reload and not any(i in self.work_dir.stamps[sid].elf_flxmlid_maps[elf_id] for i in l.flexml_ids): - continue - LOGGER.verbose_print("Layer found:", l.layer_order, stamp.name) - stamp.elf_name = elf_id - stamp.start_pc = f.start_pc - if f.name.lower() not in skip_end_pc_kernels: - stamp.end_pc = 
f.final_lock_release_pc + aiec_info = self.work_dir.stamp(sid) + # Index functions by elf_id and stripped name for direct lookup. + funcs_by_elf = { + elf_name.split("reloadable")[-1]: + {_strip_template(f.name.lower()): f for f in flist} + for elf_name, flist in aiec_info.aie_functions.items() + } + for layer in self.layers: + if sid >= len(layer.stamps): + continue + stamp = layer.stamps[sid] + key = _strip_template(stamp.name.lower()) + # Pick the ELF this layer's kernel comes from. + if layer.lcp.is_tg: + # TG layers already carry their elf_name. + elf_id = stamp.elf_name + elif aiec_info.pm_reload_en: + # In buffer_info the flexml_ids might not be in order of stamps, so + # match on flexml-id membership and name within the same ELF. + elf_id = next((e for e, fns in funcs_by_elf.items() + if key in fns + and any(i in aiec_info.elf_flxmlid_maps[e] for i in layer.flexml_ids)), + None) + else: + elf_id = next((e for e, fns in funcs_by_elf.items() if key in fns), None) + + f = funcs_by_elf.get(elf_id, {}).get(key) if elf_id is not None else None + if f is None: + continue + LOGGER.verbose_print("Layer found:", layer.layer_order, stamp.name) + if not layer.lcp.is_tg: + stamp.elf_name = elf_id + stamp.start_pc = f.start_pc + if f.name.lower() not in skip_end_pc_kernels: + stamp.end_pc = f.final_lock_release_pc # Under right conditions, we don't even go through iterations + # This is optional enhancement for stability. 
if args.run_flags.skip_iter: for idx, layer in enumerate(self.layers): if idx >= len(self.layers) - 1: From c72da269cf255da8a4b72677ed937864e3d8a19b Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 17:01:42 -0600 Subject: [PATCH 15/17] try to fix pm load Signed-off-by: anurag --- CLAUDE.md | 5 ++-- src/mldebug/batch_runner.py | 9 ++++++- src/mldebug/layer_info.py | 51 +++++++++++++++++++++---------------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1d1b0eb..b8c3298 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -320,8 +320,9 @@ the rest of the system uses. be less than the overlay's S - higher-indexed stamps skip this layer). Batches share kernel/PC metadata, so callers translate a flat replica id with `layer.get_stamp(sid)` (returns - `stamps[sid % stamps_per_batch]`) or - `layer.get_stamps_for_all_batches()` for the expanded `B*S` view. + `stamps[sid % S]`, the per-batch stamp index) and gate participation + with `layer.runs_replica(sid)` (false when this layer's + `stamps_per_batch` is smaller, e.g. TG layers). - A `Buffer` is the user-level concept (one IFM, one OFM, one weight set). Internally it holds an `L1Buffer` (ping/pong) and a list of `L2Buffer` chunks. Buffers larger than the memory-tile size are diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 1ae117a..b816c54 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -138,12 +138,19 @@ def check_pm_reload(self, stamp_id=0): if not self.design_info.work_dir.stamp(stamp_id).pm_reload_en or self.state.current_layer + 1 >= len(self.state.layers): return False + # This replica must actually run the current layer. Higher-indexed per-batch + # stamps skip layers whose stamps_per_batch is smaller (e.g. TG layers run + # only the leftmost stamp of each batch), so there is no current ELF to + # compare against and no reload to schedule here. 
+ if not layer.runs_replica(stamp_id): + return False + if self.design_info.overlay.is_leftmost_in_batch(stamp_id): next_layer = self.state.layers[self.state.current_layer + 1] else: next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1) - if next_layer is None: + if next_layer is None or not next_layer.runs_replica(stamp_id): return False cur_stamp = layer.get_stamp(stamp_id) next_stamp = next_layer.get_stamp(stamp_id) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index b88125a..1b12038 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -254,6 +254,10 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor self.pm_work_dir = info.get("pm", None) self.is_unsupported = False self.num_batches = num_batches + # Global stamps-per-batch (S from BxSxCxR), captured before the per-layer + # reduction below. Flat replica ids map to a per-batch stamp via `sid % S`, + # so this is the correct modulus even when this layer runs fewer stamps. + self.overlay_stamps_per_batch = num_stamps self.lcp.is_tg = "templated_graph" in info kname = [i.lower() for i in info["kernel_name"]][0] @@ -268,7 +272,7 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor # Per-batch stamp count for this layer. Batches share the same stamp # metadata; the per-batch stamp list is mirrored across batches at call - # sites via get_stamp / get_stamps_for_all_batches. + # sites via get_stamp. self.stamps_per_batch = num_stamps self.stamps = [Stamp(name=kname) for _ in range(num_stamps)] @@ -301,22 +305,19 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor self._initialize_iters(info, version) LOGGER.verbose_print(f"{self.layer_order}: {kname} {self.lcp.num_iter}") - def get_stamp(self, sid): + def runs_replica(self, sid): """ - Return the per-batch stamp metadata for a flat replica id. 
- - Batches are data-parallel copies that share the same kernel/PCs, so the - canonical per-batch stamp list (length `stamps_per_batch`) is indexed by - `sid % stamps_per_batch`. + True if flat replica `sid` runs this layer: its per-batch index `sid % S` + is below this layer's (possibly reduced) `stamps_per_batch`. """ - return self.stamps[int(sid % self.stamps_per_batch)] + return (sid % self.overlay_stamps_per_batch) < self.stamps_per_batch - def get_stamps_for_all_batches(self): + def get_stamp(self, sid): """ - Return a list of stamps expanded across all batches (length B * S). Used - by callers that want to iterate over flat replica ids. + Per-batch stamp metadata for flat replica `sid`, indexed by `sid % S`. + Caller must ensure participation (see `runs_replica`). """ - return self.stamps * self.num_batches + return self.stamps[int(sid % self.overlay_stamps_per_batch)] def _initialize_flexml_ids(self, info): """ @@ -585,18 +586,24 @@ def _create_info(self): info = {} for n in range(len(self.overlay.get_stampids())): info[n] = {} + s_per_batch = self.overlay.get_stamps_per_batch() for layer in self.layers: order = layer.layer_order - for sid, stamp in enumerate(layer.get_stamps_for_all_batches()): - imap = info[sid] - elf = stamp.elf_name - if not elf: - continue - if elf not in imap: - imap[elf] = [order, order] - else: - imap[elf][0] = min(imap[elf][0], order) - imap[elf][1] = max(imap[elf][1], order) + # Map each per-batch stamp to its flat replica id in every batch: + # sid = b * S + s. Reduced layers (s < stamps_per_batch) simply omit + # the higher per-batch stamps, mirroring participation across batches. 
+ for b in range(self.overlay.get_batch_count()): + for s, stamp in enumerate(layer.stamps): + sid = b * s_per_batch + s + imap = info[sid] + elf = stamp.elf_name + if not elf: + continue + if elf not in imap: + imap[elf] = [order, order] + else: + imap[elf][0] = min(imap[elf][0], order) + imap[elf][1] = max(imap[elf][1], order) return info def print_info(self): From 8132a8030a0e1957a7ec4b1ac3f76c6aec539d27 Mon Sep 17 00:00:00 2001 From: anurag Date: Mon, 1 Jun 2026 17:21:12 -0600 Subject: [PATCH 16/17] try to fix pm load2 Signed-off-by: anurag --- src/mldebug/batch_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index b816c54..c37e787 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -473,8 +473,12 @@ def execute_and_dump(self): f" stamps: {len(layer.stamps)}, iters {layer.lcp.num_iter}") self.schedule_layer_start(layer) self.run_layer(layer) - for sid, _ in enumerate(self.state.pm_reload): - self.state.pm_reload[sid] = self.check_pm_reload(sid) + # Only recompute reload state for replicas that run THIS layer. Stamps + # skipping it (e.g. across TG layers) keep the early-armed combo state + # they were scheduled with, instead of being clobbered here. 
+ for sid in range(len(self.state.pm_reload)): + if layer.runs_replica(sid): + self.state.pm_reload[sid] = self.check_pm_reload(sid) for sid in overlay.get_stampids(): self.aie_utls[sid].initialize_stamp() From ec03ad0084b73b76217dc95a8b754c2325eae2ac Mon Sep 17 00:00:00 2001 From: anurag Date: Tue, 2 Jun 2026 10:09:27 -0600 Subject: [PATCH 17/17] clean comments Signed-off-by: anurag --- CLAUDE.md | 3 ++- src/mldebug/batch_runner.py | 26 +++++++------------------- src/mldebug/layer_info.py | 9 ++------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b8c3298..1163064 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -580,7 +580,8 @@ Before implementing: - No "flexibility" or "configurability" that wasn't requested. - No error handling for impossible scenarios. - If you write 200 lines and it could be 50, rewrite it. -- Keep docstrings short - ideally 3-4 lines. +- Keep docstrings short - ideally 1-2 lines. If possible, add small one line comments interspersed + throughout rather than big chunks of comments. Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index c37e787..2bba4e1 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -122,36 +122,26 @@ def _set_layer_breakpoint(self, layer, skip_end_pc, sid, pm_reload_expected): def check_pm_reload(self, stamp_id=0): """ Check if the next ELF will be loaded (PM Reload) for the given replica. - - For each batch, the leftmost replica (per-batch stamp index 0) always - participates in every layer, so we look at `current_layer + 1` directly. - Other replicas may skip layers, so we walk forward to the next layer - they actually run via `get_next_layer_for_stamp`. - Args: stamp_id: Replica id to check for reload (default 0). - Returns: True if program memory reload will occur at the next layer, False otherwise. 
""" layer = self.state.layers[self.state.current_layer] + # PM Load is not enabled for this stamp or this is last layer if not self.design_info.work_dir.stamp(stamp_id).pm_reload_en or self.state.current_layer + 1 >= len(self.state.layers): return False - - # This replica must actually run the current layer. Higher-indexed per-batch - # stamps skip layers whose stamps_per_batch is smaller (e.g. TG layers run - # only the leftmost stamp of each batch), so there is no current ELF to - # compare against and no reload to schedule here. + # Stamp id doesn't run for this layer if not layer.runs_replica(stamp_id): return False - + # Find next layer that runs this stamp if self.design_info.overlay.is_leftmost_in_batch(stamp_id): next_layer = self.state.layers[self.state.current_layer + 1] else: next_layer = self.state.get_next_layer_for_stamp(stamp_id, idx=1) - if next_layer is None or not next_layer.runs_replica(stamp_id): return False + cur_stamp = layer.get_stamp(stamp_id) next_stamp = next_layer.get_stamp(stamp_id) return cur_stamp.elf_name != next_stamp.elf_name @@ -462,7 +452,6 @@ def run_layer(self, layer, target_itr=None, cur_it=None): def execute_and_dump(self): """ Execute all layers in batch mode, dumping buffers as required. - Primary entry point for batch mode execution in MLDebugger. """ self.common_init() @@ -473,10 +462,9 @@ def execute_and_dump(self): f" stamps: {len(layer.stamps)}, iters {layer.lcp.num_iter}") self.schedule_layer_start(layer) self.run_layer(layer) - # Only recompute reload state for replicas that run THIS layer. Stamps - # skipping it (e.g. across TG layers) keep the early-armed combo state - # they were scheduled with, instead of being clobbered here. 
- for sid in range(len(self.state.pm_reload)): + + # Only recompute reload state for replicas that run THIS layer + for sid, _ in enumerate(self.state.pm_reload): if layer.runs_replica(sid): self.state.pm_reload[sid] = self.check_pm_reload(sid) diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 1b12038..11e9777 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -270,10 +270,6 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor if n_stamps and n_stamps < num_stamps: num_stamps = n_stamps - # Per-batch stamp count for this layer. Batches share the same stamp - # metadata; the per-batch stamp list is mirrored across batches at call - # sites via get_stamp. - self.stamps_per_batch = num_stamps self.stamps = [Stamp(name=kname) for _ in range(num_stamps)] # 1. Layers without any kernel should be skipped @@ -307,10 +303,9 @@ def __init__(self, info, size_shift, version, aie_iface, num_stamps, mladf_repor def runs_replica(self, sid): """ - True if flat replica `sid` runs this layer: its per-batch index `sid % S` - is below this layer's (possibly reduced) `stamps_per_batch`. + A layer can run less no of stamps than maximum. """ - return (sid % self.overlay_stamps_per_batch) < self.stamps_per_batch + return (sid % self.overlay_stamps_per_batch) < len(self.stamps) def get_stamp(self, sid): """