diff --git a/ext/run_tests.py b/ext/run_tests.py index 2249ad3..5c223db 100644 --- a/ext/run_tests.py +++ b/ext/run_tests.py @@ -29,7 +29,7 @@ "PEANO_BATCH": f"{CMD} -a ext/tests/peano -b ext/tests/peano/buffer_info.json -f l2_ifm_dump --peano", "PEANO_L2_DUMP": f"{CMD} -a ext/tests/peano -b ext/tests/peano/buffer_info.json -f l2_ifm_dump --peano -e 15", "WTS_ITER_FLAGS": f"{CMD} -a ext/tests/wts_iter -b ext/tests/wts_iter/buffer_info.json" - " -e 2 -f layer_status text_dump l1_ofm_dump", + " -e 2 -f layer_status text_dump", "VAIML": f"{CMD} -v ext/tests/vaiml -f skip_dump", # "X2": f"{CMD} -a ext/tests/x2 -b ext/tests/x2/buffer_info.json -f skip_dump", } diff --git a/src/mldebug/aie_util.py b/src/mldebug/aie_util.py index 119c613..9a78257 100644 --- a/src/mldebug/aie_util.py +++ b/src/mldebug/aie_util.py @@ -9,6 +9,7 @@ from mldebug.utils import LOGGER + class AIEUtil: """ AIE Utility class @@ -162,6 +163,7 @@ def skip_iterations(self, count, sid): start_time = time.time() perf_cntr_1 = reg_map["PERF_CNTR_1"] while True: + time.sleep(0.1) values = self.read_aie_regs(perf_cntr_1) if all(v == count for v in values.values()): break @@ -171,13 +173,42 @@ def skip_iterations(self, count, sid): f"Design might be hung. 
Values={values}" ) return False - time.sleep(0.1) # Step6: Reset debug control to stop at program counter event pc_event = self._get_eventid("PC_0_CORE") write(reg_map["DEBUG_CONTROL1"], pc_event << 16) return True + def skip_iterations_to_lock_acq(self, lock_acq_pc, count, sid): + """ + Skip iterations without using counter + """ + if self._is_test_mode() or count == 0: + return True + + self.impl.set_pc_breakpoint(lock_acq_pc) + self.impl.continue_aie() + timeout = 10 + start_time = time.time() + while time.time() - start_time < timeout: + time.sleep(0.1) + if self.impl.poll_core_status(): + break + + pcs = self.impl.read_core_pc(True) + is_valid = self.pcs_match_target(pcs, lock_acq_pc) + if not is_valid: + LOGGER.log( + f"{sid}: Invalid result in skip_iterations_to_lock_acq. " + f"target_pc={lock_acq_pc} pcs={pcs} " + ) + #else: + # LOGGER.log( + # f"{sid}: Successfully skipped to lock acq pc. " + # f"target_pc={lock_acq_pc} pcs={pcs} " + # ) + return is_valid + def read_performance_counters(self, c, r): """ Read and display the values and configuration registers of all performance counters @@ -398,6 +429,25 @@ def read_core_pc(self): """ return self.read_aie_regs(self.aie_iface.Core_registers["CORE_PC"]) + def read_core_pc_dict(self): + """ + Read the core program counter from all AIE tiles + """ + return self.read_aie_regs(self.aie_iface.Core_registers["CORE_PC"]) + + def read_core_pc_tile(self, c, r): + """ + Read the core program counter from a single AIE tile + """ + return self.impl.read_register(c, r, self.aie_iface.Core_registers["CORE_PC"]) + + def single_step_core(self, c, r): + """ + Single step an aie core + """ + offset = self.aie_iface.Core_registers["DEBUG_CONTROL0"] + self.impl.write_register(c, r, offset, (1<<2)) + def disable_ecc_event(self): """ Disable ECC Event for this stamp @@ -406,3 +456,37 @@ def disable_ecc_event(self): return for c, r in self._filter_tiles(self.aie_iface.AIE_TILE_T): self.impl.write_register(c, r, 
self.aie_iface.Core_registers["ECC_SCRUB_EVENT"], 0) + + def pcs_match_target(self, pcs, target_pc, allow_combo_delay=False): + """ + PC matching utility + """ + # AIE PC can lag the breakpoint by 1-2 cycles; combo events add more delay. + # 8 cycles is a safe margin for most cases + num_pipeline_stages = 5 + max_pc_tolerance = 32 + + delay_allowed = max_pc_tolerance if allow_combo_delay else 1 + pc_matches = all(abs(pc - target_pc) < delay_allowed for pc in pcs) + if not pc_matches: + # some tiles aren't halted + if not self.impl.poll_core_status(): + return False + pc_dict = self.read_core_pc_dict() + for tile, val in pc_dict.items(): + if target_pc == val: + continue + #print(f"Try to reconcile tile {tile} {val}") + col, row = tile + for _ in range(num_pipeline_stages): + self.single_step_core(col, row) + newpc = self.read_core_pc_tile(col, row) + delta = newpc - target_pc + if target_pc == newpc or max_pc_tolerance > delta > 0 : + break + # if core pc is slightly ahead, we should be okay + # but if not, execution can run into trouble later + if target_pc > self.read_core_pc_tile(col, row): + return False + #print("Successfully reconciled") + return True diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 07dde77..8463e92 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -19,9 +19,6 @@ from mldebug.utils import LOGGER, cleanup_and_exit, timeit -# 16 byte pm, we assume 2 clock cycle delay -COMBO_EVENT_MAX_DELAY_CYCLES = 32 - class BatchRunner: """ @@ -265,9 +262,7 @@ def schedule_layer_start(self, next_layer): pcs = self.impls[sid].read_core_pc(True) # combo event trigger has one cycle delay - is_correct_pc = all(stamp.start_pc == pc for pc in pcs) - if not is_correct_pc and pml: - is_correct_pc = all(pc - stamp.start_pc < COMBO_EVENT_MAX_DELAY_CYCLES for pc in pcs) + is_correct_pc = utl.pcs_match_target(pcs, stamp.start_pc, allow_combo_delay=pml) if is_correct_pc: self._process_start_breakpoint(next_layer, 1, 
sid=sid) @@ -400,21 +395,26 @@ def _run_stamp(self, layer, sid, target_itr, cur_it=1): True on success, False on error. """ stamp = layer.stamps[sid] + utl = self.aie_utls[sid] + skip_end_pc = not (self.args.run_flags.l1_ofm_dump and stamp.end_pc) if not target_itr: target_itr = layer.lcp.num_iter if self.args.run_flags.skip_iter: - self.state.error = not self.aie_utls[sid].skip_iterations(target_itr - cur_it, sid) + self.state.error = not utl.skip_iterations(target_itr - cur_it, sid) + elif self.args.run_flags.skip_iter2: + self.state.error = not utl.skip_iterations_to_lock_acq( + self.design_info.work_dir.post_layer_lock_acq_pcs[sid], target_itr - cur_it, sid) else: while cur_it < target_itr: self.hit_next_breakpoint(sid) all_pc = self.impls[sid].read_core_pc(True) - if all(stamp.start_pc == pc for pc in all_pc): + if utl.pcs_match_target(all_pc, stamp.start_pc): if cur_it % layer.lcp.depth_iter != 0 or skip_end_pc: cur_it += 1 self._process_start_breakpoint(layer, cur_it, sid=sid) - elif all(stamp.end_pc == pc for pc in all_pc): + elif utl.pcs_match_target(all_pc, stamp.end_pc): cur_it += 1 self._process_end_breakpoint(layer, cur_it, sid) else: diff --git a/src/mldebug/input_parser.py b/src/mldebug/input_parser.py index 93e64d3..6db5d64 100644 --- a/src/mldebug/input_parser.py +++ b/src/mldebug/input_parser.py @@ -35,6 +35,7 @@ class RunFlags: l2_ifm_dump: bool text_dump: bool skip_iter: bool + skip_iter2: bool # Test Flags mock_hang: bool dump_temps: bool @@ -121,6 +122,7 @@ def get_flag(s, default=False): get_flag("l2_ifm_dump"), get_flag("text_dump"), get_flag("skip_iter"), + get_flag("skip_iter2"), get_flag("mock_hang"), get_flag("dump_temps"), get_flag("multistamp"), diff --git a/src/mldebug/mldebug_cli.py b/src/mldebug/mldebug_cli.py index bdbd5bb..b41ac19 100644 --- a/src/mldebug/mldebug_cli.py +++ b/src/mldebug/mldebug_cli.py @@ -332,7 +332,8 @@ def app(): "skip_iter", "dump_temps", "multistamp", - "disable_tg" + "disable_tg", + "skip_iter2" ], 
help="Specify one or more runtime flags:\n" "skip_dump : Do not dump memory\n" @@ -342,6 +343,7 @@ def app(): "l1_ofm_dump : Dump L1 ofm buffers in addition to others\n" "text_dump : Dump in text format\n" "skip_iter : Skip iterations in batch mode when possible\n" + "skip_iter2 : skip_iter using lcp lock.(Telluride only)\n" #"dump_temps : Write intermediate (.lst) files to disk\n" "multistamp : Enable N Stamp/Batch mode\n", #"disable_tg : Disable Step to TG layers\n", diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 31b2811..0f9bb3e 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -96,6 +96,9 @@ def __init__(self, aie_dir, peano, overlay, dump_lst=False): self.peano = peano self.aie_dir = aie_dir self.dump_lst = dump_lst + # Lock acquire instruction PC after layer execution + # This pc can be used for skip_iter + self.post_layer_lock_acq_pcs = [0] * num_stamps self._stamp_lst_map = {} for sid in range(num_stamps): @@ -103,6 +106,13 @@ def __init__(self, aie_dir, peano, overlay, dump_lst=False): self._initialize_functions(aie_dir, overlay) + def _check_for_lock_acq(self, line, sid, llvm): + """ + find lock acq in base lst + """ + if "acq" in line.lower(): + self.post_layer_lock_acq_pcs[sid] = self._get_pc(line, llvm) + def _demangle(self, fstring): """ Demangle a C++ mangled function name using c++filt. 
@@ -315,6 +325,8 @@ def _parse_lst_chess(self, elf, stampid): if not Path(lst_file).is_file(): return False + is_base = "reloadable" not in elf_name + self.aie_functions[stampid][elf_name] = [] with open(lst_file, encoding="utf-8") as fd: lines = fd.read().split("\n") @@ -339,6 +351,9 @@ def _parse_lst_chess(self, elf, stampid): while i < count: line = lines[i] pc_val = self._get_pc(line) + if demangled == "_main" and is_base: + # Find LCP Lock Acquire (Last lock acquire in base lst) + self._check_for_lock_acq(lines[i], stampid, False) if pc_val: last_valid_pc = pc_val if "REL" in line and self._breakpoint_allowed(lines, i): @@ -511,6 +526,8 @@ def _parse_lst_llvm(self, elf, stampid): self._stamp_lst_map[stampid].append((elf_name, self._get_lst(elf_path, elf_name))) lines = data.split("\n") + is_base = "reloadable" not in elf_name + self.aie_functions[stampid][elf_name] = [] flist = self.aie_functions[stampid][elf_name] in_func = None @@ -545,6 +562,9 @@ def _parse_lst_llvm(self, elf, stampid): if not in_func: continue in_func.final_lock_release_pc = self._get_pc(line, llvm=True) + # lock acq + elif is_base and in_func and in_func.name == "main": + self._check_for_lock_acq(line, stampid, True) def find_functions_by_pc(self, pc): """