From 0d4ae7961fd29d86369f5fb088f6804195e68441 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Sep 2025 05:12:05 +0000 Subject: [PATCH 001/194] [Frontend] Use ops instead of raw assembly code --- .../mlir/mlir_codegen_backend.py | 522 ++++++++---------- PyTorchSimFrontend/mlir/mlir_common.py | 81 +-- PyTorchSimFrontend/mlir/mlir_template.py | 145 ++--- Simulator/simulator.py | 2 +- 4 files changed, 351 insertions(+), 399 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 6650f429..d4c2fdd6 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -36,19 +36,9 @@ def reduction_init(reduction_type, dtype): if reduction_type == "prod": return float(1) if dtype.is_floating_point else int(1) if reduction_type in {"max", "argmax"}: - if dtype == torch.float32: - return f"0x{mlir_common.MLIR_INF['-inf']['f32']:x}" - elif dtype == torch.float64: - return f"0x{mlir_common.MLIR_INF['-inf']['f64']:x}" - else: - return "0.0" + return "-inf" if reduction_type in {"min", "argmin"}: - if dtype == torch.float32: - return f"0x{mlir_common.MLIR_INF['inf']['f32']:x}" - elif dtype == torch.float64: - return f"0x{mlir_common.MLIR_INF['inf']['f64']:x}" - else: - return "0.0" + return "inf" if reduction_type in {"welford_reduce"}: return f"0.0" raise AssertionError(reduction_type) @@ -221,9 +211,9 @@ class ExtensionOverrides(common.OpOverrides): def custom_cast(operand, target_type, *args, var_info=None, **kwargs): dtype = var_info[operand][1] if dtype == "index": - ret = ops.index_cast(operand, target_type, var_info=var_info) + ret = ops.index_cast(operand, target_type) else: - ret = ops.to_dtype(operand, target_type, var_info=var_info) + ret = ops.to_dtype(operand, target_type) return ret, var_info[ret] @staticmethod @@ -238,26 +228,26 @@ def binary_elementwise_common(operand1, operand2, var_info): lhs_tile_size, lhs_dtype = op_type1 
rhs_tile_size, rhs_dtype = op_type2 if lhs_tile_size > rhs_tile_size: - operand2 = ops.broadcast(operand2, operand1, var_info=var_info) + operand2 = ops.broadcast(operand2, lhs_tile_size) op_type2 = var_info[operand2] elif lhs_tile_size < rhs_tile_size: - operand1 = ops.broadcast(operand1, operand2, var_info=var_info) + operand1 = ops.broadcast(operand1, rhs_tile_size) op_type1 = var_info[operand1] # Data type check if op_type1[1] != op_type2[1]: if op_type1[1] == "index" or op_type1 == "index": if op_type1[1] == "index": - operand1 = ops.index_cast(operand1, op_type2[1], var_info) + operand1 = ops.index_cast(operand1, op_type2[1]) op_type1 = var_info[operand1] if op_type2[1] == "index": - operand2 = ops.index_cast(operand2, op_type1[1], var_info) + operand2 = ops.index_cast(operand2, op_type1[1]) op_type2 = var_info[operand2] elif op_type1[1][0] == "i" and op_type2[1][0] == "f": - operand1 = ops.to_dtype(operand1, op_type2[1], var_info) + operand1 = ops.to_dtype(operand1, op_type2[1]) op_type1 = var_info[operand1] elif op_type1[1][0] == "f" and op_type2[1][0] == "i": - operand2 = ops.to_dtype(operand2, op_type1[1], var_info) + operand2 = ops.to_dtype(operand2, op_type1[1]) op_type2 = var_info[operand2] elif op_type1[1][0] == op_type2[1][0]: if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]: @@ -332,7 +322,7 @@ def minimum(operand1, operand2, *args, var_info=None, **kwargs): if ret_type[0] == "f": opcode = f'arith.minimumf' else: - opcode = f'arith.minimumui' + opcode = f'arith.minui' return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod @@ -342,14 +332,14 @@ def maximum(operand1, operand2, *args, var_info=None, **kwargs): if ret_type[0] == "f": opcode = f'arith.maximumf' else: - opcode = f'arith.maximumui' + opcode = f'arith.maxui' return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): 
src_mlir_dtype = var_info[operand][1] if src_mlir_dtype == "index": - operand = ops.index_cast(operand, "i64", var_info=var_info) + operand = ops.index_cast(operand, "i64") src_mlir_dtype = var_info[operand][1] tile_size = var_info[operand][0] @@ -368,7 +358,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] elif dst_bits < src_bits: return f"arith.trunc %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - return f"arith.maximumi %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype] + return f"arith.maxui %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype] elif dst_mlir_dtype[0] == "f": if dst_bits > src_bits: return f"arith.extf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] @@ -389,7 +379,7 @@ def constant(value, src_type, *args, var_info=None, **kwargs): elif "e" in str(value): value = format(float(value), ".20f") elif src_type[0] == "f": - value = format(value, ".20f") + value = format(float(value), ".20f") elif src_type[0] == "i": value = int(value) return f'arith.constant {value} : {src_type}', [1, src_type] @@ -412,9 +402,7 @@ def exp(operand, *args, var_info=None, **kwargs): # Check scalar op_type = var_info[operand] if op_type[0] == 1: - val = ops.constant(0, op_type[1]) - var_info[val][0] = 4 - operand = ops.broadcast(operand, val) + operand = ops.broadcast(operand, 4) val = ops.exp(operand) result = ops.extractelement(val, 0) return result, var_info[result] @@ -440,9 +428,7 @@ def erf(operand, *args, var_info=None, **kwargs): # Check scalar op_type = var_info[operand] if op_type[0] == 1: - val = ops.constant(0, op_type[1]) - var_info[val][0] = 4 - operand = ops.broadcast(operand, val) + operand = ops.broadcast(operand, 4) val = ops.erf(operand) result = ops.extractelement(val, 0) return result, var_info[result] @@ -459,9 +445,7 @@ def tanh(operand, *args, var_info=None, **kwargs): # Check 
scalar op_type = var_info[operand] if op_type[0] == 1: - val = ops.constant(0, op_type[1]) - var_info[val][0] = 4 - operand = ops.broadcast(operand, val) + operand = ops.broadcast(operand, 4) val = ops.tanh(operand) result = ops.extractelement(val, 0) return result, var_info[result] @@ -471,7 +455,7 @@ def tanh(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.tanh %{operand} : {shape}', [tile_size, dtype] @@ -483,9 +467,7 @@ def sin(operand, *args, var_info=None, **kwargs): # Check scalar op_type = var_info[operand] if op_type[0] == 1: - val = ops.constant(0, op_type[1]) - var_info[val][0] = 4 - operand = ops.broadcast(operand, val) + operand = ops.broadcast(operand, 4) val = ops.sin(operand) result = ops.extractelement(val, 0) return result, var_info[result] @@ -495,7 +477,7 @@ def sin(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.sin %{operand} : {shape}', [tile_size, dtype] @@ -507,9 +489,7 @@ def cos(operand, *args, var_info=None, **kwargs): # Check scalar op_type = var_info[operand] if op_type[0] == 1: - val = ops.constant(0, op_type[1]) - var_info[val][0] = 4 - operand = ops.broadcast(operand, val) + operand = ops.broadcast(operand, 4) val = ops.cos(operand) result = ops.extractelement(val, 0) return result, var_info[result] @@ -519,7 +499,7 @@ def cos(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, 
"f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.cos %{operand} : {shape}', [tile_size, dtype] @@ -532,7 +512,7 @@ def sqrt(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype @@ -546,7 +526,7 @@ def rsqrt(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype @@ -557,12 +537,12 @@ def pow(operand1, operand2, *args, var_info=None, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) # Type check & auto cast if ret_type[0] != "f": - operand1, ret_type = ops.to_dtype(operand1, "f32", var_info=var_info) + operand1, ret_type = ops.to_dtype(operand1, "f32") var_info[operand1] = ret_type # Type check & auto cast if ret_type[0] != "f": - operand2, ret_type = ops.to_dtype(operand2, "f32", var_info=var_info) + operand2, ret_type = ops.to_dtype(operand2, "f32") var_info[operand2] = ret_type shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type @@ -576,7 +556,7 @@ def log(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype @@ -590,7 +570,7 @@ def reciprocal(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) 
+ operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype] @@ -615,7 +595,7 @@ def neg(operand, *args, var_info=None, **kwargs): # Type check & auto cast if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info) + operand, dtype = ops.to_dtype(operand, "f32") var_info[operand] = dtype shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype @@ -718,12 +698,12 @@ def and_(operand1, operand2, *args, var_info=None, **kwargs): # Type check & auto cast if op_type1[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand1] = dtype # Type check & auto cast if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand2] = dtype ret_type = op_type1[1] @@ -739,12 +719,12 @@ def or_(operand1, operand2, *args, var_info=None, **kwargs): # Type check & auto cast if op_type1[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand1] = dtype # Type check & auto cast if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand2] = dtype ret_type = op_type1[1] @@ -760,12 +740,12 @@ def xor(operand1, operand2, *args, var_info=None, **kwargs): # Type check & auto cast if op_type1[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand1] = dtype # Type check & auto cast if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info) + operand1, dtype = ops.to_dtype(operand1, "i32") var_info[operand2] = dtype ret_type = op_type1[1] @@ -791,7 +771,7 @@ def 
logical_not(operand, *args, var_info=None, **kwargs): tile_size = op_type[0] shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type const_one = ops.constant(0, ret_type) - const_one = ops.broadcast(const_one, operand, var_info=var_info) + const_one = ops.broadcast(const_one, tile_size) ret = ops.eq(operand,const_one) return ret, [tile_size, var_info[ret]] @@ -831,17 +811,22 @@ def sigmoid(operand, *args, var_info=None, **kwargs): def where(condition, operand1, operand2, *args, var_info=None, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) cond_type = var_info[condition] + operand_type = var_info[operand1] if cond_type[0] < tile_size: - condition = ops.broadcast(condition, operand1, var_info=var_info) + condition = ops.broadcast(condition, operand_type[0]) elif cond_type[0] > tile_size: - operand1 = ops.broadcast(operand1, condition, var_info=var_info) - operand2 = ops.broadcast(operand2, condition, var_info=var_info) + operand1 = ops.broadcast(operand1, operand_type[0]) + operand2 = ops.broadcast(operand2, operand_type[0]) tile_size, ret_type = var_info[operand1] shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else "" return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type] + @staticmethod + def step(size, dtype, *args, **kwargs): + index_shape = f"vector<{size}x{dtype}>" + return f"vector.step : {index_shape}", [size, dtype] @staticmethod def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs): @@ -858,32 +843,77 @@ def index_cast(operand, target_type, *args, var_info=None, **kwrags): return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type] @staticmethod - def broadcast_unflat(operand1, operand2, *args, var_info=None, **kwargs): + def 
broadcast_unflat(operand1, target_size, *args, var_info=None, **kwargs): op_type1 = var_info[operand1] - op_type2 = var_info[operand2] src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1] - des_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only + des_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}" - return expand, [op_type2[0], op_type1[1]] + return expand, [target_size, op_type1[1]] @staticmethod - def broadcast(operand1, operand2, *args, var_info=None, **kwargs): + def broadcast(operand1, target_size, *args, var_info=None, **kwargs): op_type1 = var_info[operand1] - op_type2 = var_info[operand2] src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1] - des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only + des_shape = f"vector<{target_size}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only # Special case for length 2 vector. We used this vector to avoid scalar operations... 
- if op_type1[0] != 1 and op_type2[0] % op_type1[0] == 0: - unflat_operand = ops.broadcast_unflat(operand1, operand2) - unflat_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>" + if op_type1[0] != 1 and target_size % op_type1[0] == 0: + unflat_operand = ops.broadcast_unflat(operand1, target_size) + unflat_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>" expand = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {des_shape}" elif op_type1[0] == 1: expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}" else: raise NotImplementedError("Not supporting broadcast type...") - return expand, [op_type2[0], op_type1[1]] + return expand, [target_size, op_type1[1]] + + @staticmethod + def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs): + operand_type = var_info[operand] + return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type + + @staticmethod + def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs): + if red_size == 1: + final_reduced_shape = f"{type_name}" + line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape) + else: + final_reduced_shape = f"vector<{red_size}x{type_name}>" + new_vshape= f"vector<{vec_size//red_size}x{red_size}x{type_name}>" + value = ops.shape_cast(acc, red_shape, new_vshape) + line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape) + return line, [red_size, type_name] + + @staticmethod + def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs): + if compute_vec_size == 1: + vshape = f"{mlir_dtype}" + operation = "affine.load" + line = f"{operation} %{buffer}[{indices}] : {buffer_shape}" + else: + vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" + operation = "affine.vector_load" + line = f"{operation} %{buffer}[{indices}] : 
{buffer_shape}, {vshape}" + return line, [compute_vec_size, mlir_dtype] + + @staticmethod + def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs): + compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1] + + if compute_vec_size == 1: + vshape = f"{mlir_dtype}" + operation = "affine.store" + line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}" + else: + vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" + operation = "affine.vector_store" + line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}" + + if buffer_name is not None: + return common.DeferredLine(buffer_name, line), [None, None] + else: + return line, [None, None] RTYPE_TO_MLIR = { "sum": "add", @@ -1031,7 +1061,6 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) -> common.CSEVariable: if buffer is None: buffer = self.applys - zero_var = self.get_const_cse(0) expr_list = [arg for arg in expr_list] dim_list = [f"d{i}" for i in range(len(expr_list))] @@ -1102,6 +1131,7 @@ def load(self, name: str, index: sympy.Expr): # Define scratch pad buffer sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index) + compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"]) # MVIN Encoding attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}" @@ -1110,24 +1140,15 @@ def load(self, name: str, index: sympy.Expr): self.cse.generate(dma_buffer, code, assignment = False) # FIXME: assignment = False does not support caching if not comptute_depedency: - compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"]) # Generate vector load instruction - if compute_vec_size > 1: - operation = "affine.vector_load" - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, 
{vshape}" - else: - operation = "affine.load" - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}" - - out = self.cse.generate(load_buffer, line) - self.register_var_info(out, [compute_vec_size, mlir_dtype]) - self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] - return out + with self.override_buffer_cse(buffer=load_buffer): + out = ops._load(compute_vec_size, mlir_dtype, sram_var, compute_index_var, tile_shape) else: + # FIXME. Any good idea? out = sram_var self.register_var_info(out, [compute_vec_size, mlir_dtype]) - self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] - return out + self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] + return out def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): index = self.rename_indexing(index) @@ -1148,30 +1169,25 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype) compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size() require_store = True - if compute_vec_size < self.var_info[value][0]: - value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") - self.register_var_info(value, [compute_vec_size, mlir_dtype]) if str(value) in self.spad_buffer_dict: # Todo. 
If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily require_store = self.spad_buffer_dict[str(value)][1] != tile_size + if compute_vec_size < self.var_info[value][0]: + value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") + self.register_var_info(value, [compute_vec_size, mlir_dtype]) + if require_store: # Define scratch pad buffer sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index) compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"]) # Generate vector store instruction - store_size, operand_type = self.var_info[value] + _, operand_type = self.var_info[value] if mlir_dtype != operand_type: - value = ops.custom_cast(value, mlir_dtype, var_info=self.var_info) - - if compute_vec_size > 1 and store_size > 1: - operation = "affine.vector_store" - line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - else: - operation = "affine.store" - line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}" - self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute? 
+ value = ops.custom_cast(value, mlir_dtype) + with self.override_buffer_cse(buffer=self.stores): + ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name) else: sram_var = self.spad_buffer_dict[str(value)][0] sram_index_var = self.spad_buffer_dict[str(value)][3] @@ -1207,9 +1223,9 @@ def reduction(self, dtype, src_dtype, reduction_type, value): reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name) # Prepare reduction init - init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}") - init_vec = init if vec_len == 1 else self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}") - self.register_var_info(init_vec, [vec_len, type_name]) + with self.override_buffer_cse(cse=self.const_cse, buffer=self.const_buffer): + init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name) + init_vec = init if vec_len == 1 else ops.broadcast(init, vec_len) acc_var_list = [] iter_var_list = [] @@ -1248,95 +1264,65 @@ def reduction(self, dtype, src_dtype, reduction_type, value): self.affine_yield[acc] = reduced_shape, reduction_depth # Final reduction - acc = acc_var_list[0] # Set outermost acc var reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_reduction_numel() + acc = acc_var_list[0] # Set outermost acc var + self.register_var_info(acc, [reduction_size, type_name]) assert(vec_len % reduction_size==0) - if vec_len > reduction_size: - init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}") - if reduction_size == 1: - final_reduced_shape = f"{type_name}" - out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape)) - else: - final_reduced_shape = f"vector<{reduction_size}x{type_name}>" - init_vec = 
self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}") - new_vshape= f"vector<{vec_len//reduction_size}x{reduction_size}x{type_name}>" - value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}") - out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, value, init_vec, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape)) - acc = out - - # reigster reduction output - var_info = [reduction_size, mlir_common.DTYPE_TO_MLIR[dtype]] - self.register_var_info(acc, var_info) + + # Prepare init value + init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name) + if reduction_size != 1: + with self.override_buffer_cse(buffer=self.reductions_suffix): + init = ops.broadcast(init, reduction_size) + + # Final reduction codegen + with self.override_buffer_cse(buffer=self.reductions_suffix): + if vec_len > reduction_size: + acc = ops.multi_reduction(acc, init, vec_len, reduction_size, reduced_shape, reduction_type, type_name) return acc def store_reduction(self, name, index, value): - # Note: Change cse temporaily # Store reduction can't share cached value stored in cse, # since it is not innermost loop body. 
- tmp_cse = self.cse - tmp_apply_cse = self.apply_cse - self.cse = self.reduction_cse - self.apply_cse = self.reduction_cse - dram_var = self.kernel_group.args.output(name) dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] index = self.rename_indexing(index) - # Tile is always reuduced in inner loop - local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix) - vlane_split_axis = local_tile_desc.vmap.vlane_split_axis - vlane_stride = local_tile_desc.vmap.vlane_stride - - dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) - tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) - tile_stride = local_tile_desc.get_tile_stride() - compute_vec_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_reduction_numel() - if compute_vec_size == 1: - vshape = f"{mlir_dtype}" - else: - vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" - sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index) - if self.welford_reduce_out is not None: - sum, sqr_sum, _ = self.welford_reduce_out - # mean - reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) - divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32") - if compute_vec_size > 1: - divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>") - else: - divider_vec = divider - mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {vshape}") - - # m2 = (E(X^2) - E(X)^2) * N - sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sqr_sum}, %{divider_vec} : {vshape}") - mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{mean}, %{mean} : {vshape}") - variance = self.cse.generate(self.reductions_suffix, f"arith.subf 
%{sqr_mean}, %{mean_sqr} : {vshape}") - m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {vshape}") - if self.current_node.node.origin_node: # FIXME: This is a temporary solution - value = mean - else: - value = m2 - - # Select src type - if compute_vec_size == 1: - operation = "affine.store" - line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}" - else: - operation = "affine.vector_store" - line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}" - self.reductions_suffix.writeline(common.DeferredLine(name, line)) + with self.override_buffer_cse(cse=self.reduction_cse): + # Tile is always reuduced in inner loop + local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix) + vlane_split_axis = local_tile_desc.vmap.vlane_split_axis + vlane_stride = local_tile_desc.vmap.vlane_stride - # MVOUT Encoding - # Generate DMA instruction - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" - code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, - dram_shape, tile_shape, attribute) - self.reductions_suffix.writeline(common.DeferredLine(name, code)) + dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) + tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) + tile_stride = local_tile_desc.get_tile_stride() - # Restore origin cse - self.cse = tmp_cse - self.apply_cse = tmp_apply_cse + sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index) + with self.override_buffer_cse(buffer=self.reductions_suffix): + if self.welford_reduce_out is not None: + # Calc var and mean + sum, sqr_sum, _ = self.welford_reduce_out + reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) + divider = self.get_const_cse(float(reduction_numel), "f32") 
+ mean = ops.div(sum, divider) + sqr_mean = ops.div(sqr_sum, divider) + mean_sqr = ops.mul(mean, mean) + variance = ops.sub(sqr_mean, mean_sqr) + m2 = ops.mul(variance, divider) + if self.current_node.node.origin_node: # FIXME: This is a temporary solution + value = mean + else: + value = m2 + # Store value to scratch pad + ops._store(value, sram_var, sram_index_var, tile_shape, buffer_name=name) + + # Generate DMA instruction + attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" + code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, + dram_shape, tile_shape, attribute) + self.reductions_suffix.writeline(common.DeferredLine(name, code)) def indirect_indexing(self, index_var, size, check=True): return str(index_var) @@ -1354,77 +1340,71 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): strides = tile_desc.get_tile_stride_per_lane() # Create vector index - compute_vec = self.cse.generate(self.compute, f"vector.broadcast %{self.compute_idx} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(compute_vec, [compute_vec_size, "index"]) + compute_vec = ops.broadcast(self.compute_idx, compute_vec_size) vector_index = ops.add(base_vector_index, compute_vec) # Create tile_dim index dim_list = [] for idx in range(len(tile_size)): - div_coeff = self.get_const_cse(strides[idx], "index") - mod_coeff = self.get_const_cse(tile_size[idx], "index") - div_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{div_coeff} : index to vector<{compute_vec_size}xindex>") - mod_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{mod_coeff} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(div_vec, [compute_vec_size, "index"]) - self.register_var_info(mod_vec, [compute_vec_size, "index"]) - dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) - if idx == 
tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset - offset = tile_desc.vmap.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride - + # Prepare initial values + offset = tile_desc.vlane_stride #* strides[idx] + outer_sz = tile_size[idx] // tile_desc.vlane_stride + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + div_coeff = self.get_const_cse(strides[idx], "index") + mod_coeff = self.get_const_cse(tile_size[idx], "index") + vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index") + vlane_outer_coeff = self.get_const_cse(outer_sz, "index") nr_vector_lane = self.get_const_cse(self.vector_lane, "index") - nr_vector_lane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{nr_vector_lane} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(nr_vector_lane_vec, [compute_vec_size, "index"]) + vlane_coeff = self.get_const_cse(0, "i64") - vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") - vlane_outer_coeff = self.get_const_cse(outer_sz, "index") - vlane_stride_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_stride_coeff} : index to vector<{compute_vec_size}xindex>") - vlane_outer_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_outer_coeff} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(vlane_stride_vec, [compute_vec_size, "index"]) - self.register_var_info(vlane_outer_vec, [compute_vec_size, "index"]) + div_vec = ops.broadcast(div_coeff, compute_vec_size) + mod_vec = ops.broadcast(mod_coeff, compute_vec_size) + nr_vector_lane_vec = ops.broadcast(nr_vector_lane, compute_vec_size) + vlane_stride_vec = ops.broadcast(vlane_stride_coeff, compute_vec_size) + vlane_outer_vec = ops.broadcast(vlane_outer_coeff, compute_vec_size) + + # Prepare vlane offset (vidx) + vlane_vec_size = 4 + vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size) + + dim 
= ops.modular(ops.div(vector_index, div_vec), mod_vec) + if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset stride_dim = ops.modular(dim, vlane_stride_vec) outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec) - dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec)) - # Prepare vlane offset (vidx) - vlane_coeff = self.get_const_cse(0, "i64") - vlane_vec_size = 4 - vlane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_coeff} : i64 to vector<{vlane_vec_size}xi64>") vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset") self.register_var_info(vlane_offset, [vlane_vec_size, "i64"]) vlane_offset = ops.index_cast(vlane_offset, "index") - self.register_var_info(vlane_offset, [vlane_vec_size, "index"]) - dim = ops.add(dim, vlane_offset) dim_list.append(dim) indices = [str(i) for i in index.free_symbols] for idx in indices: i = int(idx[5:]) - index_vec = self.cse.generate(self.compute, f"vector.broadcast %{idx} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(index_vec, [compute_vec_size, "index"]) + idx = self.itervar_cses[idx] + index_vec = ops.broadcast(idx, compute_vec_size) offset = ops.add(index_vec, dim_list[i]) dim_list[i] = offset arg_lists = [] for arg in renamed_expression.args: if isinstance(arg, sympy.Integer): - offset = self.get_const_cse(int(arg)) - offset_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{offset} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(offset_vec, [compute_vec_size, "index"]) + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + offset = self.get_const_cse(int(arg), "index") + offset_vec = ops.broadcast(offset, compute_vec_size) arg_lists.append(offset_vec) elif isinstance(arg, sympy.Mul): if isinstance(arg.args[0], sympy.Integer) and 
isinstance(arg.args[1], sympy.Symbol): - coeff = self.get_const_cse(int(arg.args[0])) - coeff_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(coeff_vec, [compute_vec_size, "index"]) + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + coeff = self.get_const_cse(int(arg.args[0]), "index") + coeff_vec = ops.broadcast(coeff, compute_vec_size) result = ops.mul(dim_list[int(str(arg.args[1])[1:])], coeff_vec) arg_lists.append(result) elif isinstance(arg.args[1], sympy.Integer) and isinstance(arg.args[0], sympy.Symbol): - coeff = self.get_const_cse(int(arg.args[1])) - coeff_vec = self.cse.generate(self.compute, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>") - self.register_var_info(coeff_vec, [compute_vec_size, "index"]) + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + coeff = self.get_const_cse(int(arg.args[1]), "index") + coeff_vec = ops.broadcast(coeff, compute_vec_size) result = ops.mul(dim_list[int(str(arg.args[0])[1:])], coeff_vec) arg_lists.append(result) else: @@ -1474,18 +1454,16 @@ def index_expr(self, index, dtype): # Initialize base vector if not self.base_vector_initialized: - init_iter = "iter" + init_iter = self.register_var_cse("init_iter", 1, "index") parallel_map = f"affine.parallel (%{init_iter}) = ({0}) to ({compute_vec_size}) {{ // Base vector initializer" self.spad_buffer.writeline(parallel_map) with self.spad_buffer.indent(): - self.spad_buffer.writeline(f"%init_vec = vector.broadcast %{init_iter} : index to vector<2xindex>") - self.spad_buffer.writeline(f"affine.vector_store %init_vec, %{sram_var}[%{init_iter}] : {tile_shape}, vector<2xindex>") + with self.override_buffer_cse(buffer=self.spad_buffer, cse=self.init_vec_cse): + init_vec = ops.broadcast(init_iter, 2) + ops._store(init_vec, sram_var, f"%{init_iter}", tile_shape) self.spad_buffer.writeline("}") 
self.base_vector_initialized = True - - line = f"affine.vector_load %{sram_var}[0] : {tile_shape}, {vshape}" - base_vector_index = self.cse.generate(self.compute, line) - self.register_var_info(base_vector_index, [compute_vec_size, "index"]) + base_vector_index = ops._load(compute_vec_size, "index", sram_var, "0", tile_shape) renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols} renamed_expression = index.subs(renamed_symbols) @@ -1744,7 +1722,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)): local_dims = total_dims # Brodatcast tile shape - index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims) + index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}") if kg_tile_desc.vmap.vlane_split_axis in local_dims: local_vlane_split_axis = local_dims.index(kg_tile_desc.vmap.vlane_split_axis) @@ -1957,14 +1935,18 @@ def get_scratchpad_buffer(self, dtype, dram_name, tile_desc, raw_index, buffer=N return sram_var, sram_index_var def get_const_cse(self, value, dtype="index") -> common.CSEVariable: + # Why not use ops.constant? 
Because there are some cases that can't use ops (e.g., def_dma_op) # Type convert - if dtype[0] == "f": + if value in ["inf", "-inf", "nan"]: + value = f"0x{mlir_common.MLIR_INF[value][dtype]:x}" + elif dtype[0] == "f": value = float(value) else: value = int(value) if value not in self.consts: self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}") + self.register_var_info(self.consts[str(value)+dtype], [1, dtype]) return self.consts[str(value)+dtype] def get_tag_cse(self, value=None, shape="memref<1xi32>"): @@ -1979,16 +1961,16 @@ def get_mask(self): if self.compute_body_loop.size % self.compute_body_loop.step == 0: return None, None compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size() - index_shape = f"vector<{self.compute_body_loop.step}xindex>" mask_shape = f"vector<{compute_vec_size}xi1>" - upper_bound = self.get_const_cse(self.compute_body_loop.size) - step_vec = self.const_cse.generate(self.const_buffer, f"vector.step : {index_shape}") + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + upper_bound = ops.constant(self.compute_body_loop.size, "index") + step_vec = ops.step(self.compute_body_loop.step, "index") - gap = self.mask_cse.generate(self.masks, f"arith.subi %{upper_bound}, %{self.compute_idx} : index") - gap_vec = self.mask_cse.generate(self.masks, f"vector.broadcast %{gap} : index to {index_shape}") - mask_var = self.mask_cse.generate(self.masks, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}") - self.register_var_info(mask_var, [compute_vec_size, "i1"]) + with self.override_buffer_cse(buffer=self.masks, cse=self.mask_cse): + gap = ops.sub(upper_bound, self.compute_idx) + gap_vec = ops.broadcast(gap, self.compute_body_loop.step) + mask_var = ops.lt(step_vec, gap_vec) return mask_shape, mask_var def convert_indirect_indexing(self, index :sympy.Expr): @@ -2007,14 +1989,8 @@ def convert_indirect_indexing(self, index :sympy.Expr): 
indirect_dims.sort() first_dim = indirect_dims[0] spad_vars = dict() - old_compute, old_dma_lods, old_dma_stores = self.compute, self.dma_loads, self.dma_stores compute_dependecy = any([target_dim not in self.spad_buffer_dict for target_dim in indirect_dims]) - if compute_dependecy: - self.compute = old_dma_stores - target_dma_buffers = self.dma_stores - else: - self.compute = old_dma_lods - target_dma_buffers = self.dma_loads + target_dma_buffers = self.dma_stores if compute_dependecy else self.dma_loads # Load indirect operands for target_dim in indirect_dims: @@ -2028,6 +2004,7 @@ def convert_indirect_indexing(self, index :sympy.Expr): local_tile_desc = self.kernel_group.tile_desc tile_numel_per_lane = local_tile_desc.get_numel_per_lane() tile_shape = local_tile_desc.get_mlir_shape(var_info[1]) + tile_vec = local_tile_desc.get_compute_vec_size() vshape = f"vector<{var_info[0]}x{var_info[1]}>" sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, target_dim, local_tile_desc, target_dim) self.spad_buffer_dict[target_dim] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] @@ -2038,52 +2015,37 @@ def convert_indirect_indexing(self, index :sympy.Expr): line = f"{opeartion} %{target_dim}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" self.stores.writeline(line) mlir_dtype = vshape.split("x")[1][:-1] - vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute... 
- if tile_numel_per_lane > 1: - operation = "affine.vector_load" - line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape} // For indirect access" - else: - operation = "affine.load" - line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape} // For indirect access" - out = self.cse.generate(target_dma_buffers, line) - self.register_var_info(out, [tile_numel_per_lane, mlir_dtype]) - spad_vars[target_dim] = out - - # Apply stride - for arg in index.args: - if "tmp" not in str(arg): - continue - if arg.is_Mul and arg.args[0].is_number: - coeff_dtype = self.var_info[spad_vars[str(arg.args[1])]][1] - coeff = ops.constant(int(arg.args[0]), coeff_dtype) - spad_vars[str(arg.args[1])] = ops.mul(spad_vars[str(arg.args[1])], coeff) - index = index.replace(arg, 0) - - # Sum - for dim, var in spad_vars.items(): - if dim == first_dim: - continue - spad_vars[first_dim] = ops.add(spad_vars[first_dim], var) + with self.override_buffer_cse(buffer=target_dma_buffers): + out = ops._load(tile_numel_per_lane, mlir_dtype, sram_var, sram_index_var, tile_shape) + spad_vars[target_dim] = out + + with self.override_buffer_cse(buffer=target_dma_buffers): + # Apply stride + for arg in index.args: + if "tmp" not in str(arg): + continue + if arg.is_Mul and arg.args[0].is_number: + coeff_dtype = self.var_info[spad_vars[str(arg.args[1])]][1] + coeff = self.get_const_cse(int(arg.args[0]), coeff_dtype) + spad_vars[str(arg.args[1])] = ops.mul(spad_vars[str(arg.args[1])], coeff) + index = index.replace(arg, 0) + + # Sum + for dim, var in spad_vars.items(): + if dim == first_dim: + continue + spad_vars[first_dim] = ops.add(spad_vars[first_dim], var) # Store index var sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[first_dim] mlir_dtype = vshape.split("x")[1][:-1] - vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute... 
- if tile_numel_per_lane > 1: - operation = "affine.vector_store" - line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}" - else: - operation = "affine.store" - line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}" - out = self.cse.generate(target_dma_buffers, line, assignment=False) + with self.override_buffer_cse(buffer=target_dma_buffers): + ops._store(spad_vars[first_dim], sram_var, sram_index_var, tile_shape) # FIXME. Maybe require fine grain compute... # Conversion mlir_dtype = self.var_info[spad_vars[first_dim]][1] - line = f"affine.load %{sram_var}[{sram_index_var}] : {tile_shape}" - out = self.cse.generate(target_dma_buffers, line) - if mlir_dtype != "index": - line = f"arith.index_cast %{out} : {mlir_dtype} to {'index'}" - out = self.cse.generate(target_dma_buffers, line) - self.register_var_info(out, [1, "index", [1]]) - self.compute, self.dma_loads, self.dma_stores = old_compute, old_dma_lods, old_dma_stores + with self.override_buffer_cse(buffer=target_dma_buffers): + out = ops._load(1, mlir_dtype, sram_var, sram_index_var, tile_shape) + if mlir_dtype != "index": + out = ops.index_cast(out, "index") return index + sympy.Symbol(str(out)), compute_dependecy diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 4d33eea4..f4dbe678 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1,5 +1,7 @@ import dataclasses import math +import contextvars +from contextlib import contextmanager from dataclasses import dataclass from typing import Dict from typing import List @@ -68,7 +70,7 @@ torch.int8: "int8_t", torch.uint8: "uint8_t", torch.bool: "uint8_t", - torch.bfloat16: "bfloat16", + torch.bfloat16: "uint16_t", } MLIR_TO_BIT = { @@ -588,6 +590,7 @@ def __init__(self, kernel_group, reason=None): self.ranges = None self.reduction_depth = None self.itervars = None + self.itervar_cses = 
None # Code buffer self.vector_compute = IndentedBuffer() self.reductions_suffix = IndentedBuffer() @@ -595,12 +598,17 @@ def __init__(self, kernel_group, reason=None): # MLIR SSA tracker self.var_info = {} # MLIR variable info self.buffer_types : dict = None # format: dtype, numel, size, stride - self.compute_idx = "compute_idx" + # Create compute idx + self.compute_idx = self.register_var_cse("compute_idx", 1, "index") self.compute_body_loop = LoopLevel(self.compute_idx, 1) self.prologue_compute_body_loop = LoopLevel(self.compute_idx, 1) self.recodegen = reason # spad overflow, tile size, vlane stride self.stop_autotune = False + # Context var for codegen + self.target_buffer_override = contextvars.ContextVar("Handler_compute_override", default=self.compute) + self.target_cse_override = contextvars.ContextVar("Handler_cse_override", default=self.cse) + def set_ranges(self, lengths, reduction_lengths): if self.call_ranges: assert self.call_ranges == tuple(lengths) + tuple( @@ -611,6 +619,7 @@ def set_ranges(self, lengths, reduction_lengths): self.call_ranges = tuple(lengths) + tuple(reduction_lengths) self.ranges = [self.rename_indexing(x) for x in self.call_ranges] self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))] + self.itervar_cses = {str(index) : self.register_var_cse(str(index), 1, "index") for index in self.itervars} self.reduction_depth = len(lengths) return ( self.itervars[: self.reduction_depth], @@ -801,28 +810,6 @@ def get_constant_vector(self, expr): constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars] return constant_vector - def get_constant_vector2(self, expr): - # Case 0. symbol ex) index 0 - # Case 1. inner product form ex) 16 * index0 + 1 * index1 - # Case 2. 
Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4) - constant_vector = [] - if expr.is_symbol: - constant_vector.append(tuple([1, expr])) - return constant_vector - - for arg in expr.args: - if arg.is_symbol: - constant_vector.append(tuple([1,arg])) - continue - if len(arg.args) == 0: #TODO: check this - continue - if arg.args[0].is_number: - constant_vector.append(arg.args) - else: - constant_vector.append([1, arg]) - - return constant_vector - def find_node_by_name(self, name): if name in V.graph.graph_inputs: return V.graph.graph_inputs[name] @@ -837,6 +824,11 @@ def is_scalar(self, name): def roundup_vectorlane(self, size, amp=1): return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp + def register_var_cse(self, name, size, dtype): + var = self.create_cse_var(name, ValueRanges.unknown()) + self.register_var_info(var, [size, dtype]) + return var + def register_var_info(self, var, var_info): self.var_info[var] = var_info @@ -854,6 +846,21 @@ def rename_indexing(self, index) -> sympy.Expr: } return sympy_subs(index, replacements) + @contextmanager + def override_buffer_cse(self, *, buffer=None, cse=None): + target_buffer = target_cse = None + try: + if buffer is not None: + target_buffer = self.target_buffer_override.set(buffer) + if cse is not None: + target_cse = self.target_cse_override.set(cse) + yield self + finally: + if target_cse is not None: + self.target_cse_override.reset(target_cse) + if target_buffer is not None: + self.target_buffer_override.reset(target_buffer) + def __enter__(self): class CSEProxy: self.name = "CSEProxy" @@ -861,16 +868,22 @@ class CSEProxy: @staticmethod def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] def inner(*args, **kwargs): - code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info) - csevar = self.cse.generate( - self.compute, - code, - bounds=ValueRanges.unknown(), - assignment=(ret_info[0] is not None) - ) - if ret_info[0] 
is not None: - self.register_var_info(csevar, ret_info) - csevar.update_on_args(name, args, kwargs) + code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, **kwargs) + target_buffer = self.target_buffer_override.get() + target_cse = self.target_cse_override.get() + if isinstance(code, common.DeferredLine): + target_buffer.writeline(code) + return None + else: + csevar = target_cse.generate( + target_buffer, + code, + bounds=ValueRanges.unknown(), + assignment=(ret_info[0] is not None) + ) + if ret_info[0] is not None: + self.register_var_info(csevar, ret_info) + csevar.update_on_args(name, args, kwargs) return csevar return inner diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index e493464a..12782ce8 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -85,7 +85,8 @@ def as_local(self): } try: self.set_buffers() - yield self + with self.kernel.override_buffer_cse(buffer=self.compute, cse=self.cse): + yield self finally: self.restore_buffers() @@ -822,7 +823,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}") attribute = " {" + ", ".join(attribute_parts) + "}" code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, - dram_shape, tile_shape, "") + dram_shape, tile_shape, "") local_code.writeline(code) local_code.writeline(attribute) return textwrap.indent(local_code.getvalue(), " "*indent_size).strip() @@ -885,28 +886,18 @@ def load_epilogue(self, name: str, index: sympy.Expr): zero_var = self.get_const_cse(0) if not self.reduction_fusion: compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"]) - if compute_vec_size > 1: - operation = "affine.vector_load" - line = 
f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - else: - operation = "affine.load" - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}" - out = self.cse.generate(self.loads, line) - self.register_var_info(out, [compute_vec_size, mlir_dtype]) + with self.override_buffer_cse(buffer=self.loads): + out = ops._load(compute_vec_size, mlir_dtype, sram_var, compute_index_var, tile_shape) else: # For reduction case reduce_size = self.reduction_nr_outer_loop vsize = compute_vec_size//reduce_size - vshape = f"vector<{vsize}x{mlir_dtype}>" if compute_vec_size > 1: offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) - operation = "affine.vector_load" - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - out = self.cse.generate(self.loads, line) - else: - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}" - out = self.cse.generate(self.loads, line) + + with self.override_buffer_cse(buffer=self.loads): + out = ops._load(vsize, mlir_dtype, sram_var, compute_index_var, tile_shape) self.register_var_info(out, [self.compute_body_loop.step, mlir_dtype]) return out @@ -924,10 +915,6 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() - # Compute vector unit size - vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype) - compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size() - if name not in self.buffer_names: sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index) self.buffer_names[name] = sram_var @@ -945,14 +932,9 @@ def store_epilogue(self, name: 
str, index: sympy.Expr, value, *args, **kwargs): value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info) compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"]) # Generate vector load instruction - if compute_vec_size > 1: - operation = "affine.vector_store" - line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - else: - operation = "affine.store" - line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}" - line = line if store_force else DeferredLine(name, line) - self.stores.writeline(line) + buffer_name = name if not store_force else None + with self.override_buffer_cse(buffer=self.stores): + ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=buffer_name) # Generate DMA instruction attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" @@ -991,6 +973,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): tile_shape = local_tile_desc.get_mlir_shape(type_name) vshape = local_tile_desc.get_mlir_vshape(type_name) + compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size() name = f"{reduction_type}_buffer{self.reduction_buffer_idx}" self.reduction_buffer_idx += 1 @@ -1002,24 +985,21 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): zero_var_list = [f"%{self.get_const_cse(0)}"] * local_tile_desc.get_nr_dim() zero_var_list[-2] = f"%{self.reduction_loop_idx}" compute_index_var = ", ".join(zero_var_list) - operation = "affine.vector_load" - line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - out = self.cse.generate(self.loads, line) - self.register_var_info(out, [self.compute_body_loop.step, type_name]) + with self.override_buffer_cse(buffer=self.loads): + out = ops._load(vec_size, type_name, sram_var, compute_index_var, tile_shape) # Reduction body codegen - init = 
self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}") - init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {vshape}") - self.register_var_info(init_vec, [local_tile_desc.get_compute_vec_size(), type_name]) + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + init = ops.constant(reduction_init(reduction_type, dtype), type_name) + init_vec = ops.broadcast(init, compute_vec_size) + mask_shape, mask_var = self.get_mask() if mask_var is not None: value = ops.where(mask_var, value, init_vec) result = reduction_partial_combine_vec(reduction_type, value, out) # Store partial result - operation = "affine.vector_store" - line = f"{operation} %{result}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - self.compute.writeline(line) # Need to be placed after partial reduction + ops._store(result, sram_var, compute_index_var, tile_shape) # Need to be placed after partial reduction self.reduction_info[sram_var] = [reduction_type, local_tile_desc] return sram_var @@ -1050,63 +1030,60 @@ def store_reduction_epilogue(self, name, index, value): partial_tile_shape = partial_tile_desc.get_mlir_shape(mlir_dtype) # Prepare constant - init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value][0], dtype)} : {mlir_dtype}") + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + init = ops.constant(reduction_init(self.reduction_info[value][0], dtype), mlir_dtype) + init_vec = ops.broadcast(init, partial_vec_size) + init_vec2 = ops.broadcast(init, 2) + partial_zero_var_list = [f"%{self.get_const_cse(0)}"] * partial_tile_desc.get_nr_dim() final_zero_var_list = [f"%{self.get_const_cse(0)}"] * final_tile_desc.get_nr_dim() for i in range(self.reduction_body_loop.size): # Load partial result - body_index_var = self.const_cse.generate(self.const_buffer, f"arith.constant {i} : 
index") - partial_zero_var_list[-2] = f"%{body_index_var}" - compute_index_var = ",".join(partial_zero_var_list) - - operation = "affine.vector_load" - line = f"{operation} %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}" - out = self.cse.generate(self.reductions_suffix, line) - operation = "affine.vector_store" - init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {partial_vshape}") - line = f"{operation} %{init_vec}, %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}" - self.reductions_suffix.writeline(line) - - # 2 step reduction - new_vec_size = 2 - new_vshape = f"vector<{partial_vec_size//new_vec_size}x{new_vec_size}x{mlir_dtype}>" - new_reduced_shape = f"vector<{new_vec_size}x{mlir_dtype}>" - out = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{out} : {partial_vshape} to {new_vshape}") - init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {new_reduced_shape}") - out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value][0], out, init_vec, axis=0, shape=new_vshape, reduced_shape=new_reduced_shape)) - out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}") + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + body_index_var = ops.constant(i, "index") + partial_zero_var_list[-2] = f"%{body_index_var}" + compute_index_var = ",".join(partial_zero_var_list) + + with self.override_buffer_cse(buffer=self.reductions_suffix): + out = ops._load(partial_vec_size, mlir_dtype, sram_var, compute_index_var, partial_tile_shape) + ops._store(init_vec, value, compute_index_var, partial_tile_shape) # Clear the partial buffer to zero + + # 2 step reduction + new_vec_size = 2 + new_reduced_shape = f"<{new_vec_size}x{mlir_dtype}>" + reduction_type = self.reduction_info[value][0] + out = 
ops.multi_reduction(out, init_vec, partial_vec_size, new_vec_size, reduction_type, partial_vshape, self.reduction_info[value][0], mlir_dtype) - self.compute, self.reductions_suffix = self.reductions_suffix, self.compute - self.register_var_info(out, [new_vec_size, mlir_dtype]) + out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}") self.register_var_info(out2, [new_vec_size, mlir_dtype]) - out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2) - self.compute, self.reductions_suffix = self.reductions_suffix, self.compute + + with self.override_buffer_cse(buffer=self.reductions_suffix): + out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2) if self.welford_reduce_out is not None: # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2 - divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.r_dim_size)} : f32") - if self.buffer_types[name][1] > 1: - divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}") - else: - divider_vec = divider - - if self.current_node.node.origin_node: # FIXME: This is a temporary solution - # mean = SUM(X) / N - self.reduction_mean.append(self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}")) - out = self.reduction_mean[i] - else: - # m2 = (E(X^2) - E(X)^2) * N - sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}") - mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{self.reduction_mean[i]}, %{self.reduction_mean[i]} : {new_reduced_shape}") - variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {new_reduced_shape}") - m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {new_reduced_shape}") - out = m2 + with 
self.override_buffer_cse(buffer=self.reductions_suffix): + divider = ops.constant(float(self.reduction_axis_size), "f32") + if self.buffer_types[name][1] > 1: + divider_vec = ops.broadcast(divider, new_vec_size) + else: + divider_vec = divider + + if self.current_node.node.origin_node: # FIXME: This is a temporary solution + # mean = SUM(X) / N + self.reduction_mean.append(ops.div(out, divider_vec)) + out = self.reduction_mean[i] + else: + # m2 = (E(X^2) - E(X)^2) * N + sqr_mean = ops.div(out, divider_vec) + mean_sqr = ops.mul(self.reduction_mean[i], self.reduction_mean[i]) + variance = ops.sub(sqr_mean, mean_sqr) + m2 = ops.mul(variance, divider_vec) + out = m2 final_zero_var_list[-1] = f"%{body_index_var}" final_compute_index_var = ",".join(final_zero_var_list) - operation = "affine.vector_store" - line = f"{operation} %{out}, %{sram_var}[{final_compute_index_var}] : {final_tile_shape}, {new_reduced_shape}" - self.reductions_suffix.writeline(DeferredLine(name, line)) + ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name) # MVOUT Encoding # Generate DMA instruction diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 322d9b12..91d53b09 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -53,7 +53,7 @@ def write_arg(self, arg, path, name): tensor = arg.cpu().detach() buffer_size = tensor.untyped_storage().size() buffer = (ctypes.c_char * buffer_size).from_address(tensor.data_ptr()) - t_arr = np.frombuffer(buffer, dtype=tensor.numpy().dtype, count=buffer_size // tensor.element_size()) + t_arr = np.frombuffer(buffer, dtype=TORCH_TO_NUMPY[tensor.dtype], count=buffer_size // tensor.element_size()) t_arr.tofile(data_path) else: assert(0) From bea9bd2f6c7575b1a456a91d844493defc296f6b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Sep 2025 12:12:51 +0000 Subject: [PATCH 002/194] [Test] Add matmul vector fusion case --- tests/Fusion/test_matmul_vector.py | 52 ++++++++++++++++++++++++++++++ 1 file 
changed, 52 insertions(+) create mode 100644 tests/Fusion/test_matmul_vector.py diff --git a/tests/Fusion/test_matmul_vector.py b/tests/Fusion/test_matmul_vector.py new file mode 100644 index 00000000..bf1bd513 --- /dev/null +++ b/tests/Fusion/test_matmul_vector.py @@ -0,0 +1,52 @@ +import torch +import torch._dynamo +import torch.utils.cpp_extension + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_matmul_vector(device, size=[56, 78, 239], dim=0): + def matmul_fused(a, b, c, d): + return torch.matmul(a, b) + c + d + torch.manual_seed(0) + input = torch.randn(size[:2]) + weight = torch.randn(size[1:]) + output_sz = [size[0], size[2]] + output_sz[dim]=1 + bias = torch.zeros(output_sz) + add = torch.zeros(output_sz) + x1 = input.to(device=device) + w1 = weight.to(device=device) + b1 = bias.to(device=device) + a1 = add.to(device=device) + x2 = input.to("cpu") + w2 = weight.to("cpu") + b2 = bias.to("cpu") + a2 = add.to("cpu") + opt_fn = torch.compile(dynamic=False)(matmul_fused) + res = opt_fn(x1, w1, a1, b1) + y = matmul_fused(x2, w2, a2, b2) + test_result("Matmul Vector Fusion Forward", res, y) + +if __name__ == "__main__": + import os + import sys + sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) + + from Scheduler.scheduler import ExecutionEngine + module = ExecutionEngine.setup_device() + device = module.custom_device() + test_matmul_vector(device, size=[253, 123, 47], dim=0) + test_matmul_vector(device, size=[253, 123, 47], dim=1) \ No newline at end of file From 837b0627df4a5c3134a493a4996c0847d7d123ba Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang 
Date: Tue, 9 Sep 2025 14:14:09 +0000 Subject: [PATCH 003/194] [Frontend] Fix ops conversion --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 16 ++++++++-------- PyTorchSimFrontend/mlir/mlir_template.py | 17 ++++++++--------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index d4c2fdd6..13d75c94 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -813,12 +813,11 @@ def where(condition, operand1, operand2, *args, var_info=None, **kwargs): cond_type = var_info[condition] operand_type = var_info[operand1] if cond_type[0] < tile_size: - condition = ops.broadcast(condition, operand_type[0]) + condition = ops.broadcast(condition, tile_size) elif cond_type[0] > tile_size: - operand1 = ops.broadcast(operand1, operand_type[0]) - operand2 = ops.broadcast(operand2, operand_type[0]) + operand1 = ops.broadcast(operand1, cond_type[0]) + operand2 = ops.broadcast(operand2, cond_type[0]) tile_size, ret_type = var_info[operand1] - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else "" return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type] @@ -1174,10 +1173,6 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): # Todo. 
If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily require_store = self.spad_buffer_dict[str(value)][1] != tile_size - if compute_vec_size < self.var_info[value][0]: - value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") - self.register_var_info(value, [compute_vec_size, mlir_dtype]) - if require_store: # Define scratch pad buffer sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index) @@ -1186,6 +1181,11 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): _, operand_type = self.var_info[value] if mlir_dtype != operand_type: value = ops.custom_cast(value, mlir_dtype) + + if compute_vec_size < self.var_info[value][0]: + value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") + self.register_var_info(value, [compute_vec_size, mlir_dtype]) + with self.override_buffer_cse(buffer=self.stores): ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name) else: diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 12782ce8..b51c2794 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -1045,14 +1045,14 @@ def store_reduction_epilogue(self, name, index, value): compute_index_var = ",".join(partial_zero_var_list) with self.override_buffer_cse(buffer=self.reductions_suffix): - out = ops._load(partial_vec_size, mlir_dtype, sram_var, compute_index_var, partial_tile_shape) + out = ops._load(partial_vec_size, mlir_dtype, value, compute_index_var, partial_tile_shape) ops._store(init_vec, value, compute_index_var, partial_tile_shape) # Clear the 
partial buffer to zero # 2 step reduction new_vec_size = 2 - new_reduced_shape = f"<{new_vec_size}x{mlir_dtype}>" + new_reduced_shape = f"vector<{new_vec_size}x{mlir_dtype}>" reduction_type = self.reduction_info[value][0] - out = ops.multi_reduction(out, init_vec, partial_vec_size, new_vec_size, reduction_type, partial_vshape, self.reduction_info[value][0], mlir_dtype) + out = ops.multi_reduction(out, init_vec2, partial_vec_size, new_vec_size, partial_vshape, reduction_type, mlir_dtype) out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}") self.register_var_info(out2, [new_vec_size, mlir_dtype]) @@ -1060,9 +1060,8 @@ def store_reduction_epilogue(self, name, index, value): with self.override_buffer_cse(buffer=self.reductions_suffix): out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2) - if self.welford_reduce_out is not None: - # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2 - with self.override_buffer_cse(buffer=self.reductions_suffix): + if self.welford_reduce_out is not None: + # NOTE: It not a real welford algorithm... 
We just used E(X^2) - E(X)^2 divider = ops.constant(float(self.reduction_axis_size), "f32") if self.buffer_types[name][1] > 1: divider_vec = ops.broadcast(divider, new_vec_size) @@ -1081,9 +1080,9 @@ def store_reduction_epilogue(self, name, index, value): m2 = ops.mul(variance, divider_vec) out = m2 - final_zero_var_list[-1] = f"%{body_index_var}" - final_compute_index_var = ",".join(final_zero_var_list) - ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name) + final_zero_var_list[-1] = f"%{body_index_var}" + final_compute_index_var = ",".join(final_zero_var_list) + ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name) # MVOUT Encoding # Generate DMA instruction From a33659af759442cacf4c6e85b0582c0c30b964f1 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 10 Sep 2025 06:05:59 +0000 Subject: [PATCH 004/194] [Frontend] Use custom malloc in the validation wrapper code --- .../mlir/mlir_caller_codegen.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index dff6b0fd..38a1f7a9 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -58,7 +58,11 @@ def load_arg(self): if self.is_in_arg(arg_attribute[0]): argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name] self.load_args[arg_name] = argv_idx - self.writeline(f'if(load_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') + ctype = DTYPE_TO_C[arg_attribute[1]] + elem_count = arg_attribute[2] + size_expr = f'({elem_count}ULL * sizeof({ctype}))' + + self.writeline(f'if(load_arg(c_{arg_name}, {size_expr}, argv[{argv_idx}]) == -1){self.open_bracket}') with self.code.indent(): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) @@ -67,7 +71,10 @@ def dump_arg(self): 
for arg_name, arg_attribute in self.arg_attributes: if self.is_out_arg(arg_attribute[0]): argv_idx = self.get_argv_idx() if not self.is_inout_arg(arg_attribute[0]) else self.load_args[arg_name] - self.writeline(f'if(dump_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}') + ctype = DTYPE_TO_C[arg_attribute[1]] + elem_count = arg_attribute[2] + size_expr = f'({elem_count}ULL * sizeof({ctype}))' + self.writeline(f'if(dump_arg(c_{arg_name}, {size_expr}, argv[{argv_idx}]) == -1){self.open_bracket}') with self.code.indent(): self.writeline(f'return -1{self.ending}') self.writeline(self.closed_bracket) @@ -84,29 +91,24 @@ def generate_kernel_declare(self): def generate_args_define(self): name_set = set() if self.validation: - self.writeline(f'int padding[0x100000]{self.ending}') # FIXME. For pooling operation... Some pooling layer use negative offset + self.writeline(f"int* padding = malloc(0x100000ULL * sizeof(int)){self.ending}") for arg_name, (_, arg_type, arg_size, arg_sizes, arg_stride) in self.arg_attributes: if not arg_name in name_set: - if self.validation: - self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}ULL]{self.ending}') + if torch.is_floating_point(torch.tensor([], dtype=arg_type)): + bits = torch.finfo(arg_type).bits + elif arg_type == torch.bool: + bits = 8 else: - if torch.is_floating_point(torch.tensor([], dtype=arg_type)): - bits = torch.finfo(arg_type).bits - elif arg_type == torch.bool: - bits = 8 - else: - bits = torch.iinfo(arg_type).bits - self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}') + bits = torch.iinfo(arg_type).bits + self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}') name_set.add(arg_name) self.writeline(self.newline) def generate_main(self): - if self.validation: - self.generate_args_define() - self.writeline(f'{self.newline}int main(int argc, char *argv[]) 
{self.open_bracket}{self.newline}') with self.code.indent(): if self.validation: + self.generate_args_define() self.load_arg() self.writeline(self.newline) else: From 4e2d0a022bc01b8551a73f1dbe690afffaadb63d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 10 Sep 2025 13:55:27 +0000 Subject: [PATCH 005/194] [Device] Add missing operations --- PyTorchSimFrontend/extension_device.cpp | 301 ++++++++++++++++++------ 1 file changed, 225 insertions(+), 76 deletions(-) diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp index 1a02bfe3..b728a852 100644 --- a/PyTorchSimFrontend/extension_device.cpp +++ b/PyTorchSimFrontend/extension_device.cpp @@ -16,6 +16,34 @@ #include #include #include +#include +namespace py = pybind11; + +namespace { + bool g_amp_enabled = false; + at::ScalarType g_amp_dtype = at::kFloat; +} + +static at::ScalarType to_scalar_type(const py::object& dtype_obj) { + py::module torch_mod = py::module::import("torch"); + if (dtype_obj.is(torch_mod.attr("bfloat16"))) return at::kBFloat16; + if (dtype_obj.is(torch_mod.attr("float16"))) return at::kHalf; + if (dtype_obj.is(torch_mod.attr("float32"))) return at::kFloat; + if (dtype_obj.is(torch_mod.attr("float64"))) return at::kDouble; + throw std::runtime_error("Unsupported dtype for extension_device AMP"); +} + +static py::object to_torch_dtype(at::ScalarType st) { + py::module torch_mod = py::module::import("torch"); + switch (st) { + case at::kBFloat16: return torch_mod.attr("bfloat16"); + case at::kHalf: return torch_mod.attr("float16"); + case at::kFloat: return torch_mod.attr("float32"); + case at::kDouble: return torch_mod.attr("float64"); + default: + throw std::runtime_error("Unsupported scalar type in get_autocast_dtype"); + } +} static uint64_t op_counter = 0; static uint64_t last_saved_value = 0; @@ -99,8 +127,16 @@ at::Tensor custom_to_device( TORCH_CHECK(self.is_contiguous()); op_counter += 1; - if (device != at::DeviceType::CPU) { - return 
at::empty(self.sizes(), self.options()); + if (device.type() == at::DeviceType::CPU) { + auto out = at::empty(self.sizes(), dtype, self.options().layout(), + device, false, memory_format); + std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes()); + return out; + } else { + auto opts = self.options().device(device).dtype(dtype); + auto out = at::empty(self.sizes(), opts); + std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes()); + return out; } auto out = at::empty(self.sizes(), dtype, self.options().layout(), device, false, memory_format); @@ -135,33 +171,86 @@ static DummyCustomAllocator global_custom_alloc; REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc); at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) { - TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows dummy device."); + TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1, + "Dummy test only allows dummy device."); TORCH_CHECK(self.is_contiguous()); - // TORCH_CHECK(self.scalar_type() == c10::ScalarType::Float); op_counter += 1; - if (self.scalar_type() == c10::ScalarType::Float) { - auto _data = static_cast(self.mutable_data_ptr()); - for (size_t idx = 0; idx < self.numel(); idx++) { - _data[idx] = value.toFloat(); + + switch (self.scalar_type()) { + case c10::ScalarType::Float: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = value.toFloat(); + } + break; } - return self; - } else if (self.scalar_type() == c10::ScalarType::Int) { - auto _data = static_cast(self.mutable_data_ptr()); - for (size_t idx = 0; idx < self.numel(); idx++) { - _data[idx] = value.toInt(); + case c10::ScalarType::Double: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = value.toDouble(); + } + break; } - return self; - } else if (self.scalar_type() == c10::ScalarType::Long) { - auto _data = 
static_cast(self.mutable_data_ptr()); - for (size_t idx = 0; idx < self.numel(); idx++) { - _data[idx] = value.toLong(); + case c10::ScalarType::Half: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = at::Half(value.toHalf()); + } + break; } - return self; - } else { - TORCH_CHECK(false, "Unsupported scalar type."); + case c10::ScalarType::BFloat16: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = at::BFloat16(value.toBFloat16()); + } + break; + } + case c10::ScalarType::Int: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = value.toInt(); + } + break; + } + case c10::ScalarType::Long: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = value.toLong(); + } + break; + } + case c10::ScalarType::Short: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = static_cast(value.toShort()); + } + break; + } + case c10::ScalarType::Char: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = static_cast(value.toChar()); + } + break; + } + case c10::ScalarType::Byte: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = static_cast(value.toByte()); + } + break; + } + case c10::ScalarType::Bool: { + auto* data = self.mutable_data_ptr(); + for (int64_t i = 0; i < self.numel(); i++) { + data[i] = value.toBool(); + } + break; + } + default: + TORCH_CHECK(false, "Unsupported scalar type: ", self.scalar_type()); } - return self; } @@ -204,6 +293,9 @@ at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool "Dummy test only allows copy from cpu -> dummy device."); // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous. 
+ if (self.numel() != dst.numel()) { + custom_resize_(dst, self.sizes(), c10::nullopt); + } TORCH_CHECK(self.sizes() == dst.sizes()); const bool same_dtype = (self.scalar_type() == dst.scalar_type()); @@ -255,9 +347,36 @@ at::Tensor& custom_arange_start_out_impl( const c10::Scalar& end, const c10::Scalar& step, at::Tensor& out) { - //const int64_t n = arange_len(start.toDouble(), end.toDouble(), step.toDouble()); - //at::native::resize_output(out, {n}); - return out; + double s = start.toDouble(); + double e = end.toDouble(); + double st = step.toDouble(); + TORCH_CHECK(st != 0.0, "step must be nonzero"); + + int64_t length = 0; + if (st > 0) { + if (e > s) length = static_cast(std::ceil((e - s) / st)); + } else { + if (e < s) length = static_cast(std::ceil((e - s) / st)); + } + + // Resize out tensor + custom_resize_(out, {length}, c10::nullopt); + + if (out.scalar_type() == at::kFloat || out.scalar_type() == at::kDouble) { + double* data = out.mutable_data_ptr(); + for (int64_t i = 0; i < length; i++) { + data[i] = s + i * st; + } + } else if (out.scalar_type() == at::kLong) { + int64_t* data = out.mutable_data_ptr(); + for (int64_t i = 0; i < length; i++) { + data[i] = static_cast(s + i * st); + } + } else { + TORCH_CHECK(false, "Unsupported dtype for arange on dummy device"); + } + + return out; } static at::Tensor custom_to_dtype_impl(const at::Tensor& self, @@ -276,16 +395,16 @@ static at::Tensor custom_to_dtype_impl(const at::Tensor& self, // This macro registers your kernels to the PyTorch Dispatcher. // More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/. 
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl("to.Device", &custom_to_device); - m.impl("to.dtype", &custom_to_dtype_impl); - m.impl("fill_.Scalar", &custom_fill__scalar); - m.impl("_copy_from", &custom__copy_from); + m.impl("to.Device", &custom_to_device); + m.impl("to.dtype", &custom_to_dtype_impl); + m.impl("fill_.Scalar", &custom_fill__scalar); + m.impl("_copy_from", &custom__copy_from); m.impl("_copy_from_and_resize", &custom__copy_from_and_resize); - m.impl("empty_strided", &custom_empty_strided); - m.impl("empty.memory_format", &custom_empty); - m.impl("as_strided", at::native::as_strided_tensorimpl); - m.impl("view", at::native::view); - m.impl("arange.start_out", &custom_arange_start_out_impl); + m.impl("empty_strided", &custom_empty_strided); + m.impl("empty.memory_format", &custom_empty); + m.impl("as_strided", at::native::as_strided_tensorimpl); + m.impl("view", at::native::view); + m.impl("arange.start_out", &custom_arange_start_out_impl); } TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) { @@ -293,11 +412,11 @@ TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) { } TORCH_LIBRARY_FRAGMENT(aten, m) { -m.def( - "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor", - torch::dispatch( - c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor), - {at::Tag::pt2_compliant_tag}); + m.def( + "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor", + torch::dispatch(c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor), + {at::Tag::pt2_compliant_tag} + ); } void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { @@ -305,39 +424,56 @@ void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack } TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - 
m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("all.all_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_local_scalar_dense", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_log_softmax", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_log_softmax_backward_data", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mse_loss.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_lerp_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_mul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - 
m.impl("_foreach_addcmul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_sqrt", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_div_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_addcdiv_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_native_multi_head_attention", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + 
m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("all.all_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_local_scalar_dense", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_log_softmax", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_log_softmax_backward_data", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("mse_loss.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_lerp_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_mul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_addcmul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_sqrt", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_div_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_addcdiv_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_native_multi_head_attention", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("resize_", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("where.self", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ge.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_and.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_or.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_not.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); } // This basic implementation doesn't bother dealing with different device indices @@ -360,7 +496,6 @@ bool custom_op_called() { class PrivateGeneratorImpl : public at::CPUGeneratorImpl { public: - // Constructors PrivateGeneratorImpl(c10::DeviceIndex device_index) { device_ = 
c10::Device(c10::DeviceType::PrivateUse1, device_index); key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1); @@ -382,7 +517,21 @@ void register_generator() { // that's implemented in C++. // The implementation in this file maps directly to the `PrivateUse1` device type. PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("custom_device", &get_custom_device, "get custom device object"); - m.def("custom_op_called", &custom_op_called, "check if our custom function was called"); - m.def("register_generator", ®ister_generator, "register generator for custom device"); + m.def("custom_device", &get_custom_device, "get custom device object"); + m.def("custom_op_called", &custom_op_called, "check if our custom function was called"); + m.def("register_generator", ®ister_generator, "register generator for custom device"); + m.def("is_autocast_enabled", []() -> bool { return g_amp_enabled;}); + m.def("set_autocast_enabled", [](bool flag) -> void {g_amp_enabled = flag;}); + m.def("get_autocast_dtype", []() -> py::object { return to_torch_dtype(g_amp_dtype); }); + m.def("set_autocast_dtype", [](py::object dtype_obj) -> void { + auto st = to_scalar_type(dtype_obj); + g_amp_dtype = st; + }); + m.def("get_amp_supported_dtype", []() -> py::list { + py::module torch_mod = py::module::import("torch"); + py::list lst; + lst.append(torch_mod.attr("float16")); + lst.append(torch_mod.attr("float32")); + return lst; + }); } \ No newline at end of file From 6e70edccfb515b85551933f20be22bd4e2f1fd35 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 10 Sep 2025 13:56:00 +0000 Subject: [PATCH 006/194] [Frontend] Add typecasting for logical operation --- .../mlir/mlir_codegen_backend.py | 48 ++++++++++++++----- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 13d75c94..f8195b58 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ 
b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -624,7 +624,7 @@ def ne(operand1, operand2, *args, var_info=None, **kwargs): attribute = "one" elif ret_type[0] == "i": op_type = "arith.cmpi" - attribute = "sne" + attribute = "ne" else: raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") @@ -754,13 +754,25 @@ def xor(operand1, operand2, *args, var_info=None, **kwargs): shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + @staticmethod + def to_bool(operand, *args, var_info=None, **kwargs): + tile_size, ret_type = var_info[operand] + const_one = ops.constant(0, ret_type) + if tile_size > 1: + const_one = ops.broadcast(const_one, tile_size) + ret = ops.ne(operand, const_one) + return ret, [tile_size, "i1"] @staticmethod def logical_and(operand1, operand2, *args, var_info=None, **kwargs): - op_type = var_info[operand1] + op_type1 = var_info[operand1] + op_type2 = var_info[operand2] # Type check & auto cast - if op_type[1] != "i1": - raise NotImplementedError("Logical operation with not bool data type") + if op_type1[1] != "i1": + operand1 = ops.to_bool(operand1) + if op_type2[1] != "i1": + operand2 = ops.to_bool(operand2) + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) return ExtensionOverrides.and_(operand1, operand2, *args, var_info=var_info, **kwargs) @staticmethod @@ -773,22 +785,30 @@ def logical_not(operand, *args, var_info=None, **kwargs): const_one = ops.constant(0, ret_type) const_one = ops.broadcast(const_one, tile_size) ret = ops.eq(operand,const_one) - return ret, [tile_size, var_info[ret]] + return ret, [tile_size, "i1"] @staticmethod def logical_or(operand1, operand2, *args, var_info=None, **kwargs): - op_type = var_info[operand1] + op_type1 = var_info[operand1] + op_type2 = var_info[operand2] # Type check & auto cast - if op_type[1] != "i1": - raise 
NotImplementedError("Logical operation with not bool data type") + if op_type1[1] != "i1": + operand1 = ops.to_bool(operand1) + if op_type2[1] != "i1": + operand2 = ops.to_bool(operand2) + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) return ExtensionOverrides.or_(operand1, operand2, *args, var_info=var_info, **kwargs) @staticmethod def logical_xor(operand1, operand2, *args, var_info=None, **kwargs): - op_type = var_info[operand1] + op_type1 = var_info[operand1] + op_type2 = var_info[operand2] # Type check & auto cast - if op_type[1] != "i1": - raise NotImplementedError("Logical operation with not bool data type") + if op_type1[1] != "i1": + operand1 = ops.to_bool(operand1) + if op_type2[1] != "i1": + operand2 = ops.to_bool(operand2) + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) return ExtensionOverrides.xor(operand1, operand2, *args, var_info=var_info, **kwargs) @staticmethod @@ -1006,8 +1026,10 @@ def convert_index(self, expr, buffer): expr_str = expr_str.replace("//", " floordiv ") else: raise NotImplementedError("What is this case?") - - indices = [expr.args[0]] + first_arg = expr.args[0] + if len(first_arg.free_symbols) != 1: + raise NotImplementedError("What is this case?") + indices = [list(first_arg.free_symbols)[0]] args = ", ".join(map(str, indices)) map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>") args = ", ".join([f"%{i}" for i in indices]) From 54f450a44101425fcc3dfa30d15e761ce1b53c33 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 10 Sep 2025 13:56:28 +0000 Subject: [PATCH 007/194] [Device] register amp --- Scheduler/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index ffe8e4fc..d10df556 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -179,6 +179,7 @@ def setup_device(): ) 
torch.utils.rename_privateuse1_backend("npu") + torch._register_device_module("extension_device", module) from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, From 8985ab8f47779be62b0efffafbcc24a33f7da134 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 12 Sep 2025 11:52:03 +0000 Subject: [PATCH 008/194] [Frontend+Test] Support scatter pattern with a test case --- PyTorchSimFrontend/extension_device.cpp | 195 ++++++++++++++---- .../mlir/mlir_codegen_backend.py | 22 +- PyTorchSimFrontend/mlir/mlir_common.py | 2 - tests/test_indirect_access.py | 36 ++++ 4 files changed, 205 insertions(+), 50 deletions(-) diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp index b728a852..f1351fab 100644 --- a/PyTorchSimFrontend/extension_device.cpp +++ b/PyTorchSimFrontend/extension_device.cpp @@ -424,56 +424,165 @@ void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack } TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu_indices", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("abs", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("abs_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("absolute", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("absolute.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("absolute_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("add_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("cat", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("cat.names", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("cat.names_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("div.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("div.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("div_.Scalar", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("div_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("eq.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("eq.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("eq.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("equal", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("erf", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("erf.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("erf_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("erfc", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("erfc.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("erfc_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("exp", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ge.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ge.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ge.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gt.Tensor_out", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("le.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("lt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ne.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ne.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ne.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("ne.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("logical_and", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_and_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_not", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_not_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_or", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_or_", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_xor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_xor.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("logical_xor_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("neg", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("neg_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("mul.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("mul_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("pow.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Tensor_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Tensor_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow.Tensor_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("pow_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("sub.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sub.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sub_.Scalar", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sub_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("sum", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sum.DimnameList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sum.dim_DimnameList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sum.dim_IntList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("resize_as_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + // Foreach ops + m.impl("_foreach_add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + // Indexed + m.impl("index_add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_add_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_copy.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_copy_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_fill.int_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_fill.int_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_fill.int_Scalar_out", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_fill.int_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_fill_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("tril_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("triu_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("nll_loss2d_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nll_loss2d_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("scatter.src_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("scatter.value_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("index_put.Default", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("mm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("sigmoid.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("gather.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("silu.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("all.all_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("_local_scalar_dense", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("_log_softmax", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("_log_softmax_backward_data", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("mse_loss.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_lerp_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_mul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_addcmul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_sqrt", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_div_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_addcdiv_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("_native_multi_head_attention", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("where.self", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ge.Tensor", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_and.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_or.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_not.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("min", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("max", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("index_select", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + + m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); + m.impl("zeros_like", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); } // This basic implementation doesn't bother dealing with different device indices diff --git 
a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index f8195b58..382825f5 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1171,12 +1171,24 @@ def load(self, name: str, index: sympy.Expr): self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] return out - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): + def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs): index = self.rename_indexing(index) - dram_var = self.kernel_group.args.output(name) dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] + # Handle scatter store + if "tmp" in str(index): + if mode == "atomic_add": + # Convert the output buffer type to the inplace buffer + arg_name = V.graph.scheduler.mutation_real_name.get(name, name) + if arg_name not in self.kernel_group.args.inplace_buffers: + self.kernel_group.args.make_inplace(arg_name, arg_name) + + loaded_value = ops.load(name, index) + value = ops.add(loaded_value, value) + index, _ = self.convert_indirect_indexing(index) + dram_var = self.kernel_group.args.output(name) + # Prepare dma instruction local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index) vlane_split_axis = local_tile_desc.vmap.vlane_split_axis @@ -1736,9 +1748,9 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe total_dims = [int(str(i)[5:]) for i in self.itervars] local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane) local_dims.sort() # Assume that smaller index is placed in the outer loop - indirect_dims = [f"{i}" for i in index.free_symbols if "tmp" in str(i)] - for indirect_dim in indirect_dims: - index = index.replace(sympy.Symbol(indirect_dim), 0) + indirect_syms = [s for s in index.free_symbols if "tmp" in s.name] + index = index.subs({s: 0 
for s in indirect_syms}, simultaneous=True) + indirect_dims = [f"{i}" for i in indirect_syms] # Reduction can have two type of tile size if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)): diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index f4dbe678..15408c0d 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -792,8 +792,6 @@ def codegen_kernel(self, kernel_name): code.splice(self.codegen_global_init()) code.writeline(f'func.func @{kernel_decl_name}({arg_defs})') with code.indent(): - for old, new in self.kernel_group.args.aliases(): - code.writeline(f"auto {old} = {new};") # Loop body part code.splice(self.codegen_loops()) return code.getvalue() diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index c6afaf86..6cfa7b58 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -43,6 +43,40 @@ def test_embedding(device, vocab_size, dim): cpu_res = cpu_emb(cpu_prompt) test_result("Embedding", res, cpu_res) +def test_scatter_add(device, num_tokens=256, hidden_size=256, num_assignments=3, dtype=torch.float32, seed=0): + torch.manual_seed(seed) + + def scatter_only(out, token_indices, weighted_output): + # token_indices: [N] (long), weighted_output: [N, H] + out.index_add_(0, token_indices, weighted_output) + return out + + out = torch.randn(num_tokens, hidden_size, dtype=dtype) + out_cp = out.clone() + token_indices = torch.randint(0, num_tokens, (num_assignments,)) + weighted_output = torch.randn(num_assignments, hidden_size, dtype=dtype) + + cpu_out = scatter_only(out, token_indices, weighted_output) + + out = out_cp.to(device=device) + token_indices = token_indices.to(device=device) + weighted_output = weighted_output.to(device=device) + opt_fn = torch.compile(dynamic=False)(scatter_only) + res = opt_fn(out, token_indices, 
weighted_output) + test_result("ScatterAdd(index_add_)", res, cpu_out) + +def test_scatter_full(device, size=(128, 128)): + def vectoradd(a, idx, b): + a[idx, :] = b + return a + x = torch.randn(size, dtype=torch.float32).to(device=device) + idx = torch.randint(0,128, [128]).to(device=device) + y = torch.randn(128, dtype=torch.float32).to(device=device) + opt_fn = torch.compile(dynamic=False)(vectoradd) + res = opt_fn(x, idx, y) + out = vectoradd(x.cpu(), idx.cpu(), y.cpu()) + test_result("Indirect VectorAdd", res, out) + if __name__ == "__main__": import os import sys @@ -51,5 +85,7 @@ def test_embedding(device, vocab_size, dim): from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() device = module.custom_device() + test_scatter_full(device) + test_scatter_add(device) test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file From 1c2c8bf010661b2695b932288cb6ced19dfd47fa Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 5 Dec 2025 13:12:45 +0000 Subject: [PATCH 009/194] [Fix] minor bugs --- PyTorchSimFrontend/extension_config.py | 2 +- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++--- Scheduler/scheduler.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 239bbefe..8d668b58 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -67,7 +67,7 @@ def __getattr__(name): "multi_tile_conv", "subtile" } - if opt_level == "all" or opt_level is "none": + if opt_level == "all" or opt_level == "none": pass elif isinstance(opt_level, list): # Check if provided list contains only valid options diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 382825f5..5a29bc87 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ 
-1381,12 +1381,12 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): dim_list = [] for idx in range(len(tile_size)): # Prepare initial values - offset = tile_desc.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vlane_stride + offset = tile_desc.vmap.vlane_stride #* strides[idx] + outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): div_coeff = self.get_const_cse(strides[idx], "index") mod_coeff = self.get_const_cse(tile_size[idx], "index") - vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index") + vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") vlane_outer_coeff = self.get_const_cse(outer_sz, "index") nr_vector_lane = self.get_const_cse(self.vector_lane, "index") vlane_coeff = self.get_const_cse(0, "i64") diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index d10df556..31dbf6c0 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -179,7 +179,7 @@ def setup_device(): ) torch.utils.rename_privateuse1_backend("npu") - torch._register_device_module("extension_device", module) + torch._register_device_module("npu", module) from torch._inductor.codegen.common import ( get_scheduling_for_device, get_wrapper_codegen_for_device, From 1895958e75c7f094b35927734be2d76d4fda661e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 05:56:29 +0000 Subject: [PATCH 010/194] [Fix] Fix the access to wrong variable --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- PyTorchSimFrontend/mlir/mlir_template.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 5a29bc87..a14dd10b 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1402,7 +1402,7 @@ def _index_expr(self,
tile_desc, renamed_expression, index, base_vector_index): vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size) dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) - if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset + if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset stride_dim = ops.modular(dim, vlane_stride_vec) outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec) dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec)) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b51c2794..cc17ada1 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -1062,7 +1062,7 @@ def store_reduction_epilogue(self, name, index, value): if self.welford_reduce_out is not None: # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2 - divider = ops.constant(float(self.reduction_axis_size), "f32") + divider = ops.constant(float(self.r_dim_size), "f32") if self.buffer_types[name][1] > 1: divider_vec = ops.broadcast(divider, new_vec_size) else: From cd14109e6db330170ebbfe6b2bfb1aa13a4f8867 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 05:56:56 +0000 Subject: [PATCH 011/194] [Log] Add print lock to prevent log crash --- Simulator/simulator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 91d53b09..4786fd32 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -16,6 +16,8 @@ from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config +print_lock = threading.Lock() + TORCH_TO_NUMPY = { torch.float32: np.float32, torch.float64: np.float64, @@ -157,9 +159,12 @@ def show_progress(): while not finished: i = (i + 1) % 3 tail = "." * i + " " * (3-i) - sys.stdout.write("\r[Gem5] Gem5 is running." 
+ tail) + with print_lock: + sys.stdout.write("\r[Gem5] Gem5 is running." + tail) + sys.stdout.flush() time.sleep(1) - print("") + with print_lock: + print("") dir_path = os.path.join(os.path.dirname(target_binary), "m5out") gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py") From 5fe87e9a64168c7a1f8640801e1d23fadbeb8c4e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 10:07:22 +0000 Subject: [PATCH 012/194] [Device] Add custom zero_, zeros_like --- PyTorchSimFrontend/extension_device.cpp | 73 +++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp index f1351fab..cfaecf2b 100644 --- a/PyTorchSimFrontend/extension_device.cpp +++ b/PyTorchSimFrontend/extension_device.cpp @@ -45,6 +45,16 @@ static py::object to_torch_dtype(at::ScalarType st) { } } +static inline at::MemoryFormat fix_memory_format(c10::optional<at::MemoryFormat> mf_opt) { + if (!mf_opt.has_value()) return at::MemoryFormat::Contiguous; + + auto mf = mf_opt.value(); + if (mf == at::MemoryFormat::Preserve) { + return at::MemoryFormat::Contiguous; + } + return mf; +} + static uint64_t op_counter = 0; static uint64_t last_saved_value = 0; @@ -339,7 +349,7 @@ at::Tensor custom_empty(c10::IntArrayRef size, c10::optional<at::ScalarType> dty constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); auto dtype = c10::dtype_or_default(dtype_opt); - return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, optional_memory_format); + return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, fix_memory_format(optional_memory_format)); } at::Tensor& custom_arange_start_out_impl( @@ -386,6 +396,62 @@ static at::Tensor custom_to_dtype_impl(const at::Tensor& self, return at::native::to(self, dtype, non_blocking, copy, memory_format); } +at::Tensor custom_zeros_like( + const at::Tensor& input, +
c10::optional<at::ScalarType> dtype_opt, + c10::optional<at::Layout> layout_opt, + c10::optional<at::Device> device_opt, + c10::optional<bool> pin_memory_opt, + c10::optional<c10::MemoryFormat> memory_format_opt) +{ + // dtype / layout / device fallback + auto dtype = dtype_opt.value_or(input.scalar_type()); + auto layout = layout_opt.value_or(input.layout()); + auto device = device_opt.value_or(input.device()); + auto memfmt = memory_format_opt.value_or(c10::MemoryFormat::Contiguous); + + TORCH_CHECK( + device.type() == c10::DeviceType::PrivateUse1, + "custom_zeros_like: device must be PrivateUse1"); + + at::Tensor out = custom_empty( + input.sizes(), + dtype, + layout, + device, + pin_memory_opt, + memfmt + ); + size_t nbytes = out.numel() * out.element_size(); + void* ptr = out.mutable_data_ptr(); + + TORCH_CHECK(ptr != nullptr, + "custom_zeros_like: out.mutable_data_ptr() returned NULL"); + std::memset(ptr, 0, nbytes); + return out; +} + +at::Tensor& custom_zero_impl(at::Tensor& self) +{ + TORCH_CHECK( + self.device().type() == c10::DeviceType::PrivateUse1, + "custom_zero_: expected a PrivateUse1 device tensor"); + + if (self.numel() == 0) { + return self; + } + + void* data = self.mutable_data_ptr(); + TORCH_CHECK(data != nullptr, + "custom_zero_: self.mutable_data_ptr() returned NULL " + "(storage was not allocated)"); + + size_t nbytes = self.numel() * self.element_size(); + std::memset(data, 0, nbytes); + + return self; +} + // With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend. // For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key.
// Later in this file, we map a custom device to the PrivateUse1 device type, @@ -405,6 +471,8 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { m.impl("as_strided", at::native::as_strided_tensorimpl); m.impl("view", at::native::view); m.impl("arange.start_out", &custom_arange_start_out_impl); + m.impl("zeros_like", &custom_zeros_like); + m.impl("zero_", &custom_zero_impl); } TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) { @@ -580,9 +648,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { m.impl("max", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("index_select", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("zeros_like", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); } // This basic implementation doesn't bother dealing with different device indices From db18cbd3fac47f9a3c88462a6fdc5941aa55af94 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 10:57:01 +0000 Subject: [PATCH 013/194] [Frontend/Spike] Use 64byte aligned buffer size --- PyTorchSimFrontend/mlir/mlir_caller_codegen.py | 4 +++- tests/Mixtral_8x7B/test_attention.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 38a1f7a9..a539bdb9 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -1,4 +1,5 @@ import os +import math import subprocess import shlex import re @@ -100,7 +101,8 @@ def generate_args_define(self): bits = 8 else: bits = torch.iinfo(arg_type).bits - self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}') + buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes + 
self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}') name_set.add(arg_name) self.writeline(self.newline) diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index 6a7747f7..58955928 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -166,8 +166,8 @@ def test_rmsnorm(device, seq=32): from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() device = module.custom_device() - test_rmsnorm(device, seq=1) - test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) + #test_rmsnorm(device, seq=1) + #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) test_decode(device, 32, 3) #test_attention(device) #test_ffn(device) From 11524280328b41bcecb728b80cb730cd5835f3b5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 16:06:32 +0000 Subject: [PATCH 014/194] [Refactor] Seperate OpOverrides --- .../mlir/mlir_codegen_backend.py | 760 +----------- PyTorchSimFrontend/mlir/mlir_ops.py | 1034 +++++++++++++++++ PyTorchSimFrontend/mlir/mlir_template.py | 6 +- 3 files changed, 1049 insertions(+), 751 deletions(-) create mode 100644 PyTorchSimFrontend/mlir/mlir_ops.py diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index a14dd10b..cda996ab 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -24,6 +24,7 @@ from PyTorchSimFrontend import extension_config from . 
import mlir_common from .mlir_common import LoopLevel, LoopNest +from .mlir_ops import ExtensionOverrides from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest def reduction_init(reduction_type, dtype): @@ -56,19 +57,6 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value): return ops.logical_and(vector_value, init_value) raise AssertionError(reduction_type) -def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape): - if reduction_type == "sum": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" - if reduction_type == "prod": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" - if reduction_type == "max": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" - if reduction_type == "min": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" - if reduction_type == "any": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" - raise AssertionError(reduction_type) - class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): def __init__(self): super().__init__() @@ -205,734 +193,6 @@ def generate(self, is_inference): def memory_plan(self): self.lines = memory_planning.MemoryPlanner(self).plan(self.lines) -class ExtensionOverrides(common.OpOverrides): - # Binary element wise operations - @staticmethod - def custom_cast(operand, target_type, *args, var_info=None, **kwargs): - dtype = var_info[operand][1] - if dtype == "index": - ret = ops.index_cast(operand, target_type) - else: - ret = ops.to_dtype(operand, target_type) - return ret, var_info[ret] - - @staticmethod - def binary_elementwise_common(operand1, operand2, var_info): - operand1.bounds = operand1.bounds.unknown() - operand2.bounds = operand2.bounds.unknown() - op_type1 = 
var_info[operand1] - op_type2 = var_info[operand2] - # Tile size check - if op_type1[0] != op_type2[0]: - # Try to broad cast - lhs_tile_size, lhs_dtype = op_type1 - rhs_tile_size, rhs_dtype = op_type2 - if lhs_tile_size > rhs_tile_size: - operand2 = ops.broadcast(operand2, lhs_tile_size) - op_type2 = var_info[operand2] - elif lhs_tile_size < rhs_tile_size: - operand1 = ops.broadcast(operand1, rhs_tile_size) - op_type1 = var_info[operand1] - - # Data type check - if op_type1[1] != op_type2[1]: - if op_type1[1] == "index" or op_type1 == "index": - if op_type1[1] == "index": - operand1 = ops.index_cast(operand1, op_type2[1]) - op_type1 = var_info[operand1] - if op_type2[1] == "index": - operand2 = ops.index_cast(operand2, op_type1[1]) - op_type2 = var_info[operand2] - elif op_type1[1][0] == "i" and op_type2[1][0] == "f": - operand1 = ops.to_dtype(operand1, op_type2[1]) - op_type1 = var_info[operand1] - elif op_type1[1][0] == "f" and op_type2[1][0] == "i": - operand2 = ops.to_dtype(operand2, op_type1[1]) - op_type2 = var_info[operand2] - elif op_type1[1][0] == op_type2[1][0]: - if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]: - operand2 = ops.ext(operand2, op_type1[1]) - op_type2 = var_info[operand2] - elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]: - operand1 = ops.ext(operand1, op_type2[1]) - op_type1 = var_info[operand1] - else: - raise NotImplementedError("Unsupported type converting") - - # Updated var info - tile_size = op_type1[0] - ret_type = op_type1[1] - return tile_size, ret_type, operand1, operand2 - - @staticmethod - def add(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - opcode = f'arith.add{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - 
@staticmethod - def sub(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - opcode = f'arith.sub{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def mul(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - opcode = f'arith.mul{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def div(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - if ret_type[0] == "f": - opcode = f'arith.divf' - else: - opcode = f'arith.divui' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def truediv(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - if ret_type[0] == "f": - opcode = f'arith.divf' - else: - opcode = f'arith.divui' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def modular(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - if ret_type[0] == "f": - raise NotImplementedError("Not 
support remainder operation for floating point") - else: - opcode = f'arith.remui' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def minimum(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - if ret_type[0] == "f": - opcode = f'arith.minimumf' - else: - opcode = f'arith.minui' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def maximum(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - if ret_type[0] == "f": - opcode = f'arith.maximumf' - else: - opcode = f'arith.maxui' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): - src_mlir_dtype = var_info[operand][1] - if src_mlir_dtype == "index": - operand = ops.index_cast(operand, "i64") - src_mlir_dtype = var_info[operand][1] - - tile_size = var_info[operand][0] - if isinstance(dst_mlir_dtype, torch.dtype): - dst_mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_mlir_dtype] - dst_bits = mlir_common.MLIR_TO_BIT[dst_mlir_dtype] - src_bits = mlir_common.MLIR_TO_BIT[src_mlir_dtype] - shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype - src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype - if dst_mlir_dtype[0] == "i" and src_mlir_dtype[0] == "f": - return f"arith.fptoui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - if dst_mlir_dtype[0] == "f" and src_mlir_dtype[0] == "i": - return f"arith.uitofp %{operand} : {src_shape} to {shape}", 
[tile_size, dst_mlir_dtype] - if dst_mlir_dtype[0] == "i": - if dst_bits > src_bits: - return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - elif dst_bits < src_bits: - return f"arith.trunc %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - return f"arith.maxui %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype] - elif dst_mlir_dtype[0] == "f": - if dst_bits > src_bits: - return f"arith.extf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - elif dst_bits < src_bits: - return f"arith.trunf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype] - return f"arith.maximumf %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype] - else: - raise NotImplementedError("Unsupported type for to_dtype ops") - - @staticmethod - def constant(value, src_type, *args, var_info=None, **kwargs): - if isinstance(src_type, torch.dtype): - src_type = mlir_common.DTYPE_TO_MLIR[src_type] - - if "inf" == str(value) or "-inf" == str(value) or "nan" == str(value): - value = f"0x{mlir_common.MLIR_INF[str(value)][src_type]:x}" - # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3) - elif "e" in str(value): - value = format(float(value), ".20f") - elif src_type[0] == "f": - value = format(float(value), ".20f") - elif src_type[0] == "i": - value = int(value) - return f'arith.constant {value} : {src_type}', [1, src_type] - - @staticmethod - def alloc(size, src_type, *args, var_info=None, **kwargs): - return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type] - - @staticmethod - def extractelement(operand, idx, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype] - - # transcendental functions - @staticmethod - def exp(operand, *args, var_info=None, **kwargs): - # 
Check scalar - op_type = var_info[operand] - if op_type[0] == 1: - operand = ops.broadcast(operand, 4) - val = ops.exp(operand) - result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.exp %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def exp2(operand, *args, var_info=None, **kwargs): - # Hands-on part: implement exp2 using math.exp2 - # var_info = {operand: [tile_size, dtype]} - # Ex) var_info[operand] = [8, "f32"] - - ln2 = math.log(2) - coeff = ops.constant(ln2, "f32") - operand = ops.mul(operand, coeff) - return ops.exp(operand), var_info[operand] - - @staticmethod - def erf(operand, *args, var_info=None, **kwargs): - # Check scalar - op_type = var_info[operand] - if op_type[0] == 1: - operand = ops.broadcast(operand, 4) - val = ops.erf(operand) - result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.erf %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def tanh(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - - # Check scalar - op_type = var_info[operand] - if op_type[0] == 1: - operand = ops.broadcast(operand, 4) - val = ops.tanh(operand) - result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.tanh %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def sin(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - - # Check scalar - 
op_type = var_info[operand] - if op_type[0] == 1: - operand = ops.broadcast(operand, 4) - val = ops.sin(operand) - result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.sin %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def cos(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - - # Check scalar - op_type = var_info[operand] - if op_type[0] == 1: - operand = ops.broadcast(operand, 4) - val = ops.cos(operand) - result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.cos %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def sqrt(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.sqrt %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def rsqrt(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - 
def pow(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - # Type check & auto cast - if ret_type[0] != "f": - operand1, ret_type = ops.to_dtype(operand1, "f32") - var_info[operand1] = ret_type - - # Type check & auto cast - if ret_type[0] != "f": - operand2, ret_type = ops.to_dtype(operand2, "f32") - var_info[operand2] = ret_type - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type] - - @staticmethod - def log(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.log %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def reciprocal(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - - return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype] - - @staticmethod - def ext(operand, dtype, *args, var_info=None, **kwargs): - op_type = var_info[operand] - shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}" - target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}" - if op_type[0] == "f": - opcode = f'arith.extf' - else: - opcode = f'arith.extui' - return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype] - - # Logical operations - @staticmethod - def neg(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # 
Type check & auto cast - if dtype[0] != "f": - operand, dtype = ops.to_dtype(operand, "f32") - var_info[operand] = dtype - - shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'arith.negf %{operand} : {shape}', [tile_size, dtype] - - @staticmethod - def eq(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "oeq" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "eq" - else: - raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] - - @staticmethod - def ne(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "one" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "ne" - else: - raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] - - @staticmethod - def lt(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "olt" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "slt" - else: - raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : 
{shape}', [tile_size, "i1"] - - @staticmethod - def gt(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "ogt" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "sgt" - else: - raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] - - @staticmethod - def le(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "ole" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "sle" - else: - raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] - - @staticmethod - def ge(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - if ret_type[0] == "f": - op_type = "arith.cmpf" - attribute = "oge" - elif ret_type[0] == "i": - op_type = "arith.cmpi" - attribute = "sge" - else: - raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] - - @staticmethod - def and_(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - - # Type check & auto cast - if op_type1[1][0] != 
"i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand1] = dtype - - # Type check & auto cast - if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand2] = dtype - - ret_type = op_type1[1] - tile_size = op_type1[0] - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def or_(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - - # Type check & auto cast - if op_type1[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand1] = dtype - - # Type check & auto cast - if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand2] = dtype - - ret_type = op_type1[1] - tile_size = op_type1[0] - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def xor(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - - # Type check & auto cast - if op_type1[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand1] = dtype - - # Type check & auto cast - if op_type2[1][0] != "i": - operand1, dtype = ops.to_dtype(operand1, "i32") - var_info[operand2] = dtype - - ret_type = op_type1[1] - tile_size = op_type1[0] - - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] - - @staticmethod - def to_bool(operand, *args, var_info=None, **kwargs): - tile_size, ret_type = var_info[operand] - const_one = ops.constant(0, ret_type) - if tile_size > 1: - const_one = ops.broadcast(const_one, tile_size) - ret = ops.ne(operand, const_one) - return ret, [tile_size, "i1"] - - @staticmethod - 
def logical_and(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - # Type check & auto cast - if op_type1[1] != "i1": - operand1 = ops.to_bool(operand1) - if op_type2[1] != "i1": - operand2 = ops.to_bool(operand2) - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - return ExtensionOverrides.and_(operand1, operand2, *args, var_info=var_info, **kwargs) - - @staticmethod - def logical_not(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - - ret_type = op_type[1] - tile_size = op_type[0] - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - const_one = ops.constant(0, ret_type) - const_one = ops.broadcast(const_one, tile_size) - ret = ops.eq(operand,const_one) - return ret, [tile_size, "i1"] - - @staticmethod - def logical_or(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - # Type check & auto cast - if op_type1[1] != "i1": - operand1 = ops.to_bool(operand1) - if op_type2[1] != "i1": - operand2 = ops.to_bool(operand2) - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - return ExtensionOverrides.or_(operand1, operand2, *args, var_info=var_info, **kwargs) - - @staticmethod - def logical_xor(operand1, operand2, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] - # Type check & auto cast - if op_type1[1] != "i1": - operand1 = ops.to_bool(operand1) - if op_type2[1] != "i1": - operand2 = ops.to_bool(operand2) - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - return ExtensionOverrides.xor(operand1, operand2, *args, var_info=var_info, **kwargs) - - @staticmethod - def relu(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - 
tile_size = op_type[0] - ret_type = "f32" - return ops.maximum(operand, ops.constant(0.0, "f32")), [tile_size, ret_type] - - @staticmethod - def sigmoid(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] - tile_size = op_type[0] - ret_type = "f32" - one = ops.constant(1, "f32") - return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, ret_type] - - # Special operaitons - @staticmethod - def where(condition, operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - cond_type = var_info[condition] - operand_type = var_info[operand1] - if cond_type[0] < tile_size: - condition = ops.broadcast(condition, tile_size) - elif cond_type[0] > tile_size: - operand1 = ops.broadcast(operand1, cond_type[0]) - operand2 = ops.broadcast(operand2, cond_type[0]) - tile_size, ret_type = var_info[operand1] - shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else "" - return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type] - - @staticmethod - def step(size, dtype, *args, **kwargs): - index_shape = f"vector<{size}x{dtype}>" - return f"vector.step : {index_shape}", [size, dtype] - - @staticmethod - def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs): - result = body() - val = ops.constant(other, dtype, *args, **kwargs) - result = ops.where(mask, result, val) - return result, var_info[result] - - @staticmethod - def index_cast(operand, target_type, *args, var_info=None, **kwrags): - op_type = var_info[operand] - src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1] - des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type - return f"arith.index_cast %{operand} : {src_shape} to 
{des_shape}", [op_type[0], target_type] - - @staticmethod - def broadcast_unflat(operand1, target_size, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1] - des_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only - - expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}" - return expand, [target_size, op_type1[1]] - - @staticmethod - def broadcast(operand1, target_size, *args, var_info=None, **kwargs): - op_type1 = var_info[operand1] - src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1] - des_shape = f"vector<{target_size}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only - - # Special case for length 2 vector. We used this vector to avoid scalar operations... - if op_type1[0] != 1 and target_size % op_type1[0] == 0: - unflat_operand = ops.broadcast_unflat(operand1, target_size) - unflat_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>" - expand = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {des_shape}" - elif op_type1[0] == 1: - expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}" - else: - raise NotImplementedError("Not supporting broadcast type...") - return expand, [target_size, op_type1[1]] - - @staticmethod - def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs): - operand_type = var_info[operand] - return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type - - @staticmethod - def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs): - if red_size == 1: - final_reduced_shape = f"{type_name}" - line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape) - else: - final_reduced_shape = 
f"vector<{red_size}x{type_name}>" - new_vshape= f"vector<{vec_size//red_size}x{red_size}x{type_name}>" - value = ops.shape_cast(acc, red_shape, new_vshape) - line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape) - return line, [red_size, type_name] - - @staticmethod - def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs): - if compute_vec_size == 1: - vshape = f"{mlir_dtype}" - operation = "affine.load" - line = f"{operation} %{buffer}[{indices}] : {buffer_shape}" - else: - vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" - operation = "affine.vector_load" - line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}" - return line, [compute_vec_size, mlir_dtype] - - @staticmethod - def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs): - compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1] - - if compute_vec_size == 1: - vshape = f"{mlir_dtype}" - operation = "affine.store" - line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}" - else: - vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" - operation = "affine.vector_store" - line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}" - - if buffer_name is not None: - return common.DeferredLine(buffer_name, line), [None, None] - else: - return line, [None, None] RTYPE_TO_MLIR = { "sum": "add", @@ -1214,7 +474,7 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs) # Generate vector store instruction _, operand_type = self.var_info[value] if mlir_dtype != operand_type: - value = ops.custom_cast(value, mlir_dtype) + value = ops.to_dtype(value, mlir_dtype) if compute_vec_size < self.var_info[value][0]: value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: 
vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") @@ -1256,6 +516,8 @@ def reduction(self, dtype, src_dtype, reduction_type, value): vec_len = self.kernel_group.tile_desc.get_compute_vec_size() reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name) + + # Prepare reduction init with self.override_buffer_cse(cse=self.const_cse, buffer=self.const_buffer): init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name) @@ -1289,10 +551,12 @@ def reduction(self, dtype, src_dtype, reduction_type, value): _, mask_var = self.get_mask() if mask_var is not None: value = ops.where(mask_var, value, init_vec) + result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg) + result = ops.to_dtype(result, type_name) + self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iter_var_list[-1], reduced_shape) self.compute_body_loop.affine_yield[result] = reduced_shape - # Register affine yield var for reduction_depth, acc in enumerate(acc_var_list[1:]): self.affine_yield[acc] = reduced_shape, reduction_depth @@ -1340,8 +604,8 @@ def store_reduction(self, name, index, value): sum, sqr_sum, _ = self.welford_reduce_out reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1) divider = self.get_const_cse(float(reduction_numel), "f32") - mean = ops.div(sum, divider) - sqr_mean = ops.div(sqr_sum, divider) + mean = ops.truediv(sum, divider) + sqr_mean = ops.truediv(sqr_sum, divider) mean_sqr = ops.mul(mean, mean) variance = ops.sub(sqr_mean, mean_sqr) m2 = ops.mul(variance, divider) @@ -1401,10 +665,10 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): vlane_vec_size = 4 vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size) - dim = ops.modular(ops.div(vector_index, div_vec), mod_vec) + dim = ops.remainder(ops.truncdiv(vector_index, div_vec), mod_vec) if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset - stride_dim = 
ops.modular(dim, vlane_stride_vec) - outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec) + stride_dim = ops.remainder(dim, vlane_stride_vec) + outer_dim = ops.remainder(ops.truncdiv(dim, vlane_stride_vec), vlane_outer_vec) dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec)) vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset") diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py new file mode 100644 index 00000000..ebf0c111 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -0,0 +1,1034 @@ +import math +import torch + +from torch._inductor.codegen import common +from torch._inductor.virtualized import V, _ops as ops +from . import mlir_common + +def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape): + if reduction_type == "sum": + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + if reduction_type == "prod": + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + if reduction_type == "max": + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + if reduction_type == "min": + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + if reduction_type == "any": + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + raise AssertionError(reduction_type) + +class ExtensionOverrides(common.OpOverrides): + @staticmethod + def constant(value, src_type, *args, var_info=None, **kwargs): + if isinstance(src_type, torch.dtype): + src_type = mlir_common.DTYPE_TO_MLIR[src_type] + + str_val = str(value) + if "inf" == str_val or "-inf" == str_val or "nan" == str_val: + value = 
f"0x{mlir_common.MLIR_INF[str_val][src_type]:x}" + # scientific notation check + elif "e" in str_val: + value = format(float(value), ".20f") + elif src_type[0] == "f": + value = format(float(value), ".20f") + elif src_type[0] == "i": + value = int(float(value)) + return f'arith.constant {value} : {src_type}', [1, src_type] + + @staticmethod + def broadcast(operand, target_size, *args, var_info=None, **kwargs): + src_size, dtype = var_info[operand] + + src_shape = f"vector<{src_size}x{dtype}>" if src_size > 1 else dtype + dst_shape = f"vector<{target_size}x{dtype}>" + + op_str = "" + # Special case for length 2 vector. We used this vector to avoid scalar operations... + if src_size > 1: + if target_size % src_size == 0: + unflat_operand = ops.broadcast_unflat(operand, target_size) + outer_dim = target_size // src_size + unflat_shape = f"vector<{outer_dim}x{src_size}x{dtype}>" + # Flatten back to 1D + op_str = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {dst_shape}" + else: + raise NotImplementedError( + f"Vector broadcast size mismatch: src={src_size} cannot broadcast to target={target_size}" + ) + elif src_size == 1: + op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}" + else: + raise ValueError(f"Invalid source size: {src_size}") + return op_str, [target_size, dtype] + + @staticmethod + def broadcast_unflat(operand, target_size, *args, var_info=None, **kwargs): + src_size, dtype = var_info[operand] + + outer_dim = target_size // src_size + src_shape = f"vector<{src_size}x{dtype}>" + dst_shape = f"vector<{outer_dim}x{src_size}x{dtype}>" + + op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}" + return op_str, [target_size, dtype] + + def load_seed(self, *args, **kwargs): + raise NotImplementedError + + def rand(self, *args, **kwargs): + raise NotImplementedError + + def randn(self, *args, **kwargs): + raise NotImplementedError + + def randint64(self, *args, **kwargs): + raise NotImplementedError + + # Special 
operaitons + @staticmethod + def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs): + result = body() + val = ops.constant(other, dtype, *args, **kwargs) + result = ops.where(mask, result, val) + return result, var_info[result] + + @staticmethod + def where(condition, operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + cond_type = var_info[condition] + operand_type = var_info[operand1] + if cond_type[0] < tile_size: + condition = ops.broadcast(condition, tile_size) + elif cond_type[0] > tile_size: + operand1 = ops.broadcast(operand1, cond_type[0]) + operand2 = ops.broadcast(operand2, cond_type[0]) + tile_size, ret_type = var_info[operand1] + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else "" + return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape}, {shape}", [tile_size, ret_type] + + @staticmethod + def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): + # Extract source information + src_mlir_dtype = var_info[operand][1] + tile_size = var_info[operand][0] + + # Normalize destination type (Torch dtype -> MLIR string) + if isinstance(dst_mlir_dtype, torch.dtype): + dst_mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_mlir_dtype] + + if src_mlir_dtype == "index" and dst_mlir_dtype != "index": + operand = ops.index_cast(operand, "i64") + src_mlir_dtype = "i64" # Update explicitly + + if dst_mlir_dtype == "index": + # If source is already index, return as is; otherwise cast + if src_mlir_dtype == "index": + return operand, [tile_size, "index"] + return ops.index_cast(operand, "index"), [tile_size, "index"] + + # Early return if types are identical + if src_mlir_dtype == dst_mlir_dtype: + return operand, [tile_size, dst_mlir_dtype] + + dst_bits = 
mlir_common.MLIR_TO_BIT[dst_mlir_dtype] + src_bits = mlir_common.MLIR_TO_BIT[src_mlir_dtype] + shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype + src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype + src_type_char = src_mlir_dtype[0] # 'i' or 'f' + dst_type_char = dst_mlir_dtype[0] # 'i' or 'f'o + + op_str = "" + + # Case A: Integer -> Float + if src_type_char == "i" and dst_type_char == "f": + op_str = f"arith.sitofp %{operand} : {src_shape} to {shape}" + # Case B: Float -> Integer + elif src_type_char == "f" and dst_type_char == "i": + op_str = f"arith.fptosi %{operand} : {src_shape} to {shape}" + # Case C: Integer -> Integer (Extension / Truncation) + elif src_type_char == "i" and dst_type_char == "i": + if dst_bits > src_bits: + op_str = f"arith.extsi %{operand} : {src_shape} to {shape}" + elif dst_bits < src_bits: + # Use arith.trunci for integer truncation + op_str = f"arith.trunci %{operand} : {src_shape} to {shape}" + else: + return operand, [tile_size, dst_mlir_dtype] + # Case D: Float -> Float (Extension / Truncation) + elif src_type_char == "f" and dst_type_char == "f": + if dst_bits > src_bits: + op_str = f"arith.extf %{operand} : {src_shape} to {shape}" + elif dst_bits < src_bits: + # Corrected 'trunf' to 'truncf' + op_str = f"arith.truncf %{operand} : {src_shape} to {shape}" + else: + return operand, [tile_size, dst_mlir_dtype] + else: + raise NotImplementedError(f"Unsupported conversion: {src_mlir_dtype} -> {dst_mlir_dtype}") + + return op_str, [tile_size, dst_mlir_dtype] + + @staticmethod + def identity(operand, *args, var_info=None, **kwargs): + operand_info = var_info[operand] + return operand, operand_info + + @staticmethod + def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs): + tile_size, current_src_type = var_info[operand] + + if isinstance(dtype, torch.dtype): + dst_mlir_type = mlir_common.DTYPE_TO_MLIR[dtype] + else: + dst_mlir_type = dtype + + 
src_bits = mlir_common.MLIR_TO_BIT[current_src_type] + dst_bits = mlir_common.MLIR_TO_BIT[dst_mlir_type] + + if src_bits != dst_bits: + raise ValueError( + f"Bitcast failed: Bit width mismatch. " + f"Src: {current_src_type}({src_bits}b) != Dst: {dst_mlir_type}({dst_bits}b)" + ) + + src_shape = f"vector<{tile_size}x{current_src_type}>" if tile_size > 1 else current_src_type + dst_shape = f"vector<{tile_size}x{dst_mlir_type}>" if tile_size > 1 else dst_mlir_type + + return f"arith.bitcast %{operand} : {src_shape} to {dst_shape}", [tile_size, dst_mlir_type] + + # Binary element wise operations + @staticmethod + def binary_elementwise_common(operand1, operand2, var_info): + operand1.bounds = operand1.bounds.unknown() + operand2.bounds = operand2.bounds.unknown() + op_type1 = var_info[operand1] + op_type2 = var_info[operand2] + # Tile size check + if op_type1[0] != op_type2[0]: + # Try to broad cast + lhs_tile_size, lhs_dtype = op_type1 + rhs_tile_size, rhs_dtype = op_type2 + if lhs_tile_size > rhs_tile_size: + operand2 = ops.broadcast(operand2, lhs_tile_size) + op_type2 = var_info[operand2] + elif lhs_tile_size < rhs_tile_size: + operand1 = ops.broadcast(operand1, rhs_tile_size) + op_type1 = var_info[operand1] + + # Data type check + if op_type1[1] != op_type2[1]: + if op_type1[1] == "index" or op_type1 == "index": + if op_type1[1] == "index": + operand1 = ops.index_cast(operand1, op_type2[1]) + op_type1 = var_info[operand1] + if op_type2[1] == "index": + operand2 = ops.index_cast(operand2, op_type1[1]) + op_type2 = var_info[operand2] + elif op_type1[1][0] == "i" and op_type2[1][0] == "f": + operand1 = ops.to_dtype(operand1, op_type2[1]) + op_type1 = var_info[operand1] + elif op_type1[1][0] == "f" and op_type2[1][0] == "i": + operand2 = ops.to_dtype(operand2, op_type1[1]) + op_type2 = var_info[operand2] + elif op_type1[1][0] == op_type2[1][0]: + if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]: + operand2 = ops.ext(operand2, op_type1[1]) + 
op_type2 = var_info[operand2] + elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]: + operand1 = ops.ext(operand1, op_type2[1]) + op_type1 = var_info[operand1] + else: + raise NotImplementedError("Unsupported type converting") + + # Updated var info + tile_size = op_type1[0] + ret_type = op_type1[1] + return tile_size, ret_type, operand1, operand2 + + @staticmethod + def abs(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def exp(operand, *args, var_info=None, **kwargs): + # Check scalar + op_type = var_info[operand] + if op_type[0] == 1: + operand = ops.broadcast(operand, 4) + val = ops.exp(operand) + result = ops.extractelement(val, 0) + return result, var_info[result] + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.exp %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def exp2(operand, *args, var_info=None, **kwargs): + # Hands-on part: implement exp2 using math.exp2 + # var_info = {operand: [tile_size, dtype]} + # Ex) var_info[operand] = [8, "f32"] + + ln2 = math.log(2) + coeff = ops.constant(ln2, "f32") + operand = ops.mul(operand, coeff) + return ops.exp(operand), var_info[operand] + + @staticmethod + def expm1(operand, *args, var_info=None, **kwargs): + coeff = ops.constant(1.0, "f32") + operand = ops.exp(operand) + operand = ops.sub(operand, coeff) + return operand, var_info[operand] + + @staticmethod + def sqrt(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.sqrt %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def relu(operand, *args, var_info=None, **kwargs): + src_mlir_dtype = 
var_info[operand][1] + tile_size = var_info[operand][0] + return ops.maximum(operand, ops.constant(0, src_mlir_dtype)), [tile_size, src_mlir_dtype] + + @staticmethod + def minimum(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + if ret_type[0] == "f": + opcode = f'arith.minimumf' + else: + opcode = f'arith.minsi' + return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def maximum(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + if ret_type[0] == "f": + opcode = f'arith.maximumf' + else: + opcode = f'arith.maxsi' + return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def cos(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + + # Check scalar + op_type = var_info[operand] + if op_type[0] == 1: + operand = ops.broadcast(operand, 4) + val = ops.cos(operand) + result = ops.extractelement(val, 0) + return result, var_info[result] + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.cos %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def sin(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + + # Check scalar + op_type = var_info[operand] + if op_type[0] == 1: + operand = ops.broadcast(operand, 4) + val = ops.sin(operand) + result = ops.extractelement(val, 0) + return result, var_info[result] + op_type = var_info[operand] + tile_size 
= op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.sin %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def tan(operand, *args, var_info=None, **kwargs): + sin_res = ops.sin(operand) + cos_res = ops.cos(operand) + operand = ops.truediv(sin_res, cos_res) + return operand, var_info[operand] + + @staticmethod + def lgamma(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def erf(operand, *args, var_info=None, **kwargs): + # Check scalar + op_type = var_info[operand] + if op_type[0] == 1: + operand = ops.broadcast(operand, 4) + val = ops.erf(operand) + result = ops.extractelement(val, 0) + return result, var_info[result] + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.erf %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def cosh(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def sinh(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def tanh(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + + # Check scalar + op_type = var_info[operand] + if op_type[0] == 1: + operand = ops.broadcast(operand, 4) + val = ops.tanh(operand) + result = ops.extractelement(val, 0) + return result, var_info[result] + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.tanh %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def acos(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def 
acosh(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def asin(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def asinh(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def atan2(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def atan(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def atanh(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def copysign(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def erfc(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def erfinv(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def frexp(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def hypot(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def log10(operand, *args, var_info=None, **kwargs): + val_ln = ops.log(operand) + + tile_size, dtype = var_info[val_ln] + inv_ln10 = 1/math.log(10) + const_op = ops.constant(inv_ln10, dtype) + + # Multiply: ln(x) * (1/ln(10)) + result = ops.mul(val_ln, const_op) + return result, var_info[result] + + @staticmethod + def log2(operand, *args, var_info=None, **kwargs): + val_ln = ops.log(operand) + + tile_size, dtype = var_info[val_ln] + inv_ln10 = 1/math.log(2) + const_op = ops.constant(inv_ln10, dtype) + + # Multiply: ln(x) * (1/ln(10)) + result = ops.mul(val_ln, const_op) + return result, var_info[result] + + @staticmethod + def log(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + + shape = 
f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.log %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def log1p(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + const_one = ops.constant(1, dtype) + + # 3. 덧셈 연산: (x + 1) + # ops.add가 (result_ssa, result_info)를 반환한다고 가정 + val_add = ops.add(operand, const_one) + result = ops.log(val_add) + return result, var_info[result] + + @staticmethod + def nextafter(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def logical_and(operand1, operand2, *args, var_info=None, **kwargs): + if var_info[operand1][1] != "i1": + operand1 = ops.to_bool(operand1) + + if var_info[operand2][1] != "i1": + operand2 = ops.to_bool(operand2) + result = ops.and_(operand1, operand2) + return result, var_info[result] + + @staticmethod + def logical_or(operand1, operand2, *args, var_info=None, **kwargs): + if var_info[operand1][1] != "i1": + operand1 = ops.to_bool(operand1) + + if var_info[operand2][1] != "i1": + operand2 = ops.to_bool(operand2) + result = ops.or_(operand1, operand2) + return result, var_info[result] + + @staticmethod + def logical_xor(operand1, operand2, *args, var_info=None, **kwargs): + if var_info[operand1][1] != "i1": + operand1 = ops.to_bool(operand1) + + if var_info[operand2][1] != "i1": + operand2 = ops.to_bool(operand2) + result = ops.xor(operand1, operand2) + return result, var_info[result] + + @staticmethod + def logical_not(operand, *args, var_info=None, **kwargs): + op_info = var_info[operand] + tile_size = op_info[0] + dtype = op_info[1] + + zero_const = ops.constant(0, dtype) + result = ops.eq(operand, zero_const) + return result, var_info[result] + + @staticmethod + def bitwise_and(operand1, operand2, *args, var_info=None, **kwargs): + # Float check + if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"): + raise ValueError("Bitwise AND not supported for floats") + + 
result = ops.and_(operand1, operand2) + return result, var_info[result] + + @staticmethod + def bitwise_not(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + # Float check + if var_info[operand][1].startswith("f"): + raise ValueError("Bitwise NOT not supported for floats") + + neg_one = ops.constant(-1, dtype) + result = ops.xor(operand, neg_one) + return result, var_info[result] + + @staticmethod + def bitwise_or(operand1, operand2, *args, var_info=None, **kwargs): + # Float check + if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"): + raise ValueError("Bitwise AND not supported for floats") + + result = ops.or_(operand1, operand2) + return result, var_info[result] + + @staticmethod + def bitwise_xor(operand1, operand2, *args, var_info=None, **kwargs): + # Float check + if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"): + raise ValueError("Bitwise AND not supported for floats") + + result = ops.xor(operand1, operand2) + return result, var_info[result] + + @staticmethod + def bitwise_left_shift(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def bitwise_right_shift(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def rsqrt(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def sigmoid(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + one = ops.constant(1, dtype) + return ops.truediv(one, ops.expm1(operand)), [tile_size, dtype] + + @staticmethod + def fmod(operand1, operand2, *args, 
var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def isinf(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def isnan(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def round(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + + if dtype.startswith("f"): + return f"math.roundeven %{operand} : {shape}", [tile_size, dtype] + else: + return operand, [tile_size, dtype] + + @staticmethod + def floor(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + + if dtype.startswith("f"): + return f"math.floor %{operand} : {shape}", [tile_size, dtype] + else: + return operand, [tile_size, dtype] + + @staticmethod + def sign(operand, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def trunc(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + + if dtype.startswith("f"): + return f"math.trunc %{operand} : {shape}", [tile_size, dtype] + else: + return operand, [tile_size, dtype] + + @staticmethod + def ceil(operand, *args, var_info=None, **kwargs): + tile_size, dtype = var_info[operand] + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + + if dtype.startswith("f"): + return f"math.ceil %{operand} : {shape}", [tile_size, dtype] + else: + return operand, [tile_size, dtype] + + # Logical operations + @staticmethod + def neg(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + + shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype + return f'arith.negf 
%{operand} : {shape}', [tile_size, dtype] + + @staticmethod + def reciprocal(operand, *args, var_info=None, **kwargs): + op_type = var_info[operand] + tile_size = op_type[0] + dtype = op_type[1] + + # Type check & auto cast + if dtype.startswith("f"): + operand = ops.to_dtype(operand, "f32") + + return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype] + + @staticmethod + def eq(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "oeq" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "eq" + else: + raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def ne(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "one" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "ne" + else: + raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def lt(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "olt" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "slt" + else: + raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}") + + shape = 
f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def gt(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "ogt" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "sgt" + else: + raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def le(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "ole" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "sle" + else: + raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def ge(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + if ret_type[0] == "f": + op_type = "arith.cmpf" + attribute = "oge" + elif ret_type[0] == "i": + op_type = "arith.cmpi" + attribute = "sge" + else: + raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + + @staticmethod + def add(operand1, operand2, *args, var_info=None, 
**kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + opcode = f'arith.add{ret_type[0]}' + return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def sub(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + opcode = f'arith.sub{ret_type[0]}' + return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def mul(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + opcode = f'arith.mul{ret_type[0]}' + return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def pow(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + # Type check & auto cast + if ret_type.startswith("f"): + operand1 = ops.to_dtype(operand1, "f32") + + # Type check & auto cast + if ret_type.startswith("f"): + operand2 = ops.to_dtype(operand2, "f32") + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type] + + @staticmethod + def and_(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'arith.andi %{operand1}, 
%{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def or_(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def xor(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + + @staticmethod + def lshift(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def rshift(operand1, operand2, *args, var_info=None, **kwargs): + raise NotImplementedError + + @staticmethod + def truncdiv(operand1, operand2, *args, var_info=None, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + + if ret_type.startswith("f"): + raise ValueError("truncdiv is strictly for integers. 
Use truediv for floats.")
+
+        # arith.divsi: Signed Integer Division (Result is truncated)
+        return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def floordiv(operand1, operand2, *args, var_info=None, **kwargs):
+        """Emit a flooring signed integer division (`arith.floordivsi`).
+
+        Raises ValueError when the unified dtype is a float type.
+        """
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if ret_type.startswith("f"):
+            # Float floor division is usually done as divf followed by floor, so only integers are handled here.
+            raise ValueError("floordiv implementation expects integers based on definition.")
+
+        # arith.floordivsi: Floor Division for Signed Integers
+        return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def truediv(operand1, operand2, *args, var_info=None, **kwargs):
+        """Emit a floating-point division (`arith.divf`).
+
+        Raises ValueError when the unified dtype is not a float type.
+        """
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if not ret_type.startswith("f"):
+            raise ValueError(f"truediv expects float inputs, but got {ret_type}. Use int_truediv for integers.")
+
+        return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def int_truediv(operand1, operand2, *args, var_info=None, **kwargs):
+        """
+        True division for Integers (Int -> Float).
+        Promotes integers to floats, then performs floating-point division.
+        """
+        tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if not src_type.startswith("f"):
+            # Non-float operands are promoted to f32 before dividing.
+            target_float_type = "f32"
+            operand1 = ops.to_dtype(operand1, target_float_type)
+            operand2 = ops.to_dtype(operand2, target_float_type)
+            src_type = target_float_type
+
+        # Delegate to truediv; the result descriptor is read back from var_info.
+        result = ops.truediv(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def mod(operand1, operand2, *args, var_info=None, **kwargs):
+        """Emit a signed integer remainder (`arith.remsi`).
+
+        Raises NotImplementedError when the unified dtype is a float type.
+        """
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            raise NotImplementedError("Not support remainder operation for floating point")
+        else:
+            opcode = f'arith.remsi'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def remainder(operand1, operand2, *args, var_info=None, **kwargs):
+        """Emit a remainder: `arith.remf` for floats, `arith.remsi` otherwise."""
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if ret_type.startswith("f"):
+            opcode = 'arith.remf'
+        else:
+            opcode = 'arith.remsi' # Signed Integer Remainder (LHS sign)
+
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def square(operand, *args, var_info=None, **kwargs):
+        """Square the operand by multiplying it with itself via `ops.mul`."""
+        result = ops.mul(operand, operand)
+        return result, var_info[result]
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # PyTorchSim specific operations
+
+    @staticmethod
+    def alloc(size, src_type, *args, var_info=None, **kwargs):
+        """Emit a `memref.alloc` for a 1-D memref of `size` elements of `src_type`."""
+        return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
+
+    @staticmethod
+    def extractelement(operand, idx, *args, var_info=None, **kwargs):
+        """Emit a `vector.extract` of element `idx`; the result has tile size 1."""
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" 
if tile_size > 1 else dtype + return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype] + + @staticmethod + def ext(operand, dtype, *args, var_info=None, **kwargs): + op_type = var_info[operand] + shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}" + target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}" + if op_type[0] == "f": + opcode = f'arith.extf' + else: + opcode = f'arith.extui' + return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype] + + @staticmethod + def to_bool(operand, *args, var_info=None, **kwargs): + tile_size, ret_type = var_info[operand] + const_one = ops.constant(0, ret_type) + if tile_size > 1: + const_one = ops.broadcast(const_one, tile_size) + ret = ops.ne(operand, const_one) + return ret, [tile_size, "i1"] + @staticmethod + def step(size, dtype, *args, **kwargs): + index_shape = f"vector<{size}x{dtype}>" + return f"vector.step : {index_shape}", [size, dtype] + + @staticmethod + def index_cast(operand, target_type, *args, var_info=None, **kwrags): + op_type = var_info[operand] + src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1] + des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type + return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type] + + @staticmethod + def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs): + operand_type = var_info[operand] + return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type + + @staticmethod + def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs): + if red_size == 1: + final_reduced_shape = f"{type_name}" + line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape) + else: + final_reduced_shape = f"vector<{red_size}x{type_name}>" + new_vshape= 
f"vector<{vec_size//red_size}x{red_size}x{type_name}>" + value = ops.shape_cast(acc, red_shape, new_vshape) + line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape) + return line, [red_size, type_name] + + @staticmethod + def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs): + if compute_vec_size == 1: + vshape = f"{mlir_dtype}" + operation = "affine.load" + line = f"{operation} %{buffer}[{indices}] : {buffer_shape}" + else: + vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" + operation = "affine.vector_load" + line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}" + return line, [compute_vec_size, mlir_dtype] + + @staticmethod + def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs): + compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1] + + if compute_vec_size == 1: + vshape = f"{mlir_dtype}" + operation = "affine.store" + line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}" + else: + vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" + operation = "affine.vector_store" + line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}" + + if buffer_name is not None: + return common.DeferredLine(buffer_name, line), [None, None] + else: + return line, [None, None] \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index cc17ada1..a36bc907 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -25,7 +25,7 @@ import PyTorchSimFrontend.extension_codecache as extension_codecache from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo -from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, 
reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction +from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, is_welford_reduction from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common @@ -1070,11 +1070,11 @@ def store_reduction_epilogue(self, name, index, value): if self.current_node.node.origin_node: # FIXME: This is a temporary solution # mean = SUM(X) / N - self.reduction_mean.append(ops.div(out, divider_vec)) + self.reduction_mean.append(ops.truediv(out, divider_vec)) out = self.reduction_mean[i] else: # m2 = (E(X^2) - E(X)^2) * N - sqr_mean = ops.div(out, divider_vec) + sqr_mean = ops.truediv(out, divider_vec) mean_sqr = ops.mul(self.reduction_mean[i], self.reduction_mean[i]) variance = ops.sub(sqr_mean, mean_sqr) m2 = ops.mul(variance, divider_vec) From 8452f5c67f0f88d42d1d5918343b1f9d365bc4e9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 16:29:36 +0000 Subject: [PATCH 015/194] [Test] Add Llama1&2 test cases --- .github/workflows/pytorchsim_test.yml | 21 +++++ tests/Llama/test_llama.py | 113 ++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tests/Llama/test_llama.py diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index fe8a4a7d..8444f318 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -663,6 +663,27 @@ jobs: -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py + test_llama: + name: Run test_llama1&2 + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_llama.py + run: | + echo "Running test_llama.py" + docker run --rm \ + -v 
/tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py + test_accuracy: name: Run test_accuracy runs-on: self-hosted diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py new file mode 100644 index 00000000..17672563 --- /dev/null +++ b/tests/Llama/test_llama.py @@ -0,0 +1,113 @@ +import os +import sys +import argparse +import copy +import torch +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaForCausalLM + +def test_result(name, out, ref, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), ref.cpu(), rtol=rtol, atol=atol): + msg = f"|{name} Test Passed|" + print("-" * len(msg)); print(msg); print("-" * len(msg)) + else: + msg = f"|{name} Test Failed|" + print("-" * len(msg)); print(msg); print("-" * len(msg)) + diff = (out.cpu() - ref.cpu()).abs().max().item() + print("device out:", out.detach().cpu()) + print("cpu ref :", ref.detach().cpu()) + print(f"Max abs diff: {diff}") + sys.exit(1) + +@torch.no_grad() +def run_custom_llama_test( + device, + batch=1, + seq_len=32, + dtype="float32", + rtol=1e-3, + atol=1e-3, + max_new_tokens=16, +): + dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16} + torch_dtype = dtype_map.get(dtype, torch.float32) + + cfg = LlamaConfig( + _name_or_path="custom-llama", + architectures=["LlamaForCausalLM"], + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=4096, + mlp_bias=False, + model_type="llama", + num_attention_heads=32, + num_hidden_layers=1, + num_key_value_heads=32, + pretraining_tp=1, + rms_norm_eps=1e-06, + rope_scaling=None, + rope_theta=10000.0, 
+ tie_word_embeddings=True, + torch_dtype=dtype, + transformers_version="4.43.4", + use_cache=True, + vocab_size=8192, + ) + + print("Building LlamaForCausalLM from custom config (random init).") + base_model = LlamaForCausalLM(cfg).eval() + cpu_model = copy.deepcopy(base_model).eval() + + # dtype & device 세팅 + cpu_model.to(dtype=torch_dtype, device="cpu") + model = base_model.to(dtype=torch_dtype, device=device) + + # ---- 입력 텐서 (랜덤 ids) ---- + g = torch.Generator().manual_seed(0) + vocab = cfg.vocab_size + input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long) + attn_mask_cpu = torch.ones_like(input_ids_cpu, dtype=torch.long) + + input_ids_dev = input_ids_cpu.to(device) + attn_mask_dev = attn_mask_cpu.to(device) + + # ---- forward comparison (compile vs CPU baseline) ---- + print("Compiling model with torch.compile(...)") + compiled = torch.compile(model, dynamic=False) + + logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu).logits + logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev).logits + + test_result("Custom Llama forward(logits)", logits_dev, logits_cpu, rtol=rtol, atol=atol) + print("Max diff >", (logits_dev.detach().cpu() - logits_cpu.detach().cpu()).abs().max().item()) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test Custom Llama (random weights, no tokenizer)") + parser.add_argument("--batch", type=int, default=1) + parser.add_argument("--seq_len", type=int, default=32) + parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"]) + parser.add_argument("--rtol", type=float, default=1e-3) + parser.add_argument("--atol", type=float, default=1e-3) + parser.add_argument("--max_new_tokens", type=int, default=16) + args = parser.parse_args() + + sys.path.append(os.environ.get("PYTORCHSIM_ROOT_PATH", "/workspace/PyTorchSim")) + from Scheduler.scheduler import PyTorchSimRunner + module 
= PyTorchSimRunner.setup_device() + device = module.custom_device() + #test_triu(device, size=(32, 128), diagonal=1) + torch.compiler.is_compiling = lambda: True # FIXME. How to fix this? + run_custom_llama_test( + device=device, + batch=args.batch, + seq_len=args.seq_len, + dtype=args.dtype, + rtol=args.rtol, + atol=args.atol, + ) From 00cd8c7cc9f0577e4ec4e974ec9e5f1467f86c67 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 16:29:57 +0000 Subject: [PATCH 016/194] [TOGSim] Add error handling --- TOGSim/src/TileGraphParser.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index 42776a51..761530ab 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -696,6 +696,9 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa loadConfig(config_path, _config_json); _attribute_path = attribute_path; + if (!std::filesystem::exists(onnx_path)) { + throw std::runtime_error("Error: ONNX file not found at path: " + onnx_path); + } /* Note: this parsing algorithm assume that all node are sorted in topological-order */ std::ifstream model_istream(onnx_path); google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream); From a8d96cda4a8ebf1f281a4f27778d9df649cbc35c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 8 Dec 2025 16:30:51 +0000 Subject: [PATCH 017/194] [Scheduler] Use given config file for compilations --- Scheduler/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 31dbf6c0..98ebb1d5 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -358,6 +358,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + os.environ['TOGSIM_CONFIG'] = togsim_config 
self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) From 8aac3ab08fb63bc1ba3b2bb13c0de8c2b298e4e2 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 05:10:24 +0000 Subject: [PATCH 018/194] [Fix/ops] Fix wrong implementation of sigmoid --- PyTorchSimFrontend/mlir/mlir_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index ebf0c111..af323c1e 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -626,7 +626,7 @@ def sigmoid(operand, *args, var_info=None, **kwargs): tile_size = op_type[0] dtype = op_type[1] one = ops.constant(1, dtype) - return ops.truediv(one, ops.expm1(operand)), [tile_size, dtype] + return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, dtype] @staticmethod def fmod(operand1, operand2, *args, var_info=None, **kwargs): From fd6a846094df2ee73d2f7c1dcaa21d2c218411db Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 06:29:26 +0000 Subject: [PATCH 019/194] [Tests] Use manual mask for Llama --- tests/Llama/test_llama.py | 301 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 286 insertions(+), 15 deletions(-) diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py index 17672563..98820fd9 100644 --- a/tests/Llama/test_llama.py +++ b/tests/Llama/test_llama.py @@ -4,7 +4,7 @@ import copy import torch from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding, LlamaModel def test_result(name, out, ref, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), ref.cpu(), rtol=rtol, atol=atol): @@ -13,12 +13,216 @@ def test_result(name, out, ref, 
rtol=1e-4, atol=1e-4): else: msg = f"|{name} Test Failed|" print("-" * len(msg)); print(msg); print("-" * len(msg)) - diff = (out.cpu() - ref.cpu()).abs().max().item() + diff = (out.cpu().int() - ref.cpu().int()).abs().max().item() print("device out:", out.detach().cpu()) print("cpu ref :", ref.detach().cpu()) print(f"Max abs diff: {diff}") sys.exit(1) +@torch.no_grad() +def run_rmsnorm_test( + device, + batch=1, + seq_len=32, + dtype="float32", + rtol=1e-3, + atol=1e-3, +): + print("\n[Running LlamaRMSNorm Test]") + dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16} + torch_dtype = dtype_map.get(dtype, torch.float32) + + hidden_size = 4096 + eps = 1e-6 + + print(f"Building LlamaRMSNorm (hidden_size={hidden_size}, eps={eps})") + base_norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps).eval() + cpu_norm = copy.deepcopy(base_norm).eval() + + cpu_norm.to(dtype=torch_dtype, device="cpu") + model = base_norm.to(dtype=torch_dtype, device=device) + + g = torch.Generator().manual_seed(0) + hidden_states = torch.randn(batch, seq_len, hidden_size, generator=g, dtype=torch_dtype) + hs_dev = hidden_states.to(device) + + print("Compiling LlamaRMSNorm with torch.compile(...)") + compiled_norm = torch.compile(model, dynamic=False) + + out_cpu = cpu_norm(hidden_states) + out_dev = compiled_norm(hs_dev) + + test_result("LlamaRMSNorm forward", out_dev, out_cpu, rtol=rtol, atol=atol) + print("Max diff >", (out_dev.detach().cpu() - out_cpu.detach().cpu()).abs().max().item()) + + +@torch.no_grad() +def run_rotary_embedding_test( + device, + batch=1, + seq_len=32, + dtype="float32", + rtol=1e-3, + atol=1e-3, +): + print("\n[Running LlamaRotaryEmbedding Test]") + dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16} + torch_dtype = dtype_map.get(dtype, torch.float32) + + hidden_size = 4096 + num_heads = 32 + head_dim = hidden_size // num_heads + + cfg = LlamaConfig( + _name_or_path="custom-llama", + 
architectures=["LlamaForCausalLM"], + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=4096, + mlp_bias=False, + model_type="llama", + num_attention_heads=32, + num_hidden_layers=1, + num_key_value_heads=32, + pretraining_tp=1, + rms_norm_eps=1e-06, + rope_scaling=None, + rope_theta=10000.0, + tie_word_embeddings=True, + torch_dtype=dtype, + transformers_version="4.43.4", + use_cache=True, + vocab_size=8192, + _attn_implementation = "sdpa" + ) + base_rope = LlamaRotaryEmbedding(cfg) + + cpu_rope = copy.deepcopy(base_rope) + + cpu_rope.to(device="cpu") + model = base_rope.to(device=device) + + g = torch.Generator().manual_seed(0) + value = torch.randn(batch, num_heads, seq_len, head_dim, generator=g, dtype=torch_dtype) + position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(batch, -1) + + val_dev = value.to(device) + pos_dev = position_ids.to(device) + + print("Compiling LlamaRotaryEmbedding with torch.compile(...)") + compiled_rope = torch.compile(model, dynamic=False) + + cos_cpu, sin_cpu = cpu_rope(value, position_ids) + cos_dev, sin_dev = compiled_rope(val_dev, pos_dev) + + print(f"Output dtype check - CPU: {cos_cpu.dtype}, Device: {cos_dev.dtype}") + + test_result("LlamaRotaryEmbedding (Cos)", cos_dev, cos_cpu, rtol=rtol, atol=atol) + test_result("LlamaRotaryEmbedding (Sin)", sin_dev, sin_cpu, rtol=rtol, atol=atol) + + diff_cos = (cos_dev.detach().cpu() - cos_cpu.detach().cpu()).abs().max().item() + diff_sin = (sin_dev.detach().cpu() - sin_cpu.detach().cpu()).abs().max().item() + print(f"Max diff (Cos) > {diff_cos}") + print(f"Max diff (Sin) > {diff_sin}") + +@torch.no_grad() +def run_decoder_layer_test( + device, + batch=1, + seq_len=32, + dtype="float32", + rtol=1e-3, + atol=1e-3, +): + print("\n[Running LlamaDecoderLayer Test]") + dtype_map = {"float32": torch.float32, "float16": 
torch.float16, "bfloat16": torch.bfloat16} + torch_dtype = dtype_map.get(dtype, torch.float32) + + cfg = LlamaConfig( + _name_or_path="custom-llama", + architectures=["LlamaForCausalLM"], + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=4096, + mlp_bias=False, + model_type="llama", + num_attention_heads=32, + num_hidden_layers=1, + num_key_value_heads=32, + pretraining_tp=1, + rms_norm_eps=1e-06, + rope_scaling=None, + rope_theta=10000.0, + tie_word_embeddings=True, + torch_dtype=dtype, + transformers_version="4.43.4", + use_cache=True, + vocab_size=8192, + _attn_implementation = "sdpa" + ) + + print("Building LlamaDecoderLayer from custom config.") + base_layer = LlamaDecoderLayer(cfg, layer_idx=0).eval() + cpu_layer = copy.deepcopy(base_layer).eval() + + cpu_layer.to(dtype=torch_dtype, device="cpu") + model = base_layer.to(dtype=torch_dtype, device=device) + + g = torch.Generator().manual_seed(0) + hidden_states = torch.randn(batch, seq_len, cfg.hidden_size, generator=g, dtype=torch_dtype) + position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(batch, -1) + + attention_mask = torch.zeros(batch, 1, seq_len, seq_len, dtype=torch_dtype) + mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1) + attention_mask.masked_fill_(mask, torch.finfo(torch_dtype).min) + + # Shape: (1, seq_len, head_dim) or (batch, seq_len, head_dim) + head_dim = cfg.hidden_size // cfg.num_attention_heads + cos = torch.randn(1, seq_len, head_dim, generator=g, dtype=torch_dtype) + sin = torch.randn(1, seq_len, head_dim, generator=g, dtype=torch_dtype) + position_embeddings = (cos, sin) + + hs_dev = hidden_states.to(device) + pos_dev = position_ids.to(device) + att_dev = attention_mask.to(device) + pos_emb_dev = (cos.to(device), sin.to(device)) + + print("Compiling LlamaDecoderLayer with 
torch.compile(...)") + compiled_layer = torch.compile(model, dynamic=False) + + out_cpu = cpu_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + position_embeddings=position_embeddings + ) + if isinstance(out_cpu, tuple): + out_cpu = out_cpu[0] + + out_dev = compiled_layer( + hidden_states=hs_dev, + attention_mask=att_dev, + position_ids=pos_dev, + position_embeddings=pos_emb_dev + ) + if isinstance(out_dev, tuple): + out_dev = out_dev[0] + + test_result("LlamaDecoderLayer forward", out_dev, out_cpu, rtol=rtol, atol=atol) + print("Max diff >", (out_dev.detach().cpu() - out_cpu.detach().cpu()).abs().max().item()) + @torch.no_grad() def run_custom_llama_test( device, @@ -40,7 +244,7 @@ def run_custom_llama_test( bos_token_id=1, eos_token_id=2, hidden_act="silu", - hidden_size=4096, + hidden_size=1024, initializer_range=0.02, intermediate_size=11008, max_position_embeddings=4096, @@ -64,11 +268,9 @@ def run_custom_llama_test( base_model = LlamaForCausalLM(cfg).eval() cpu_model = copy.deepcopy(base_model).eval() - # dtype & device 세팅 cpu_model.to(dtype=torch_dtype, device="cpu") model = base_model.to(dtype=torch_dtype, device=device) - # ---- 입력 텐서 (랜덤 ids) ---- g = torch.Generator().manual_seed(0) vocab = cfg.vocab_size input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long) @@ -81,12 +283,70 @@ def run_custom_llama_test( print("Compiling model with torch.compile(...)") compiled = torch.compile(model, dynamic=False) - logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu).logits - logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev).logits + logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu)#.logits + logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev)#.logits test_result("Custom Llama forward(logits)", logits_dev, logits_cpu, rtol=rtol, atol=atol) print("Max diff >", 
(logits_dev.detach().cpu() - logits_cpu.detach().cpu()).abs().max().item()) +@torch.no_grad() +def run_llama_model_test( + device, + batch=1, + seq_len=32, + dtype="float32", + rtol=1e-3, + atol=1e-3, +): + print("\n[Running LlamaModel Test]") + dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16} + torch_dtype = dtype_map.get(dtype, torch.float32) + + cfg = LlamaConfig( + vocab_size=8192, + hidden_size=1024, + num_attention_heads=32, + num_key_value_heads=32, + intermediate_size=11008 // 4, + num_hidden_layers=1, + max_position_embeddings=4096, + hidden_act="silu", + use_cache=False, + torch_dtype=dtype, + ) + + print("Building LlamaModel from custom config (random init).") + base_model = LlamaModel(cfg).eval() + cpu_model = copy.deepcopy(base_model).eval() + + cpu_model.to(dtype=torch_dtype, device="cpu") + model = base_model.to(dtype=torch_dtype, device=device) + + g = torch.Generator().manual_seed(0) + input_ids_cpu = torch.randint(low=0, high=cfg.vocab_size, size=(batch, seq_len), generator=g, dtype=torch.long) + + # FIXME: Currently, the user must provide the mask manually. + # There is a functionality issue with the model generating the mask internally, + # so we explicitly create and inject a Causal Mask (lower triangular matrix) from the outside. 
+ causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long)) + attn_mask_cpu = causal_mask.unsqueeze(0).unsqueeze(0).expand(batch, 1, -1, -1).bool() + + input_ids_dev = input_ids_cpu.to(device) + attn_mask_dev = attn_mask_cpu.to(device) + + print("Compiling LlamaModel with torch.compile(...)") + compiled_model = torch.compile(model, dynamic=False) + + out_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu) + out_dev = compiled_model(input_ids=input_ids_dev, attention_mask=attn_mask_dev) + + last_hidden_state_cpu = out_cpu.last_hidden_state + last_hidden_state_dev = out_dev.last_hidden_state + + test_result("LlamaModel (last_hidden_state)", last_hidden_state_dev, last_hidden_state_cpu, rtol=rtol, atol=atol) + diff = (last_hidden_state_dev.detach().cpu() - last_hidden_state_cpu.detach().cpu()).abs().max().item() + print(f"Max diff > {diff}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Test Custom Llama (random weights, no tokenizer)") parser.add_argument("--batch", type=int, default=1) @@ -103,11 +363,22 @@ def run_custom_llama_test( device = module.custom_device() #test_triu(device, size=(32, 128), diagonal=1) torch.compiler.is_compiling = lambda: True # FIXME. How to fix this? 
- run_custom_llama_test( - device=device, - batch=args.batch, - seq_len=args.seq_len, - dtype=args.dtype, - rtol=args.rtol, - atol=args.atol, - ) + #run_rmsnorm_test(device) + #run_rotary_embedding_test(device) + #run_decoder_layer_test( + # device=device, + # batch=args.batch, + # seq_len=args.seq_len, + # dtype=args.dtype, + # rtol=args.rtol, + # atol=args.atol, + #) + run_llama_model_test(device) + #run_custom_llama_test( + # device=device, + # batch=args.batch, + # seq_len=args.seq_len, + # dtype=args.dtype, + # rtol=args.rtol, + # atol=args.atol, + #) From dea7f47f943302c3ab6104433ea29a6607a1bbf4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 07:40:25 +0000 Subject: [PATCH 020/194] [TOGSim] Use YAML instead of json --- TOGSim/conanfile.txt | 2 +- TOGSim/include/Common.h | 7 +- TOGSim/include/SimulationConfig.h | 6 +- TOGSim/include/SparseCore.h | 1 + TOGSim/include/TileGraphParser.h | 14 ++- TOGSim/src/Common.cc | 165 +++++++++++++++--------------- TOGSim/src/TileGraphParser.cc | 117 ++++++++++++--------- TOGSim/src/main.cc | 8 +- 8 files changed, 170 insertions(+), 150 deletions(-) diff --git a/TOGSim/conanfile.txt b/TOGSim/conanfile.txt index 7a57f52f..ce5268c7 100644 --- a/TOGSim/conanfile.txt +++ b/TOGSim/conanfile.txt @@ -2,6 +2,6 @@ boost/1.79.0 robin-hood-hashing/3.11.5 spdlog/1.11.0 -nlohmann_json/3.11.2 +yaml-cpp/0.8.0 [generators] cmake diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h index 640cba0c..c62c3e0b 100644 --- a/TOGSim/include/Common.h +++ b/TOGSim/include/Common.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -14,7 +15,6 @@ #include "SimulationConfig.h" #include "Instruction.h" -#include "nlohmann/json.hpp" #define MIN(x, y) (((x) > (y)) ? 
(y) : (x)) #define MIN3(x, y, z) MIN(MIN(x, y), z) @@ -24,10 +24,7 @@ #define PAGE_SIZE 4096 -using json = nlohmann::json; - typedef uint64_t addr_type; typedef uint64_t cycle_type; -uint32_t generate_id(); -SimulationConfig initialize_config(json config); \ No newline at end of file +SimulationConfig initialize_config(YAML::Node config); \ No newline at end of file diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h index 64cfa223..090f5520 100644 --- a/TOGSim/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -1,13 +1,11 @@ #pragma once -#include #include - -using json = nlohmann::json; +#include enum class CoreType { WS_MESH, STONNE }; -enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 }; +enum class DramType { SIMPLE, RAMULATOR2 }; enum class IcntType { SIMPLE, BOOKSIM2 }; diff --git a/TOGSim/include/SparseCore.h b/TOGSim/include/SparseCore.h index 9188b21d..02781ab3 100644 --- a/TOGSim/include/SparseCore.h +++ b/TOGSim/include/SparseCore.h @@ -1,5 +1,6 @@ #include #include +#include #include "Core.h" #include "sstStonne.h" #include "SimpleMem.h" diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h index 9cc61d4a..07e5b212 100644 --- a/TOGSim/include/TileGraphParser.h +++ b/TOGSim/include/TileGraphParser.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include "TileGraph.h" @@ -13,8 +13,6 @@ #include "onnx/onnx-operators_pb.h" #include "onnx/onnx_pb.h" -using json = nlohmann::json; - enum class TileType{ LOOP_INDEX_NODE, LOOP_END_NODE, @@ -35,7 +33,7 @@ enum class LoopType { INNER_LOOP }; -bool loadConfig(const std::string& config_path, json& config_json); +bool loadConfig(const std::string& config_path, YAML::Node& config_yaml); class TileNode { public: @@ -80,9 +78,9 @@ class TileGraphParser { LoopType get_loop_type(std::string key) { return std::get<2>(_loop_size_map[key]); } const std::map> & get_loop_map() { return _loop_size_map; } const 
std::vector &lookupNumaInfo(std::string key); - int getCoreIdFromJson(const json& attribute_json, int subgraph_id); + int getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id); std::string getMetaByName(std::string key) { return _tog_meta[key]; } - const json& get_attribute_file() { return _attribute_json; } + const YAML::Node& get_attribute_file() { return _attribute_config; } std::vector calc_tag(std::vector& accum_tag, std::vector& tag_idx, std::vector& tag_stride); void register_memory_tag(std::string name, std::vector& tag_key); bool check_memory_tag(std::string name, std::vector& tag_key); @@ -135,8 +133,8 @@ class TileGraphParser { void _tile_index_generate() {} int _loop_stack_pointer = 0; - json _attribute_json; - json _config_json; + YAML::Node _attribute_config; + YAML::Node _config_yaml; std::string _tog_path; std::string _attribute_path; uint64_t indirect_counter = 0; diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index 9a6b7798..63d360c6 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -1,28 +1,24 @@ #include "Common.h" -uint32_t generate_id() { - static uint32_t id_counter{0}; - return id_counter++; -} - template -T get_config_value(json config, std::string key) { - if (config.contains(key)) { - return config[key]; +T get_config_value(const YAML::Node& config, std::string key) { + if (config[key]) { + return config[key].as(); } else { throw std::runtime_error(fmt::format("Config key {} not found", key)); } } -SimulationConfig initialize_config(json config) { +SimulationConfig initialize_config(YAML::Node config) { SimulationConfig parsed_config; - // print json - spdlog::info("TOGSim Config: {}", config.dump(2)); + YAML::Emitter emitter; + emitter << config; + spdlog::info("PyTorchSim config:\n{}", emitter.c_str()); /* Core configs */ - parsed_config.num_cores = config["num_cores"]; - if (config.contains("core_type")) { - std::vector core_types = config["core_type"].get>(); + parsed_config.num_cores = 
get_config_value(config, "num_cores"); + if (config["core_type"]) { + std::vector core_types = config["core_type"].as>(); if (core_types.size() != parsed_config.num_cores) throw std::runtime_error("Mismatch between num_cores and core_type list size"); @@ -41,100 +37,105 @@ SimulationConfig initialize_config(json config) { for (int i=0; i(config, "core_freq_mhz"); + if (config["num_systolic_array_per_core"]) + parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as(); + if (config["num_stonne_per_core"]) + parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as(); + if (config["num_stonne_port"]) + parsed_config.num_stonne_port = config["num_stonne_port"].as(); parsed_config.core_print_interval = get_config_value(config, "core_stats_print_period_cycles"); - /* Stonne config */ - if (config.contains("stonne_config_path")) - parsed_config.stonne_config_path = config["stonne_config_path"]; + /* Stonne config */ + if (config["stonne_config_path"]) + parsed_config.stonne_config_path = config["stonne_config_path"].as(); /* DRAM config */ - if ((std::string)config["dram_type"] == "simple") + std::string dram_type_str = get_config_value(config, "dram_type"); + + if (dram_type_str == "simple") { parsed_config.dram_type = DramType::SIMPLE; - else if ((std::string)config["dram_type"] == "ramulator") - parsed_config.dram_type = DramType::RAMULATOR1; - else if ((std::string)config["dram_type"] == "ramulator2") + parsed_config.dram_latency = get_config_value(config, "dram_latency"); + } else if (dram_type_str == "ramulator2") { parsed_config.dram_type = DramType::RAMULATOR2; - else - throw std::runtime_error(fmt::format("Not implemented dram type {} ", - (std::string)config["dram_type"])); - parsed_config.dram_freq_mhz = config["dram_freq_mhz"]; - if (config.contains("dram_latency")) - parsed_config.dram_latency = config["dram_latency"]; - if (config.contains("ramulator_config_path")) - parsed_config.dram_config_path = 
config["ramulator_config_path"]; - parsed_config.dram_channels = config["dram_channels"]; - if (config.contains("dram_req_size_byte")) - parsed_config.dram_req_size = config["dram_req_size_byte"]; - if (config.contains("dram_stats_print_period_cycles")) - parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"]; - if(config.contains("dram_num_burst_length")) - parsed_config.dram_nbl = config["dram_num_burst_length"]; - if (config.contains("dram_num_partitions")) { - parsed_config.dram_num_partitions = config["dram_num_partitions"]; + parsed_config.dram_config_path = get_config_value(config, "ramulator_config_path"); + } else { + throw std::runtime_error(fmt::format("Not implemented dram type {} ", dram_type_str)); + } + + parsed_config.dram_freq_mhz = get_config_value(config, "dram_freq_mhz"); + parsed_config.dram_channels = get_config_value(config, "dram_channels"); + parsed_config.dram_req_size = get_config_value(config, "dram_req_size_byte"); + parsed_config.dram_nbl = get_config_value(config, "dram_num_burst_length"); + + if (config["dram_stats_print_period_cycles"]) + parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"].as(); + if (config["dram_num_partitions"]) { + parsed_config.dram_num_partitions = config["dram_num_partitions"].as(); if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) { throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions"); } } - parsed_config.dram_channels_per_partitions = - parsed_config.dram_channels / parsed_config.dram_num_partitions; + if (parsed_config.dram_num_partitions != 0) { + parsed_config.dram_channels_per_partitions = + parsed_config.dram_channels / parsed_config.dram_num_partitions; + } else { + parsed_config.dram_channels_per_partitions = parsed_config.dram_channels; + } /* L2D config */ - if (config.contains("l2d_type")) { - if ((std::string)config["l2d_type"] == "nocache") + if (config["l2d_type"]) { + std::string 
l2d_type_str = config["l2d_type"].as(); + if (l2d_type_str == "nocache") parsed_config.l2d_type = L2CacheType::NOCACHE; - else if ((std::string)config["l2d_type"] == "datacache") + else if (l2d_type_str == "datacache") { parsed_config.l2d_type = L2CacheType::DATACACHE; - else - throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ", - (std::string)config["l2d_type"])); + parsed_config.l2d_config_str = get_config_value(config, "l2d_config"); + if (config["l2d_hit_latency"]) + parsed_config.l2d_hit_latency = config["l2d_hit_latency"].as(); + } else + throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ", l2d_type_str)); } else { parsed_config.l2d_type = L2CacheType::NOCACHE; } - if (config.contains("l2d_config")) - parsed_config.l2d_config_str = config["l2d_config"]; - if (config.contains("l2d_hit_latency")) - parsed_config.l2d_config_str = config["l2d_hit_latency"]; - /* Icnt config */ - if ((std::string)config["icnt_type"] == "simple") + std::string icnt_type_str = config["icnt_type"].as(); + if (icnt_type_str == "simple") { parsed_config.icnt_type = IcntType::SIMPLE; - else if ((std::string)config["icnt_type"] == "booksim2") + if (config["icnt_latency_cycles"]) + parsed_config.icnt_latency = config["icnt_latency_cycles"].as(); + } else if (icnt_type_str == "booksim2") { parsed_config.icnt_type = IcntType::BOOKSIM2; - else - throw std::runtime_error(fmt::format("Not implemented icnt type {} ", - (std::string)config["icnt_type"])); - parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"]; - if (config.contains("icnt_latency_cycles")) - parsed_config.icnt_latency = config["icnt_latency_cycles"]; - if (config.contains("booksim_config_path")) - parsed_config.icnt_config_path = config["booksim_config_path"]; - if (config.contains("icnt_stats_print_period_cycles")) - parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"]; - if (config.contains("icnt_injection_ports_per_core")) - 
parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"]; - - if (config.contains("scheduler")) - parsed_config.scheduler_type = config["scheduler"]; - if (config.contains("num_partition")) - parsed_config.num_partition = config["num_partition"]; - if (config.contains("partition")) { + parsed_config.icnt_config_path = get_config_value(config, "booksim_config_path"); + } else + throw std::runtime_error(fmt::format("Not implemented icnt type {} ", icnt_type_str)); + + parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"].as(); + if (config["icnt_stats_print_period_cycles"]) + parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"].as(); + if (config["icnt_injection_ports_per_core"]) + parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"].as(); + + if (config["scheduler"]) + parsed_config.scheduler_type = config["scheduler"].as(); + if (config["num_partition"]) + parsed_config.num_partition = config["num_partition"].as(); + if (config["partition"]) { for (int i=0; i(); + parsed_config.partiton_map[i] = partition_id; + spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id); + } else { + spdlog::warn("[Config/Core] CPU {}: Partition key not found, defaulting to 0", i); + parsed_config.partiton_map[i] = 0; + } } } else { - /* Default: all partition 0 */ for (int i=0; i> config_json; - config_file.close(); - spdlog::info("[LoadConfig] Success to open \"{}\"", config_path); - return true; - } else { - spdlog::error("[LoadConfig] Failed to open \"{}\"", config_path); +bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) { + try { + config_yaml = YAML::LoadFile(config_path); + spdlog::info("[LoadConfig] Success to open \"{}\"", config_path); + return true; + } catch (const YAML::BadFile& e) { + spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path); + return false; + } catch (const YAML::ParserException& e) { 
+ spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", config_path, e.what()); + return false; + } catch (const std::exception& e) { + spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what()); return false; } } @@ -87,26 +91,33 @@ bool find_output_idx(TileGraphParser* tog_parser, std::vector& output_ m = output_idx.at(0); n = output_idx.at(1); k = output_idx.at(2); + auto attr_file = tog_parser->get_attribute_file(); - auto attr_json = tog_parser->get_attribute_file(); + if (!attr_file["zero_skip"]) { + return false; + } - // Check arg0: m -> k + YAML::Node zero_skip = attr_file["zero_skip"]; bool found_arg0 = false; - if (attr_json["zero_skip"].contains("arg0")) { - auto& arg0 = attr_json["zero_skip"]["arg0"]; - if (arg0.contains(std::to_string(m)) && arg0[std::to_string(m)].contains(std::to_string(k))) { + if (zero_skip["arg0"]) { + YAML::Node arg0 = zero_skip["arg0"]; + std::string m_str = std::to_string(m); + std::string k_str = std::to_string(k); + if (arg0[m_str] && arg0[m_str][k_str]) { found_arg0 = true; } } - // Check arg1: n -> k bool found_arg1 = false; - if (attr_json["zero_skip"].contains("arg1")) { - auto& arg1 = attr_json["zero_skip"]["arg1"]; - if (arg1.contains(std::to_string(k)) && arg1[std::to_string(k)].contains(std::to_string(n))) { + if (zero_skip["arg1"]) { + YAML::Node arg1 = zero_skip["arg1"]; + std::string k_str = std::to_string(k); + std::string n_str = std::to_string(n); + if (arg1[k_str] && arg1[k_str][n_str]) { found_arg1 = true; } } + return found_arg0 || found_arg1; } @@ -692,8 +703,8 @@ void TileLoopNode::print_node() { } TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) { - loadConfig(attribute_path, _attribute_json); - loadConfig(config_path, _config_json); + loadConfig(attribute_path, _attribute_config); + loadConfig(config_path, _config_yaml); _attribute_path = attribute_path; if (!std::filesystem::exists(onnx_path)) { @@ 
-705,32 +716,45 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa onnx::ModelProto model_proto; /* Attribute parsing */ - if (_attribute_json.contains("address_info")) { - auto address_info = _attribute_json["address_info"]; - for (auto it = address_info.begin(); it != address_info.end(); ++it) { - uint64_t value = it.value(); - _arg_to_address[it.key()] = value; - spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", it.key(), value); + if (_attribute_config["address_info"]) { + const auto& address_info = _attribute_config["address_info"]; + for (YAML::const_iterator it = address_info.begin(); it != address_info.end(); ++it) { + std::string key = it->first.as(); + uint64_t value = it->second.as(); + + _arg_to_address[key] = value; + spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value); } } - if (_attribute_json.contains("address_numa_stride")) { - auto address_numa_stride = _attribute_json["address_numa_stride"]; - for (auto it = address_numa_stride.begin(); it != address_numa_stride.end(); ++it) { - auto value_list = it.value(); - for (auto value : value_list) { - _arg_numa_stride[it.key()].push_back(value); + + if (_attribute_config["address_numa_stride"]) { + const auto& address_numa_stride = _attribute_config["address_numa_stride"]; + for (YAML::const_iterator it = address_numa_stride.begin(); it != address_numa_stride.end(); ++it) { + std::string key = it->first.as(); + const auto& value_list = it->second; // YAML Sequence Node + + for (const auto& val : value_list) { + _arg_numa_stride[key].push_back(val.as()); } - spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", ")); + spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", ")); } } - if (_attribute_json.contains("sram_alloc") and _config_json.contains("l2d_type") 
and _config_json["l2d_type"] == "datacache") { - auto sram_alloc_list = _attribute_json["sram_alloc"]; + + if (_attribute_config["sram_alloc"] && + _config_yaml["l2d_type"] && + _config_yaml["l2d_type"].as() == "datacache") { + + auto sram_alloc_list = _attribute_config["sram_alloc"]; spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================"); - for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) { - auto value_list = it.value(); - unsigned long long start = value_list.at(0); - unsigned long long end = value_list.at(1); - spdlog::info("[TOGParser/Attribute] {:16s}: 0x{:016x} ~ 0x{:016x}", it.key(), start, end); + + for (YAML::const_iterator it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) { + std::string key = it->first.as(); + const auto& value_list = it->second; // List [start, end] + + unsigned long long start = value_list[0].as(); + unsigned long long end = value_list[1].as(); + + spdlog::info("[TOGParser/Attribute] {:16s}: 0x{:016x} ~ 0x{:016x}", key, start, end); Interval entry = {start, end, 0}; _cache_plan.push_back(entry); } @@ -838,7 +862,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa /* Iterate outer loop and initialize inner loop */ for (auto iter=_tile_graph->begin(); iter!=_tile_graph->end(); ++iter) { std::shared_ptr subgraph = std::make_shared(); - subgraph->set_core_id(getCoreIdFromJson(_attribute_json, subgraph->get_id())); + subgraph->set_core_id(getCoreIdFromConfig(_attribute_config, subgraph->get_id())); auto indices = iter.get_indices(); for (auto loop : _loop_nodes.at(last_outer_idx)) { std::shared_ptr outer_loop = std::static_pointer_cast(loop); @@ -941,11 +965,12 @@ const std::vector& TileGraphParser::lookupNumaInfo(std::string key) { return _arg_numa_stride.at(key); } -int TileGraphParser::getCoreIdFromJson(const json& attribute_json, int subgraph_id) { - if (attribute_json.contains("subgraph_map")) { - const auto& subgraph_map = 
attribute_json["subgraph_map"]; - if (subgraph_map.contains(std::to_string(subgraph_id)) && subgraph_map[std::to_string(subgraph_id)].is_number_integer()) { - return subgraph_map[std::to_string(subgraph_id)]; +int TileGraphParser::getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id) { + std::string key = std::to_string(subgraph_id); + if (attribute_config["subgraph_map"]) { + const auto& subgraph_map = attribute_config["subgraph_map"]; + if (subgraph_map[key]) { + return subgraph_map[key].as(); } } return -1; diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 77c1bae7..bee1b45f 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -22,11 +22,11 @@ void launchKernel(Simulator* simulator, std::string onnx_path, std::string attri } Simulator* create_simulator(std::string config_path) { - json config_json; - if(!loadConfig(config_path, config_json)) { + YAML::Node config_yaml; + if (!loadConfig(config_path, config_yaml)) exit(1); - } - SimulationConfig config = initialize_config(config_json); + SimulationConfig config = initialize_config(config_yaml); + auto simulator = new Simulator(config); return simulator; } From d66df91d973b3a01dc2be81d763b0305569ad9e5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 09:06:58 +0000 Subject: [PATCH 021/194] [Frontend] Use YAML config file instead of json --- PyTorchSimFrontend/extension_config.py | 30 +++++++------ PyTorchSimFrontend/extension_op.py | 2 +- README.md | 8 ++-- Simulator/simulator.py | 29 +++++++------ TOGSim/include/Common.h | 1 + TOGSim/include/TileGraphParser.h | 3 +- TOGSim/src/Common.cc | 17 ++++++++ TOGSim/src/TileGraphParser.cc | 17 -------- configs/heterogeneous_c2_simple_noc.json | 40 ----------------- configs/heterogeneous_c2_simple_noc.yml | 37 ++++++++++++++++ configs/stonne_big_c1_simple_noc.json | 22 ---------- configs/stonne_big_c1_simple_noc.yml | 21 +++++++++ configs/stonne_single_c1_simple_noc.json | 22 ---------- 
configs/stonne_single_c1_simple_noc.yml | 21 +++++++++ configs/stonne_validation_c1_simple_noc.json | 23 ---------- configs/stonne_validation_c1_simple_noc.yml | 22 ++++++++++ .../systolic_ws_128x128_c1_booksim_tpuv2.json | 29 ------------- .../systolic_ws_128x128_c1_booksim_tpuv2.yml | 26 +++++++++++ .../systolic_ws_128x128_c1_booksim_tpuv3.json | 32 -------------- .../systolic_ws_128x128_c1_booksim_tpuv3.yml | 30 +++++++++++++ ...stolic_ws_128x128_c1_simple_noc_tpuv2.json | 31 ------------- ...ystolic_ws_128x128_c1_simple_noc_tpuv2.yml | 29 +++++++++++++ ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 32 -------------- ...ystolic_ws_128x128_c1_simple_noc_tpuv3.yml | 30 +++++++++++++ ...c_ws_128x128_c1_simple_noc_tpuv3_half.json | 32 -------------- ...ic_ws_128x128_c1_simple_noc_tpuv3_half.yml | 30 +++++++++++++ ...stolic_ws_128x128_c1_simple_noc_tpuv4.json | 34 --------------- ...ystolic_ws_128x128_c1_simple_noc_tpuv4.yml | 32 ++++++++++++++ .../systolic_ws_128x128_c2_booksim_tpuv3.json | 32 -------------- .../systolic_ws_128x128_c2_booksim_tpuv3.yml | 30 +++++++++++++ ...s_128x128_c2_booksim_tpuv3_bw_quarter.json | 43 ------------------- ...ws_128x128_c2_booksim_tpuv3_bw_quarter.yml | 39 +++++++++++++++++ .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 34 --------------- .../systolic_ws_128x128_c2_chiplet_tpuv3.yml | 32 ++++++++++++++ ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 33 -------------- ...olic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml | 31 +++++++++++++ ...stolic_ws_128x128_c2_simple_noc_tpuv2.json | 31 ------------- ...ystolic_ws_128x128_c2_simple_noc_tpuv2.yml | 29 +++++++++++++ ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 32 -------------- ...ystolic_ws_128x128_c2_simple_noc_tpuv3.yml | 30 +++++++++++++ ...128x128_c2_simple_noc_tpuv3_partition.json | 38 ---------------- ..._128x128_c2_simple_noc_tpuv3_partition.yml | 34 +++++++++++++++ ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 34 --------------- ...ystolic_ws_128x128_c2_simple_noc_tpuv4.yml 
| 32 ++++++++++++++ configs/systolic_ws_8x8_c1_booksim.json | 29 ------------- configs/systolic_ws_8x8_c1_booksim.yml | 27 ++++++++++++ configs/systolic_ws_8x8_c1_simple_noc.json | 30 ------------- configs/systolic_ws_8x8_c1_simple_noc.yml | 28 ++++++++++++ experiments/BERT.py | 2 +- .../artifact/cycle_validation/run_cycle.sh | 2 +- experiments/artifact/speedup/run_speedup.sh | 4 +- .../speedup/scripts/run_speed_ils_bert.sh | 8 ++-- .../speedup/scripts/run_speed_ils_conv.sh | 8 ++-- .../speedup/scripts/run_speed_ils_matmul.sh | 8 ++-- .../speedup/scripts/run_speed_ils_resnet.sh | 8 ++-- experiments/attention.py | 2 +- experiments/conv.py | 2 +- experiments/gemm.py | 2 +- experiments/layernorm.py | 2 +- experiments/resnet18.py | 2 +- experiments/resnet50.py | 2 +- experiments/softmax.py | 2 +- scripts/CompilerOpt_experiment/DMAopt.sh | 2 +- scripts/chiplet.sh | 10 ++--- scripts/chiplet_prep.py | 11 +++-- scripts/sparsity_experiment/run.sh | 12 +++--- scripts/stonne_experiment/run.sh | 6 +-- scripts/stonne_experiment2/tog_gen.py | 2 +- tests/test_compile_overhead.py | 2 +- tests/test_hetro.py | 2 +- tests/test_scheduler.py | 2 +- tests/test_scheduler_batching.py | 2 +- tutorial/session1/CompilerOptimization.ipynb | 4 +- tutorial/session1/ExecutionMode.ipynb | 8 ++-- tutorial/session1/LogAnalysis.ipynb | 2 +- tutorial/session1/Mapping.ipynb | 4 +- 76 files changed, 708 insertions(+), 745 deletions(-) delete mode 100644 configs/heterogeneous_c2_simple_noc.json create mode 100644 configs/heterogeneous_c2_simple_noc.yml delete mode 100644 configs/stonne_big_c1_simple_noc.json create mode 100644 configs/stonne_big_c1_simple_noc.yml delete mode 100644 configs/stonne_single_c1_simple_noc.json create mode 100644 configs/stonne_single_c1_simple_noc.yml delete mode 100644 configs/stonne_validation_c1_simple_noc.json create mode 100644 configs/stonne_validation_c1_simple_noc.yml delete mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv2.json create mode 100644 
configs/systolic_ws_128x128_c1_booksim_tpuv2.yml delete mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv3.json create mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv3.yml delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml delete mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3.json create mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3.yml delete mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json create mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml delete mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3.json create mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml delete mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json create mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml delete mode 100644 
configs/systolic_ws_8x8_c1_booksim.json create mode 100644 configs/systolic_ws_8x8_c1_booksim.yml delete mode 100644 configs/systolic_ws_8x8_c1_simple_noc.json create mode 100644 configs/systolic_ws_8x8_c1_simple_noc.yml diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 8d668b58..ab8aea69 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -1,7 +1,7 @@ import os import sys import importlib -import json +import yaml CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") @@ -13,51 +13,53 @@ def __getattr__(name): # TOGSim config config_path = os.environ.get('TOGSIM_CONFIG', - default=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json") + default=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml") if name == "CONFIG_TOGSIM_CONFIG": return config_path - config_json = json.load(open(config_path, 'r')) + + with open(config_path, 'r') as f: + config_yaml = yaml.safe_load(f) # Hardware info config if name == "vpu_num_lanes": - return config_json["vpu_num_lanes"] + return config_yaml["vpu_num_lanes"] if name == "CONFIG_SPAD_INFO": return { "spad_vaddr" : 0xD0000000, "spad_paddr" : 0x2000000000, - "spad_size" : config_json["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane + "spad_size" : config_yaml["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane } if name == "CONFIG_PRECISION": return 4 # 32bit if name == "CONFIG_NUM_CORES": - return config_json["num_cores"] + return config_yaml["num_cores"] if name == "vpu_vector_length_bits": - return config_json["vpu_vector_length_bits"] + return config_yaml["vpu_vector_length_bits"] if name == "pytorchsim_functional_mode": - return config_json['pytorchsim_functional_mode'] + return config_yaml['pytorchsim_functional_mode'] if name == 
"pytorchsim_timing_mode": - return config_json['pytorchsim_timing_mode'] + return config_yaml['pytorchsim_timing_mode'] # Mapping strategy if name == "codegen_mapping_strategy": - codegen_mapping_strategy = config_json["codegen_mapping_strategy"] + codegen_mapping_strategy = config_yaml["codegen_mapping_strategy"] assert(codegen_mapping_strategy in ["heuristic", "autotune", "external-then-heuristic", "external-then-autotune"]), "Invalid mapping strategy!" return codegen_mapping_strategy if name == "codegen_external_mapping_file": - return config_json["codegen_external_mapping_file"] + return config_yaml["codegen_external_mapping_file"] # Autotune config if name == "codegen_autotune_max_retry": - return config_json["codegen_autotune_max_retry"] + return config_yaml["codegen_autotune_max_retry"] if name == "codegen_autotune_template_topk": - return config_json["codegen_autotune_template_topk"] + return config_yaml["codegen_autotune_template_topk"] # Compiler Optimization if name == "codegen_compiler_optimization": - opt_level = config_json["codegen_compiler_optimization"] + opt_level = config_yaml["codegen_compiler_optimization"] valid_opts = { "fusion", "reduction_epilogue", diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 786e7398..18bf65c3 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -276,7 +276,7 @@ def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.json' + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.yml' TOGSim = TOGSimulator(togsim_path, stonne_config_path) result_path = TOGSim.simulation(onnx_path) TOGSimulator.get_result_from_file(result_path) diff --git a/README.md b/README.md 
index 103131c1..4d98baa4 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Our load generator supports multi-tenancy experiments. You can run a simple exam python tests/test_scheduler.py ``` Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. -In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id. +In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.yml`). The compiled PyTorch models are then registered with a unique model id. ```python3 import os @@ -228,7 +228,7 @@ import sys import torch from torchvision.models import resnet18 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml' sys.path.append(base_path) from tests.test_transformer import EncoderBlock @@ -244,7 +244,7 @@ SchedulerDNNModel.register_model("model0", opt_model0) SchedulerDNNModel.register_model("model1", opt_model1) ``` -The config file(`.json`) specifies two key items: +The config file(`.yml`) specifies two key items: - `num_partition`: The total number of independent request queues to create. - `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: @@ -415,7 +415,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. 
```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 4786fd32..a46243f0 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -4,7 +4,7 @@ import subprocess import re import sys -import json +import yaml import time import datetime import threading @@ -204,7 +204,7 @@ class TOGSimulator(): def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: self.base_dir = togsim_path self.config_path = config_path - self.config_json = self.load_json(self.config_path) + self.config_yaml = self.load_yaml(self.config_path) self.process = None self.vectorlane_size = vectorlane_size @@ -347,40 +347,41 @@ def sram_dealloc(cls, buf_name, addr_range): def create_attribute_file(self, attribute_path, inputs, **kwargs): address_info = {} sram_buffer = {} - json_content = {} + yaml_content = {} + os.makedirs(attribute_path, exist_ok=True) index = str(len(os.listdir(attribute_path))) attribute_path = os.path.join(attribute_path, index) for idx, tensor in enumerate(inputs): address_info[f"arg{idx}"] = tensor.data_ptr() - json_content["address_info"] = address_info + yaml_content["address_info"] = address_info for buf_name, range in self.ALLOC_POOL.items(): sram_buffer[buf_name] = range - json_content["sram_alloc"] = sram_buffer + yaml_content["sram_alloc"] = sram_buffer with open(attribute_path, "w") as f: - json.dump(json_content, f, indent=4) + yaml.dump(yaml_content, f, default_flow_style=False) f.flush() os.fsync(f.fileno()) # There could be a race condition. 
return attribute_path - def load_json(self, config_path): + def load_yaml(self, config_path): config_path = Path(config_path) if not config_path.is_file(): - raise FileNotFoundError(f"JSON file not found: {config_path}") + raise FileNotFoundError(f"YAML file not found: {config_path}") try: with open(config_path, "r") as file: - data = json.load(file) + data = yaml.safe_load(file) return data - except json.JSONDecodeError as e: - raise ValueError(f"Invalid JSON format: {e}") + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML format: {e}") def get_core_freq(self): - if "core_freq_mhz" in self.config_json: - return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz + if "core_freq_mhz" in self.config_yaml: + return self.config_yaml["core_freq_mhz"] * 1000 * 1000 # MHz else: raise KeyError("Key 'core_freq' not found in JSON.") @@ -462,6 +463,6 @@ def get_result_from_file(result_path): return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json") + sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml") sim.interactive_simulation() sim.until(4000) \ No newline at end of file diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h index c62c3e0b..2fd62681 100644 --- a/TOGSim/include/Common.h +++ b/TOGSim/include/Common.h @@ -27,4 +27,5 @@ typedef uint64_t addr_type; typedef uint64_t cycle_type; +bool loadConfig(const std::string& config_path, YAML::Node& config_yaml); SimulationConfig initialize_config(YAML::Node config); \ No newline at end of file diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h index 07e5b212..9c176966 100644 --- a/TOGSim/include/TileGraphParser.h +++ b/TOGSim/include/TileGraphParser.h @@ -9,6 +9,7 @@ #include "Instruction.h" #include 
"sstStonne.h" #include "IntervalTree.h" +#include "Common.h" #include "onnx/defs/schema.h" #include "onnx/onnx-operators_pb.h" #include "onnx/onnx_pb.h" @@ -33,8 +34,6 @@ enum class LoopType { INNER_LOOP }; -bool loadConfig(const std::string& config_path, YAML::Node& config_yaml); - class TileNode { public: TileNode(onnx::NodeProto& node); diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index 63d360c6..b15381a6 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -1,5 +1,22 @@ #include "Common.h" +bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) { + try { + config_yaml = YAML::LoadFile(config_path); + spdlog::info("[LoadConfig] Success to open \"{}\"", config_path); + return true; + } catch (const YAML::BadFile& e) { + spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path); + return false; + } catch (const YAML::ParserException& e) { + spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", config_path, e.what()); + return false; + } catch (const std::exception& e) { + spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what()); + return false; + } +} + template T get_config_value(const YAML::Node& config, std::string key) { if (config[key]) { diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index ae8954d9..515f6247 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -1,22 +1,5 @@ #include "TileGraphParser.h" -bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) { - try { - config_yaml = YAML::LoadFile(config_path); - spdlog::info("[LoadConfig] Success to open \"{}\"", config_path); - return true; - } catch (const YAML::BadFile& e) { - spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path); - return false; - } catch (const YAML::ParserException& e) { - spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", 
config_path, e.what()); - return false; - } catch (const std::exception& e) { - spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what()); - return false; - } -} - void printIndexMap(std::string prefix, const std::map& indexMap) { std::ostringstream oss; for (const auto& [key, value] : indexMap) { diff --git a/configs/heterogeneous_c2_simple_noc.json b/configs/heterogeneous_c2_simple_noc.json deleted file mode 100644 index a68f38c2..00000000 --- a/configs/heterogeneous_c2_simple_noc.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "core_type" : ["stonne", "ws_mesh"], - "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - }, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/heterogeneous_c2_simple_noc.yml b/configs/heterogeneous_c2_simple_noc.yml new file mode 100644 index 00000000..9c596d85 --- /dev/null +++ b/configs/heterogeneous_c2_simple_noc.yml @@ -0,0 +1,37 @@ +core_type: +- stonne +- ws_mesh +stonne_config_path: 
/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_stonne_per_core: 8 +num_stonne_port: 64 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 +num_partition: 2 +partition: + core_0: 0 + core_1: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/stonne_big_c1_simple_noc.json b/configs/stonne_big_c1_simple_noc.json deleted file mode 100644 index 0a8ca3c2..00000000 --- a/configs/stonne_big_c1_simple_noc.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_stonne_per_core" : 8, - "num_stonne_port" : 64, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 8, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycless": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16 -} \ No newline at end of file diff --git a/configs/stonne_big_c1_simple_noc.yml b/configs/stonne_big_c1_simple_noc.yml new file mode 100644 index 00000000..b14838c8 --- /dev/null +++ b/configs/stonne_big_c1_simple_noc.yml @@ -0,0 +1,21 @@ 
+core_type: +- stonne +stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_stonne_per_core: 8 +num_stonne_port: 64 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 8 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 diff --git a/configs/stonne_single_c1_simple_noc.json b/configs/stonne_single_c1_simple_noc.json deleted file mode 100644 index 3421d4f1..00000000 --- a/configs/stonne_single_c1_simple_noc.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 8, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 700, - "dram_channels": 8, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 8 -} \ No newline at end of file diff --git a/configs/stonne_single_c1_simple_noc.yml b/configs/stonne_single_c1_simple_noc.yml new file mode 100644 index 00000000..0ed7962c --- /dev/null +++ b/configs/stonne_single_c1_simple_noc.yml @@ -0,0 +1,21 @@ +core_type: +- stonne +stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg +num_cores: 1 +core_freq_mhz: 700 +core_stats_print_period_cycles: 10000 +num_stonne_per_core: 1 +num_stonne_port: 8 + +dram_type: ramulator2 +dram_freq_mhz: 700
+dram_channels: 8 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 700 +icnt_injection_ports_per_core: 8 diff --git a/configs/stonne_validation_c1_simple_noc.json b/configs/stonne_validation_c1_simple_noc.json deleted file mode 100644 index fb196dfb..00000000 --- a/configs/stonne_validation_c1_simple_noc.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "core_type" : ["stonne"], - "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg", - "num_cores" : 1, - "core_freq_mhz" : 1000, - "core_stats_print_period_cycles" : 10000, - "num_stonne_per_core" : 1, - "num_stonne_port" : 32, - - "dram_type" : "simple", - "dram_freq_mhz" : 1000, - "dram_channels": 1, - "dram_req_size_byte": 32, - "dram_latency" : 100, - "dram_stats_print_period_cycles": 10000, - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 1000, - "icnt_injection_ports_per_core" : 8 -} \ No newline at end of file diff --git a/configs/stonne_validation_c1_simple_noc.yml b/configs/stonne_validation_c1_simple_noc.yml new file mode 100644 index 00000000..f86dcce1 --- /dev/null +++ b/configs/stonne_validation_c1_simple_noc.yml @@ -0,0 +1,22 @@ +core_type: +- stonne +stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg +num_cores: 1 +core_freq_mhz: 1000 +core_stats_print_period_cycles: 10000 +num_stonne_per_core: 1 +num_stonne_port: 32 + +dram_type: simple +dram_freq_mhz: 1000 +dram_channels: 1 +dram_req_size_byte: 32 +dram_latency: 100 +dram_stats_print_period_cycles: 10000 +l2d_type: datacache +l2d_config: S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32 + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 1000 
+icnt_injection_ports_per_core: 8 diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/configs/systolic_ws_128x128_c1_booksim_tpuv2.json deleted file mode 100644 index 686827dc..00000000 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :700, - "dram_channels": 16, - "dram_req_size_byte": 32, - - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml new file mode 100644 index 00000000..08149005 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml @@ -0,0 +1,26 @@ +num_cores: 1 +core_freq_mhz: 700 +core_stats_print_period_cycles: 10000 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 700 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 700 +icnt_injection_ports_per_core: 16 +booksim_config_path: ../configs/booksim2_configs/fly_c16_m16.icnt + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 
+codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.json b/configs/systolic_ws_128x128_c1_booksim_tpuv3.json deleted file mode 100644 index 1109dc0f..00000000 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt", - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml new file mode 100644 index 00000000..12304ce2 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 
+booksim_config_path: ../configs/booksim2_configs/fly_c16_m16.icnt + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json deleted file mode 100644 index 22aedcf8..00000000 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 700, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycless": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml new file mode 100644 index 00000000..aec29ff8 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml @@ -0,0 +1,29 @@ +num_cores: 1 +core_freq_mhz: 700 +core_stats_print_period_cycles: 10000 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 700 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 700 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json deleted file mode 100644 index e8e489d9..00000000 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml new file mode 100644 index 00000000..72873f1c --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 
+vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json deleted file mode 100644 index 980bfc73..00000000 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 8, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml new file mode 100644 index 00000000..c2e962e3 --- /dev/null +++ 
b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 8 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json deleted file mode 100644 index 02bfd75c..00000000 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 1050, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 4, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :1200, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 1050, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - 
"codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml new file mode 100644 index 00000000..0415876d --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml @@ -0,0 +1,32 @@ +num_cores: 1 +core_freq_mhz: 1050 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 4 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 1200 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml +l2d_type: datacache +l2d_config: S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32 + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 1050 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3.json deleted file mode 100644 index 66566324..00000000 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 940, - 
"icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt", - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml new file mode 100644 index 00000000..e411c0f3 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml @@ -0,0 +1,30 @@ +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 +booksim_config_path: ../configs/booksim2_configs/fly_c32_m32.icnt + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json deleted file mode 100644 index 8ef47e87..00000000 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "sram_size" : 65536, - "core_print_interval" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq" : 940, 
- "dram_channels": 8, - "dram_req_size": 32, - "dram_latency" : 10, - "dram_nbl" : 2, - "dram_print_interval": 10000, - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_latency_cycles" : 10, - "icnt_freq" : 940, - "icnt_injection_ports_per_core" : 16, - "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt", - - "precision" : 4, - "scheduler" : "simple", - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":0 - }, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml new file mode 100644 index 00000000..f164b108 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml @@ -0,0 +1,39 @@ +num_cores: 2 +core_freq_mhz: 940 +sram_size: 65536 +core_print_interval: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq: 940 +dram_channels: 8 +dram_req_size: 32 +dram_latency: 10 +dram_nbl: 2 +dram_print_interval: 10000 +dram_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: booksim2 +icnt_latency_cycles: 10 +icnt_freq: 940 +icnt_injection_ports_per_core: 16 +icnt_config_path: ../configs/booksim2_configs/fly_c32_m8.icnt +precision: 4 +scheduler: simple +num_partition: 2 +partition: + core_0: 0 + core_1: 0 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git 
a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json deleted file mode 100644 index ecd671bf..00000000 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "dram_num_partitions" : 2, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 1000, - "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - "icnt_stats_print_period_cycles" : 10000, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml new file mode 100644 index 00000000..e38f091f --- /dev/null +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml @@ -0,0 +1,32 @@ +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +dram_num_partitions: 2 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 1000 
+icnt_injection_ports_per_core: 16 +booksim_config_path: ../configs/booksim2_configs/chiplet_32_32_2.icnt +icnt_stats_print_period_cycles: 10000 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json deleted file mode 100644 index 168fbe3a..00000000 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "dram_num_partitions" : 1, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 1000, - "icnt_injection_ports_per_core" : 16, - "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt", - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml new file mode 100644 index 00000000..57696243 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml @@ -0,0 +1,31 @@ +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 
+num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +dram_num_partitions: 1 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 1000 +icnt_injection_ports_per_core: 16 +booksim_config_path: ../configs/booksim2_configs/chiplet_32_32_2.icnt + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json deleted file mode 100644 index 0a5f15b2..00000000 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 700, - "core_stats_print_period_cycles" : 10000, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :700, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 700, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml new 
file mode 100644 index 00000000..f0686055 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml @@ -0,0 +1,29 @@ +num_cores: 2 +core_freq_mhz: 700 +core_stats_print_period_cycles: 10000 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 700 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 700 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json deleted file mode 100644 index f099b93d..00000000 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline 
at end of file diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml new file mode 100644 index 00000000..511a5a09 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml @@ -0,0 +1,30 @@ +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json deleted file mode 100644 index 681ef884..00000000 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "num_partition" : 2, - "partition": { - "core_0":0, - "core_1":1 - }, - - 
"pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml new file mode 100644 index 00000000..499ad823 --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml @@ -0,0 +1,34 @@ +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 +num_partition: 2 +partition: + core_0: 0 + core_1: 1 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json deleted file mode 100644 index d09228a1..00000000 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 1050, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 4, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :1200, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - 
"dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml", - "l2d_type" : "datacache", - "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 1050, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml new file mode 100644 index 00000000..da40f01e --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml @@ -0,0 +1,32 @@ +num_cores: 2 +core_freq_mhz: 1050 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 4 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 1200 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml +l2d_type: datacache +l2d_config: S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32 + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 1050 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_8x8_c1_booksim.json b/configs/systolic_ws_8x8_c1_booksim.json deleted file mode 100644 index 851664e6..00000000 --- a/configs/systolic_ws_8x8_c1_booksim.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 800, - 
"core_stats_print_period_cycles" : 100000, - - "vpu_num_lanes" : 8, - "vpu_spad_size_kb_per_lane" : 32, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "booksim2", - "icnt_freq_mhz" : 800, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml new file mode 100644 index 00000000..6fd305f9 --- /dev/null +++ b/configs/systolic_ws_8x8_c1_booksim.yml @@ -0,0 +1,27 @@ +num_cores: 1 +core_freq_mhz: 800 +core_stats_print_period_cycles: 100000 + +vpu_num_lanes: 8 +vpu_spad_size_kb_per_lane: 32 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 800 +dram_channels: 1 +dram_req_size_byte: 64 +dram_num_burst_length: 4 +dram_stats_print_period_cycles: 100000 +ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml + +icnt_type: booksim2 +icnt_freq_mhz: 800 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/configs/systolic_ws_8x8_c1_simple_noc.json b/configs/systolic_ws_8x8_c1_simple_noc.json deleted file mode 100644 index 2eb7e183..00000000 --- a/configs/systolic_ws_8x8_c1_simple_noc.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 800, - "core_stats_print_period_cycles" : 100000, - - "vpu_num_lanes" : 8, - "vpu_spad_size_kb_per_lane" : 32, - 
"vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" :800, - "dram_channels": 1, - "dram_req_size_byte": 64, - "dram_num_burst_length" : 4, - "dram_stats_print_period_cycles": 100000, - "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 800, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml new file mode 100644 index 00000000..274f633c --- /dev/null +++ b/configs/systolic_ws_8x8_c1_simple_noc.yml @@ -0,0 +1,28 @@ +num_cores: 1 +core_freq_mhz: 800 +core_stats_print_period_cycles: 100000 + +vpu_num_lanes: 8 +vpu_spad_size_kb_per_lane: 32 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 800 +dram_channels: 1 +dram_req_size_byte: 64 +dram_num_burst_length: 4 +dram_stats_print_period_cycles: 100000 +ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 800 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/experiments/BERT.py b/experiments/BERT.py index 3311682c..5ccd3084 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = 
os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index 99eed4ed..9cfd1e98 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json +export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index 9a19e9af..e84ab1a9 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -4,8 +4,8 @@ CONFIG_DIR="$TORCHSIM_DIR/configs" SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" configs=( - "systolic_ws_128x128_c2_simple_noc_tpuv3.json" - "systolic_ws_128x128_c2_booksim_tpuv3.json" + "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" + "systolic_ws_128x128_c2_booksim_tpuv3.yml" ) target_list=( diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index fe872e02..467949af 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -2,10 +2,10 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.json" - "systolic_ws_128x128_c2_simple_noc_tpuv3.json" - #"systolic_ws_128x128_c2_booksim_tpuv3.json" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.json" + # 
"systolic_ws_8x8_c1_simple_noc.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" + #"systolic_ws_128x128_c2_booksim_tpuv3.yml" + # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SIZE_LIST=( diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index 19613a34..fb681c74 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -2,10 +2,10 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.json" - "systolic_ws_128x128_c2_simple_noc_tpuv3.json" - #"systolic_ws_128x128_c2_booksim_tpuv3.json" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.json" + # "systolic_ws_8x8_c1_simple_noc.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" + #"systolic_ws_128x128_c2_booksim_tpuv3.yml" + # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SHAPE_LIST=( diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index 6f3385f1..dc0fdd20 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -2,10 +2,10 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.json" - "systolic_ws_128x128_c2_simple_noc_tpuv3.json" - #"systolic_ws_128x128_c2_booksim_tpuv3.json" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.json" + # "systolic_ws_8x8_c1_simple_noc.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" + #"systolic_ws_128x128_c2_booksim_tpuv3.yml" + # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SHAPE_LIST=( diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index 
ca4cfa39..2346ab3c 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -2,10 +2,10 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.json" - "systolic_ws_128x128_c2_simple_noc_tpuv3.json" - #"systolic_ws_128x128_c2_booksim_tpuv3.json" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.json" + # "systolic_ws_8x8_c1_simple_noc.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" + #"systolic_ws_128x128_c2_booksim_tpuv3.yml" + # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SIZE_LIST=( diff --git a/experiments/attention.py b/experiments/attention.py index bbd2734e..842f105a 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -36,7 +36,7 @@ def attention(query, key, value): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/conv.py b/experiments/conv.py index f439c5e3..25952fb0 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path 
sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/gemm.py b/experiments/gemm.py index e92200d1..3090e331 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/layernorm.py b/experiments/layernorm.py index 74b6d286..9c9934a1 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -27,7 +27,7 @@ def run_layernorm(size, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 45311d59..5451e0f5 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') config_prefix =
config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/resnet50.py b/experiments/resnet50.py index 4f03ea15..83d82db4 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -29,7 +29,7 @@ def run_resnet(batch, config): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/experiments/softmax.py b/experiments/softmax.py index b47bd685..580d56ca 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json') + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 5c2dc65c..9e494d9b 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json" +export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml" # None 
FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index 0d56ecae..e622874b 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -19,11 +19,11 @@ GEMM_DIR_NAME=$(basename "$GEMM_PATH") echo "GEMM Directory Name: $GEMM_DIR_NAME" CONFIG_LIST=( - "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml" ) CONFIG_LIST2=( - "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_booksim_tpuv3.json" - "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml" + "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml" ) shift shift @@ -39,7 +39,7 @@ MODELS_LIST="$GEMM_PATH/tile_graph.onnx" ATTRIBUTE_PATH="$GEMM_PATH/runtime_0000/attribute" for CONFIG in "${CONFIG_LIST[@]}"; do - CONFIG_NAME=$(basename "$CONFIG" .json) + CONFIG_NAME=$(basename "$CONFIG" .yml) for ATTRIBUTE_FILE in "${ATTRIBUTE_FILES[@]}"; do ATTRIBUTE_NAME=$(basename "$ATTRIBUTE_FILE") @@ -56,7 +56,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do done for CONFIG in "${CONFIG_LIST2[@]}"; do - CONFIG_NAME=$(basename "$CONFIG" .json) + CONFIG_NAME=$(basename "$CONFIG" .yml) ATTRIBUTE_NAME=0 RESULTS_DIR="./chiplet_results$INDEX_NAME/$GEMM_DIR_NAME/$ATTRIBUTE_NAME" mkdir -p "$RESULTS_DIR" diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 32f7ad50..4f8b7f7c 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -1,5 +1,5 @@ import os -import json +import yaml import shutil import argparse import torch @@ -41,9 +41,11 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): if not os.path.exists(file_path): print(f"File {file_path} does not exist.") return + with open(file_path, 'r') as f: - data = json.load(f) - # address_numa_stride와 subgraph_map 추가 + data = yaml.safe_load(f) + + # address_numa_stride, subgraph_map if address_numa_stride: 
data['address_numa_stride'] = address_numa_stride if subgraph_map: @@ -52,8 +54,9 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): output_path = file_path = os.path.join(dump_path, 'runtime_0000', 'attribute') os.makedirs(output_path, exist_ok=True) output_file = os.path.join(output_path, name) + with open(output_file, 'w') as f: - json.dump(data, f, indent=4) + yaml.dump(data, f, default_flow_style=False, sort_keys=False) print(f"Modified file saved to {output_file}") if __name__ == "__main__": diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 4f5dd3a6..84c818ac 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 
/workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 
/workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json" +export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh index 1825817f..2e386d9c 100755 --- a/scripts/stonne_experiment/run.sh +++ b/scripts/stonne_experiment/run.sh @@ -2,8 +2,8 @@ export TORCHSIM_FORCE_TIME_M=1024 export TORCHSIM_FORCE_TIME_K=1024 export TORCHSIM_FORCE_TIME_N=1024 -python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log -python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.json --mode 1 > hetero/big.log -python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log +python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.yml --mode 0 > hetero/big_sparse.log +python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml --mode 1 > hetero/big.log +python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.yml --mode 2 > hetero/hetero.log echo "All processes completed!" 
diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index d4f93d4d..e8013da7 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -72,7 +72,7 @@ def extract_simulation_stats(result_path): continue tog_path = os.path.join(path, "tile_graph.onnx") togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.json' + stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.yml' backsim = TOGSimulator(togsim_path, stonne_config_path) result_path = backsim.simulation(tog_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py index 030f548e..449707a5 100644 --- a/tests/test_compile_overhead.py +++ b/tests/test_compile_overhead.py @@ -21,7 +21,7 @@ # shutil.rmtree("/tmp/torchinductor") #except FileNotFoundError: # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tests/test_hetro.py b/tests/test_hetro.py index a0716e2d..9fac8c65 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -17,7 +17,7 @@ def custom_matmul(a, b): parser.add_argument("--N", type=int, default=128, help="Input layer size") parser.add_argument("--K", type=int, 
default=128, help="Hidden layer size") parser.add_argument("--sparsity", type=float, default=0.9, help="Output layer size") - parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.json", help="Output layer size") + parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="Output layer size") parser.add_argument("--mode", type=int, default=0, help="Output layer size") args = parser.parse_args() diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 4860de56..9c7ca255 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -7,7 +7,7 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') sys.path.append(base_path) from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json' +config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml' target_model1 = model1().eval() target_model2 = model2(768, 12).eval() diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py index 53f9256d..65213ef0 100644 --- a/tests/test_scheduler_batching.py +++ b/tests/test_scheduler_batching.py @@ -17,7 +17,7 @@ target_model1 = model1().eval() # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json") + scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml") # Register compiled model opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) SchedulerDNNModel.register_model("resnet18", opt_model1) diff --git a/tutorial/session1/CompilerOptimization.ipynb 
b/tutorial/session1/CompilerOptimization.ipynb index 178974c1..ead695c0 100644 --- a/tutorial/session1/CompilerOptimization.ipynb +++ b/tutorial/session1/CompilerOptimization.ipynb @@ -18,7 +18,7 @@ "import sys\n", "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", "sys.path.append(base_dir)\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"" + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"" ] }, { @@ -71,7 +71,7 @@ "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb index 22e00bed..b6f0e048 100644 --- a/tutorial/session1/ExecutionMode.ipynb +++ b/tutorial/session1/ExecutionMode.ipynb @@ -56,7 +56,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -78,7 +78,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "\n", "input = 
torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -101,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "\n", "input = torch.randn(2048, 2048).to(device=device)\n", "weight = torch.randn(2048, 2048).to(device=device)\n", @@ -132,7 +132,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n", "\n", "input = torch.randn(2048, 2048).to(device=device)\n", "weight = torch.randn(2048, 2048).to(device=device)\n", diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb index 4f1e17cb..d3207af1 100644 --- a/tutorial/session1/LogAnalysis.ipynb +++ b/tutorial/session1/LogAnalysis.ipynb @@ -18,7 +18,7 @@ "import sys\n", "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", "sys.path.append(base_dir)\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "os.environ['TORCHSIM_DUMP_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")" ] }, diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb index b02c98fe..684b69c0 100644 --- a/tutorial/session1/Mapping.ipynb +++ b/tutorial/session1/Mapping.ipynb @@ -68,7 +68,7 @@ "source": [ "torch._dynamo.reset()\n", "\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.json\"\n", + 
"os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -101,7 +101,7 @@ "source": [ "torch._dynamo.reset()\n", "\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.json\"\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", From dce58d080d8bd044e8f59197f223532725e727b0 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 09:31:12 +0000 Subject: [PATCH 022/194] [Test] Change attention mask for Llama --- tests/Llama/test_llama.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py index 98820fd9..443f3fc2 100644 --- a/tests/Llama/test_llama.py +++ b/tests/Llama/test_llama.py @@ -274,7 +274,17 @@ def run_custom_llama_test( g = torch.Generator().manual_seed(0) vocab = cfg.vocab_size input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long) - attn_mask_cpu = torch.ones_like(input_ids_cpu, dtype=torch.long) + + min_dtype = torch.finfo(torch_dtype).min + causal_mask = torch.zeros((seq_len, seq_len), dtype=torch_dtype, device="cpu") + + if seq_len > 1: + causal_mask = torch.triu(torch.full_like(causal_mask, min_dtype), diagonal=1) + + cache_position = torch.arange(seq_len, device="cpu") + mask_condition = torch.arange(seq_len, device="cpu") > cache_position.reshape(-1, 1) + causal_mask.masked_fill_(mask_condition, min_dtype) + attn_mask_cpu = causal_mask[None, None, :, :].expand(batch, 1, -1, -1) input_ids_dev = input_ids_cpu.to(device) attn_mask_dev = attn_mask_cpu.to(device) @@ -325,11 +335,11 @@ def
run_llama_model_test( g = torch.Generator().manual_seed(0) input_ids_cpu = torch.randint(low=0, high=cfg.vocab_size, size=(batch, seq_len), generator=g, dtype=torch.long) - # FIXME: Currently, the user must provide the mask manually. - # There is a functionality issue with the model generating the mask internally, - # so we explicitly create and inject a Causal Mask (lower triangular matrix) from the outside. - causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long)) - attn_mask_cpu = causal_mask.unsqueeze(0).unsqueeze(0).expand(batch, 1, -1, -1).bool() + min_dtype = torch.finfo(torch_dtype).min + causal_mask = torch.full((seq_len, seq_len), fill_value=min_dtype, dtype=torch_dtype, device="cpu") + if seq_len > 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + attn_mask_cpu = causal_mask[None, None, :, :].expand(batch, 1, -1, -1) input_ids_dev = input_ids_cpu.to(device) attn_mask_dev = attn_mask_cpu.to(device) From 1c2ab36117f90ff67f0c579220ad54568654ab91 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 13:54:30 +0000 Subject: [PATCH 023/194] [Autotune] Fix autotune log path --- PyTorchSimFrontend/extension_codecache.py | 4 +- .../mlir/mlir_codegen_backend.py | 2 +- Simulator/simulator.py | 43 +++++++++++++------ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 4d57b987..2e35220c 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -278,7 +278,7 @@ def dummy_simulator(*args, **kwargs): vectorlane_size=vectorlane_size, spad_info=spad_info, silent_mode=silent_mode) if not extension_config.pytorchsim_timing_mode: - return + return [float("inf")] onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") @@ -286,7 +286,7 @@ def dummy_simulator(*args, **kwargs): TOGSim = TOGSimulator(togsim_path, 
extension_config.CONFIG_TOGSIM_CONFIG) TOGSim.vectorlane_size = vectorlane_size attribute_path = TOGSim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode) + result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode, autotune_mode=autotune) result = TOGSimulator.get_result_from_file(result_path) return result diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index cda996ab..266d884b 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -919,7 +919,7 @@ def get_cycle(choice): return float("inf") # Exceeded maximum number of autotuning attempts choices = self.make_choices(*args) - if len(choices) == 0: # can't autotune + if len(choices) == 0: # Can't autotune return [None, None] with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index a46243f0..672ae6ec 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -9,6 +9,7 @@ import datetime import threading from pathlib import Path +import uuid import torch import numpy as np @@ -214,7 +215,7 @@ def get_togsim_command(self): cmd = f"{bin} --config {config}" return cmd - def simulation(self, model_path, attribute_path="", silent_mode=False): + def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False): def show_progress(): i = 0 while not finished: @@ -245,19 +246,35 @@ def show_progress(): if not silent_mode: finished = True progress_thread.join() - print("[TOGSim] Command failed with exit code", e.returncode) - print("[TOGSim] Error output:", e.output) + with print_lock: + print("[TOGSim] Command failed with exit code", e.returncode) + print("[TOGSim] Error output:", e.output) assert 0 - # Save result to result_path - 
result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH - os.makedirs(result_path, exist_ok=True) - file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+".log" - result_path = os.path.join(result_path, file_name) + + # Separate Autotune logs + if autotune_mode: + base_dir = Path(model_path).parent / "togsim_result" + base_dir.mkdir(parents=True, exist_ok=True) + file_name = f"{len(list(base_dir.iterdir()))}.log" + else: + base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH) + unique_id = uuid.uuid4().hex[:8] + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + file_name = f"{unique_id}_{timestamp}.log" + + base_dir.mkdir(parents=True, exist_ok=True) + result_path = base_dir / file_name + + # Prevent race condition with open(result_path, "w") as f: f.write(result.decode()) + f.flush() + os.fsync(f.fileno()) + if not silent_mode or extension_config.CONFIG_DEBUG_MODE: model_path_log = f' of "{model_path}" ' if extension_config.CONFIG_DEBUG_MODE else " " - print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') + with print_lock: + print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') return result_path def interactive_simulation(self): @@ -406,9 +423,9 @@ def find_zero_sub_tensors(self, tensor): def get_result_from_file(result_path): core_metrics = {} dram_channel_bw = {} - avg_dram_bw = None - simulation_time = None - total_cycle = None + avg_dram_bw = 0.0 + simulation_time = float("inf") + total_cycle = float("inf") # Read and find total stat position with open(result_path, "r") as f: @@ -423,7 +440,7 @@ def get_result_from_file(result_path): break if simulation_finished_idx == -1: - print("[TOGSim] Tried to parsing wrong formated output file!") + print(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). 
The file may be improperly formatted.") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] From 20af55066a6e1a73e99149ac6d3b23b903031264 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 9 Dec 2025 14:39:58 +0000 Subject: [PATCH 024/194] [Fix] Fix codegen error in ops.select --- PyTorchSimFrontend/mlir/mlir_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index af323c1e..21995512 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -98,6 +98,7 @@ def where(condition, operand1, operand2, *args, var_info=None, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) cond_type = var_info[condition] operand_type = var_info[operand1] + condition = ops.to_bool(condition) if cond_type[0] < tile_size: condition = ops.broadcast(condition, tile_size) elif cond_type[0] > tile_size: @@ -969,6 +970,9 @@ def ext(operand, dtype, *args, var_info=None, **kwargs): @staticmethod def to_bool(operand, *args, var_info=None, **kwargs): tile_size, ret_type = var_info[operand] + if ret_type == "i1": + return operand, [tile_size, ret_type] + const_one = ops.constant(0, ret_type) if tile_size > 1: const_one = ops.broadcast(const_one, tile_size) From c39c3a3c8e661db989c7d87bbd5bba9c981e5075 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 11 Dec 2025 08:14:27 +0000 Subject: [PATCH 025/194] [Tutorial] Update environment setting for the tutorial --- Dockerfile.base | 2 +- Dockerfile.ksc2025 | 2 +- tutorial/session2/Warmup.py | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Dockerfile.base b/Dockerfile.base index 1ac5e175..6a21760b 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -33,7 +33,7 @@ RUN apt -y update && \ python3-dev python-is-python3 libboost-all-dev \ libhdf5-serial-dev 
python3-pydot libpng-dev libelf-dev pkg-config pip \ python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \ - pip install onnx matplotlib scikit-learn && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/* + pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/* # Download RISC-V tool chain RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025 index 2ac210e0..b70b2b77 100644 --- a/Dockerfile.ksc2025 +++ b/Dockerfile.ksc2025 @@ -33,7 +33,7 @@ RUN apt -y update && apt -y upgrade && \ python3-dev python-is-python3 doxygen libboost-all-dev \ libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ python3-venv black libssl-dev libasan5 libubsan1 -RUN pip install mypy pre-commit jupyter +RUN pip install mypy pre-commit jupyter pydot tabulate jupyterlab_execute_time # Pass Access Token securely ENV PATH=$PATH:/root/.local/bin diff --git a/tutorial/session2/Warmup.py b/tutorial/session2/Warmup.py index ce215cf5..a45734ad 100644 --- a/tutorial/session2/Warmup.py +++ b/tutorial/session2/Warmup.py @@ -1,13 +1,19 @@ from typing import List import os from torch.fx.passes.graph_drawer import FxGraphDrawer -os.environ['TORCH_LOGS'] = 'bytecode' import torch +import inspect def dummy_compiler(gm: torch.fx.GraphModule, _): - gm.graph.print_tabular() + sep = "-" * 80 drawer = FxGraphDrawer(gm, "my_model") drawer.get_dot_graph().write_svg("fx_graph.svg") + + print(f"\n{sep}\n[1] FX Graph Tabular View\n{sep}") + gm.graph.print_tabular() + + print(f"\n{sep}\n[2] Generated Forward Source Code\n{sep}") + print(inspect.getsource(gm.forward)) return gm.forward # Return a callable object class MyModel(torch.nn.Module): @@ -23,5 +29,4 @@ def f(x, y): if __name__ == "__main__": x = torch.randn(7, 
5,requires_grad=False) y = torch.randn(5, 3,requires_grad=False) - k = f(x, y) - print(k) + k = f(x, y) \ No newline at end of file From 8678fe631db988c70c35f3e428553692af835d0d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 12 Dec 2025 17:59:19 +0900 Subject: [PATCH 026/194] [Tutorial] Add tutorial env setting scripts --- .github/workflows/docker-tutorial-image.yml | 2 +- tutorial/jupyterhub/Dockerfile | 7 +++++ .../jupyterhub/Dockerfile.ksc2025 | 8 ++++-- tutorial/jupyterhub/docker-compose.yml | 25 +++++++++++++++++ tutorial/jupyterhub/jupyterhub_config.py | 28 +++++++++++++++++++ tutorial/jupyterhub/setting.sh | 5 ++++ 6 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 tutorial/jupyterhub/Dockerfile rename Dockerfile.ksc2025 => tutorial/jupyterhub/Dockerfile.ksc2025 (96%) create mode 100644 tutorial/jupyterhub/docker-compose.yml create mode 100644 tutorial/jupyterhub/jupyterhub_config.py create mode 100755 tutorial/jupyterhub/setting.sh diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml index c7d3a2ca..c0d8267d 100644 --- a/.github/workflows/docker-tutorial-image.yml +++ b/.github/workflows/docker-tutorial-image.yml @@ -30,6 +30,6 @@ jobs: uses: docker/build-push-action@v4 with: context: .
- file: ./Dockerfile.ksc2025 + file: ./tutorial/jupyterhub/Dockerfile.ksc2025 push: true tags: ghcr.io/psal-postech/torchsim_ksc2025:latest diff --git a/tutorial/jupyterhub/Dockerfile b/tutorial/jupyterhub/Dockerfile new file mode 100644 index 00000000..f98b2294 --- /dev/null +++ b/tutorial/jupyterhub/Dockerfile @@ -0,0 +1,7 @@ +FROM jupyterhub/jupyterhub:latest + +RUN pip install --no-cache-dir \ + dockerspawner \ + jupyterhub-nativeauthenticator + +WORKDIR /srv/jupyterhub diff --git a/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 similarity index 96% rename from Dockerfile.ksc2025 rename to tutorial/jupyterhub/Dockerfile.ksc2025 index b70b2b77..5ff5d40d 100644 --- a/Dockerfile.ksc2025 +++ b/tutorial/jupyterhub/Dockerfile.ksc2025 @@ -40,8 +40,8 @@ ENV PATH=$PATH:/root/.local/bin ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH # Build Gem5 -RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim -RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial +RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) && git checkout TorchSim ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt # Build LLVM RISC-V @@ -87,4 +87,6 @@ RUN cd PyTorchSim/TOGSim && \ cd build && \ conan install .. --build=missing && \ cmake .. && \ - make -j$(nproc) \ No newline at end of file + make -j$(nproc) + +RUN pip install jupyterhub jupyterlab diff --git a/tutorial/jupyterhub/docker-compose.yml b/tutorial/jupyterhub/docker-compose.yml new file mode 100644 index 00000000..62c07ff1 --- /dev/null +++ b/tutorial/jupyterhub/docker-compose.yml @@ -0,0 +1,25 @@ +version: '3' + +services: + jupyterhub: + build: + context: . 
+ dockerfile: Dockerfile + container_name: jupyterhub + image: my-jupyterhub-image + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py + environment: + # DockerSpawner가 사용할 네트워크 이름 + DOCKER_NETWORK_NAME: jupyterhub-network + # Hub가 내부적으로 사용할 IP + HUB_IP: jupyterhub + ports: + - "8888:8000" + networks: + - jupyterhub-network + +networks: + jupyterhub-network: + external: true diff --git a/tutorial/jupyterhub/jupyterhub_config.py b/tutorial/jupyterhub/jupyterhub_config.py new file mode 100644 index 00000000..a43c0543 --- /dev/null +++ b/tutorial/jupyterhub/jupyterhub_config.py @@ -0,0 +1,28 @@ +import os + +c = get_config() + +# ------------------------------------------------------------------------------ +# Spawner config +# ------------------------------------------------------------------------------ +c.JupyterHub.spawner_class = 'dockerspawner.DockerSpawner' +c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim_ksc2025:latest" + +# Resource limit +c.DockerSpawner.mem_limit = '16G' +c.DockerSpawner.cpu_limit = 4.0 + +c.DockerSpawner.network_name = 'jupyterhub-network' +c.Spawner.default_url = '/lab' +c.Spawner.ip = '0.0.0.0' +c.DockerSpawner.remove = False +c.DockerSpawner.cmd = ["jupyterhub-singleuser", "--allow-root"] + +c.JupyterHub.authenticator_class = 'nativeauthenticator.NativeAuthenticator' +c.Authenticator.admin_users = {'admin'} + +c.JupyterHub.hub_ip = 'jupyterhub' +c.JupyterHub.hub_port = 8081 + +c.NativeAuthenticator.open_signup = True +c.NativeAuthenticator.allow_all = True diff --git a/tutorial/jupyterhub/setting.sh b/tutorial/jupyterhub/setting.sh new file mode 100755 index 00000000..3e544839 --- /dev/null +++ b/tutorial/jupyterhub/setting.sh @@ -0,0 +1,5 @@ +if [ -z "$(docker network ls | grep jupyterhub-network)" ]; then + docker network create jupyterhub-network +fi + +docker compose up -d --build \ No newline at end of file From 0a5d0e70dcd212880eab04b33b0c21b2e915fe15 Mon 
Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 15 Dec 2025 15:39:22 +0900 Subject: [PATCH 027/194] [Tutorial] Change format of config files to yml --- .../togsim_configs/togsim_config.json | 32 ------------------- .../session1/togsim_configs/togsim_config.yml | 30 +++++++++++++++++ .../togsim_configs/togsim_config_2_cores.json | 32 ------------------- .../togsim_configs/togsim_config_2_cores.yml | 30 +++++++++++++++++ .../togsim_config_autotune.json | 32 ------------------- .../togsim_configs/togsim_config_autotune.yml | 30 +++++++++++++++++ .../togsim_config_external_mapping.json | 32 ------------------- .../togsim_config_external_mapping.yml | 30 +++++++++++++++++ .../togsim_config_functional_only.json | 32 ------------------- .../togsim_config_functional_only.yml | 30 +++++++++++++++++ ...ogsim_config_no_compiler_optimization.json | 32 ------------------- ...togsim_config_no_compiler_optimization.yml | 30 +++++++++++++++++ .../togsim_config_timing_only.json | 32 ------------------- .../togsim_config_timing_only.yml | 30 +++++++++++++++++ 14 files changed, 210 insertions(+), 224 deletions(-) delete mode 100644 tutorial/session1/togsim_configs/togsim_config.json create mode 100644 tutorial/session1/togsim_configs/togsim_config.yml delete mode 100644 tutorial/session1/togsim_configs/togsim_config_2_cores.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_2_cores.yml delete mode 100644 tutorial/session1/togsim_configs/togsim_config_autotune.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_autotune.yml delete mode 100644 tutorial/session1/togsim_configs/togsim_config_external_mapping.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_external_mapping.yml delete mode 100644 tutorial/session1/togsim_configs/togsim_config_functional_only.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_functional_only.yml delete mode 100644 
tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml delete mode 100644 tutorial/session1/togsim_configs/togsim_config_timing_only.json create mode 100644 tutorial/session1/togsim_configs/togsim_config_timing_only.yml diff --git a/tutorial/session1/togsim_configs/togsim_config.json b/tutorial/session1/togsim_configs/togsim_config.json deleted file mode 100644 index e8e489d9..00000000 --- a/tutorial/session1/togsim_configs/togsim_config.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml new file mode 100644 index 00000000..72873f1c --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 
+dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.json b/tutorial/session1/togsim_configs/togsim_config_2_cores.json deleted file mode 100644 index c50edaa9..00000000 --- a/tutorial/session1/togsim_configs/togsim_config_2_cores.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 2, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 32, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 0, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml new file mode 100644 index 00000000..3b9b8fc8 --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml @@ -0,0 +1,30 @@ +num_cores: 2 
+core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 0 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.json b/tutorial/session1/togsim_configs/togsim_config_autotune.json deleted file mode 100644 index c9763e92..00000000 --- a/tutorial/session1/togsim_configs/togsim_config_autotune.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "autotune", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git 
a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml new file mode 100644 index 00000000..2726736a --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.json b/tutorial/session1/togsim_configs/togsim_config_external_mapping.json deleted file mode 100644 index c8ddb0f3..00000000 --- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 
1, - - "codegen_mapping_strategy" : "external-then-heuristic", - "codegen_external_mapping_file" : "/workspace/PyTorchSim/tutorial/session1/tutorial_external_mapping.json", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml new file mode 100644 index 00000000..468a0b44 --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: external-then-heuristic +codegen_external_mapping_file: /workspace/PyTorchSim/tutorial/session1/tutorial_external_mapping.json +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.json b/tutorial/session1/togsim_configs/togsim_config_functional_only.json deleted file mode 100644 index 53072307..00000000 --- a/tutorial/session1/togsim_configs/togsim_config_functional_only.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - 
"dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 1, - "pytorchsim_timing_mode" : 0, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml new file mode 100644 index 00000000..a1f1b432 --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 1 +pytorchsim_timing_mode: 0 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json deleted file mode 100644 index e2b9c8c8..00000000 --- 
a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 0, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "none" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml new file mode 100644 index 00000000..62d627a6 --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 0 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' 
+codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: none diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.json b/tutorial/session1/togsim_configs/togsim_config_timing_only.json deleted file mode 100644 index 0b846bbd..00000000 --- a/tutorial/session1/togsim_configs/togsim_config_timing_only.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "num_cores" : 1, - "core_freq_mhz" : 940, - "core_stats_print_period_cycles" : 10000, - "num_systolic_array_per_core" : 2, - - "vpu_num_lanes" : 128, - "vpu_spad_size_kb_per_lane" : 128, - "vpu_vector_length_bits" : 256, - - "dram_type" : "ramulator2", - "dram_freq_mhz" : 940, - "dram_channels": 16, - "dram_req_size_byte": 32, - "dram_num_burst_length" : 2, - "dram_stats_print_period_cycles": 10000, - "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", - - "icnt_type" : "simple", - "icnt_latency_cycles" : 10, - "icnt_freq_mhz" : 940, - "icnt_injection_ports_per_core" : 16, - - "pytorchsim_functional_mode" : 0, - "pytorchsim_timing_mode" : 1, - - "codegen_mapping_strategy" : "heuristic", - "codegen_external_mapping_file" : "", - "codegen_autotune_max_retry": 10, - "codegen_autotune_template_topk": 4, - "codegen_compiler_optimization" : "all" -} \ No newline at end of file diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml new file mode 100644 index 00000000..0024c073 --- /dev/null +++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: 
../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 0 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all From 008cf4c2fe92b03a2c76c325febc0171e6c0acc6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 15 Dec 2025 16:20:40 +0900 Subject: [PATCH 028/194] [Tutorial] Fix typo dockerfile --- tutorial/jupyterhub/Dockerfile.ksc2025 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 index 5ff5d40d..9eaec15a 100644 --- a/tutorial/jupyterhub/Dockerfile.ksc2025 +++ b/tutorial/jupyterhub/Dockerfile.ksc2025 @@ -79,7 +79,7 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ # Install torchsim dependency RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 -# Prepare ONNXim project +# Prepare PyTorchSim project RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial RUN cd PyTorchSim/TOGSim && \ git submodule update --recursive --init && \ From 18d7babf7cf7c4ed15fd68f418bf5bf8d31d233a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 15 Dec 2025 21:54:43 +0900 Subject: [PATCH 029/194] [Tutorial] Fix wrong config name --- PyTorchSimFrontend/extension_config.py | 2 +- tutorial/session1/LogAnalysis.ipynb | 2 +- tutorial/session2/Hands_on.ipynb | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index ab8aea69..2b1b3102 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -103,7 +103,7 @@ def __getattr__(name): if name == "CONFIG_TORCHSIM_DUMP_PATH": return 
os.environ.get('TORCHSIM_DUMP_PATH', default = CONFIG_TORCHSIM_DIR) if name == "CONFIG_TORCHSIM_LOG_PATH": - return os.environ.get('TORCHSIM_DUMP_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) + return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) if name == "CONFIG_TOGSIM_EAGER_MODE": return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb index d3207af1..a82737db 100644 --- a/tutorial/session1/LogAnalysis.ipynb +++ b/tutorial/session1/LogAnalysis.ipynb @@ -19,7 +19,7 @@ "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", "sys.path.append(base_dir)\n", "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", - "os.environ['TORCHSIM_DUMP_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")" + "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")" ] }, { diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb index 33ec1a28..2d5a5cdc 100644 --- a/tutorial/session2/Hands_on.ipynb +++ b/tutorial/session2/Hands_on.ipynb @@ -32,6 +32,7 @@ "import torch._dynamo\n", "import torch.utils.cpp_extension\n", "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", + "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n", "sys.path.append(base_dir)\n", "\n", "from Scheduler.scheduler import PyTorchSimRunner\n", From 1e4d72a0fea00c80ef681b9da6bec783f4b2bd93 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 17 Dec 2025 22:01:36 +0900 Subject: [PATCH 030/194] [Fix] configuration reference in DNNServing.ipynb --- tutorial/session1/DNNServing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb index b38bfe6a..56ad5ab6 100644 --- 
a/tutorial/session1/DNNServing.ipynb +++ b/tutorial/session1/DNNServing.ipynb @@ -38,7 +38,7 @@ "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", "from PyTorchSimFrontend import extension_config\n", "\n", - "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n", + "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n", "device = scheduler.execution_engine.module.custom_device()\n", "\n", "model = resnet18().eval()\n", From 232c4a69053f0082e254e536cfcd04dddbc9c2c0 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 17 Dec 2025 22:02:27 +0900 Subject: [PATCH 031/194] Change log level from warn to debug for unused tags --- TOGSim/include/DMA.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TOGSim/include/DMA.h b/TOGSim/include/DMA.h index 2f41c6f3..3056c626 100644 --- a/TOGSim/include/DMA.h +++ b/TOGSim/include/DMA.h @@ -62,7 +62,7 @@ class DMA { const std::vector& tag_key = tag_entry.first; uint32_t value = tag_entry.second; if (value == 1) { - spdlog::warn("[Tag Table][{}] Unused tag found: (key={}, val={})", + spdlog::debug("[Tag Table][{}] Unused tag found: (key={}, val={})", subgraph_id, fmt::format("[{}]", fmt::join(tag_key, ", ")), value); } } @@ -134,4 +134,4 @@ class DMA { std::queue _pending_accesses; bool _generated_once = false; }; -#endif \ No newline at end of file +#endif From 8b0f5354bcfce195d7a48498d0dde824be278a94 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 17 Dec 2025 22:08:26 +0900 Subject: [PATCH 032/194] Add placeholder echo command in Dockerfile Added a placeholder echo command for future removal. 
--- tutorial/jupyterhub/Dockerfile.ksc2025 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 index 9eaec15a..4993538a 100644 --- a/tutorial/jupyterhub/Dockerfile.ksc2025 +++ b/tutorial/jupyterhub/Dockerfile.ksc2025 @@ -80,6 +80,8 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 # Prepare PyTorchSim project +RUN echo "Remove me!" + RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial RUN cd PyTorchSim/TOGSim && \ git submodule update --recursive --init && \ From 602131571983a5b752bcea4bd929043aca556023 Mon Sep 17 00:00:00 2001 From: Yunseon Date: Wed, 17 Dec 2025 22:53:21 +0900 Subject: [PATCH 033/194] [Frontend] prevent reload device --- Scheduler/scheduler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 98ebb1d5..34f0eda4 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -163,6 +163,8 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: @staticmethod def setup_device(): + if cls._npu_module is not None: + return cls._npu_module source_file_path = os.path.dirname(os.path.abspath(__file__)) source_file = os.path.join( source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp" @@ -201,6 +203,7 @@ def setup_device(): get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) + cls._npu_module = module return module def submit(self, batched_req, partition_idx) -> List[RequestReturn]: From 7c5dcccd539b9174da3ff8e1751117fa72910fd6 Mon Sep 17 00:00:00 2001 From: Yunseon Date: Wed, 17 Dec 2025 22:59:39 +0900 Subject: [PATCH 034/194] [fix] setup_device to class method --- Scheduler/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Scheduler/scheduler.py 
b/Scheduler/scheduler.py index 34f0eda4..94092723 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -144,6 +144,7 @@ class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 + _npu_module = None def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() self.num_partion = num_partion @@ -161,7 +162,7 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: # Dry run for compile and create generator os.environ["TOGSIM_EAGER_MODE"] = "1" - @staticmethod + @classmethod def setup_device(): if cls._npu_module is not None: return cls._npu_module From 88d9eb8f74f03566034e2de95323f6662b3cd183 Mon Sep 17 00:00:00 2001 From: Yunseon Date: Wed, 17 Dec 2025 23:13:59 +0900 Subject: [PATCH 035/194] [Fix] typo in TOGSIM_CONFIG --- tutorial/session1/DNNServing.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb index 56ad5ab6..741f463f 100644 --- a/tutorial/session1/DNNServing.ipynb +++ b/tutorial/session1/DNNServing.ipynb @@ -83,7 +83,7 @@ "target_model1 = resnet18().eval()\n", "\n", "# Init scheduler\n", - "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n", + "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n", "# Register compiled model\n", "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", From d1ffac21708a30d40ff378e315c00c44d840fdc5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 17 Dec 2025 23:15:10 +0900 Subject: [PATCH 036/194] Remove echo command from Dockerfile Removed unnecessary echo command from Dockerfile. 
--- tutorial/jupyterhub/Dockerfile.ksc2025 | 2 -- 1 file changed, 2 deletions(-) diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 index 4993538a..9eaec15a 100644 --- a/tutorial/jupyterhub/Dockerfile.ksc2025 +++ b/tutorial/jupyterhub/Dockerfile.ksc2025 @@ -80,8 +80,6 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 # Prepare PyTorchSim project -RUN echo "Remove me!" - RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial RUN cd PyTorchSim/TOGSim && \ git submodule update --recursive --init && \ From 7c45f8015e1a83b2abc177c25e3b2d66ff2ac0a7 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 18 Dec 2025 00:21:44 +0900 Subject: [PATCH 037/194] Refactor NPU module variable naming convention --- Scheduler/scheduler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 94092723..8aa849b1 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -144,7 +144,7 @@ class PyTorchSimRunner: PARTITION_BUSY = 0 PARTITION_IDLE = 1 SELECT_NOTHING = 2 - _npu_module = None + NPU_MODULE = None def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: self.module = self.setup_device() self.num_partion = num_partion @@ -163,9 +163,9 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: os.environ["TOGSIM_EAGER_MODE"] = "1" @classmethod - def setup_device(): - if cls._npu_module is not None: - return cls._npu_module + def setup_device(cls): + if cls.NPU_MODULE is not None: + return cls.NPU_MODULE source_file_path = os.path.dirname(os.path.abspath(__file__)) source_file = os.path.join( source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp" @@ -204,7 +204,7 @@ def setup_device(): get_wrapper_codegen_for_device("npu") == 
ExtensionWrapperCodegen ) - cls._npu_module = module + cls.NPU_MODULE = module return module def submit(self, batched_req, partition_idx) -> List[RequestReturn]: From af48bc382958846e064d9ad4d7cdd21daeedcac4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 5 Jan 2026 11:25:22 +0000 Subject: [PATCH 038/194] [Fix] Indirect store & add a test case --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++----- tests/test_indirect_access.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 266d884b..297ea162 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -438,12 +438,12 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs) # Handle scatter store if "tmp" in str(index): - if mode == "atomic_add": - # Convert the output buffer type to the inplace buffer - arg_name = V.graph.scheduler.mutation_real_name.get(name, name) - if arg_name not in self.kernel_group.args.inplace_buffers: - self.kernel_group.args.make_inplace(arg_name, arg_name) + # Convert the output buffer type to the inplace buffer + arg_name = V.graph.scheduler.mutation_real_name.get(name, name) + if arg_name not in self.kernel_group.args.inplace_buffers: + self.kernel_group.args.make_inplace(arg_name, arg_name) + if mode == "atomic_add": loaded_value = ops.load(name, index) value = ops.add(loaded_value, value) index, _ = self.convert_indirect_indexing(index) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index 6cfa7b58..d103ee1b 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -70,11 +70,12 @@ def vectoradd(a, idx, b): a[idx, :] = b return a x = torch.randn(size, dtype=torch.float32).to(device=device) + x_cpu = x.clone().cpu() idx = torch.randint(0,128, [128]).to(device=device) - y = torch.randn(128, 
dtype=torch.float32).to(device=device) + y = torch.randn(size[1], dtype=torch.float32).to(device=device) opt_fn = torch.compile(dynamic=False)(vectoradd) res = opt_fn(x, idx, y) - out = vectoradd(x.cpu(), idx.cpu(), y.cpu()) + out = vectoradd(x_cpu, idx.cpu(), y.cpu()) test_result("Indirect VectorAdd", res, out) if __name__ == "__main__": @@ -86,6 +87,7 @@ def vectoradd(a, idx, b): module = PyTorchSimRunner.setup_device() device = module.custom_device() test_scatter_full(device) + test_scatter_full(device, size=(2048, 2048)) test_scatter_add(device) test_indirect_vectoradd(device) #test_embedding(device, 1024, 2048) \ No newline at end of file From 6d043ad4675a4cee2242b1f0f7226f9e47926bf4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 5 Jan 2026 14:09:42 +0000 Subject: [PATCH 039/194] [Fix] relax vlane_stride constraints to resolve tile size conflicts #201 --- PyTorchSimFrontend/mlir/mlir_common.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 15408c0d..b86607ea 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -332,8 +332,8 @@ def _adjust_one(dim_size, tile_size): remain = candidate_tile_size[axis] % stride if remain: - candidate_tile_size[axis] += stride - remain - self.tile_constraint[axis].must_divide_dim = False + # #201: relax vlane_stride constraints + self.vmap.vlane_stride = 1 return candidate_tile_size def scale_tile_dim(self, axis, dim_sz, scale_factor=2): @@ -488,7 +488,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N self.name = "" self._tile_size = list(tile_size) self._tile_stride = None - self.tile_constraint = [TileConstraint(vlane_stride) for _ in tile_size] + self.tile_constraint = [TileConstraint(vlane_stride if idx == vlane_split_axis else 1) for idx, _ in enumerate(tile_size)] self.tile_axis_order = 
list(range(len(tile_size))) self.update_tile_stride() @@ -718,13 +718,13 @@ def compute_tile_size(self, nodes, vars, reduction_vars): init_tile_desc.nr_rdim = len(reduction_vars) self.kernel_group.set_tile_info(init_tile_desc) - # Handle edge case - if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2 - self.kernel_group.tile_desc.vmap.vlane_stride = 1 - self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 - elif vlane_split_axis == -1: # Reduction only case - self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 - self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0] + # Handle edge case + if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2 + self.kernel_group.tile_desc.vmap.vlane_stride = 1 + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + elif vlane_split_axis == -1: # Reduction only case + self.kernel_group.tile_desc.vmap.vlane_split_axis = 0 + self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0] # Handle implict dims. Input operand could be high dimension tensor. 
# Note: https://github.com/PSAL-POSTECH/PyTorchSim/issues/173 From f6ada1f5b1fe4f44e0162c03dfbe12c21633734c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 5 Jan 2026 14:10:35 +0000 Subject: [PATCH 040/194] [Refactor] Remove unused env vars --- Dockerfile.base | 2 -- tutorial/jupyterhub/Dockerfile.ksc2025 | 2 -- 2 files changed, 4 deletions(-) diff --git a/Dockerfile.base b/Dockerfile.base index 6a21760b..f961859e 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -67,9 +67,7 @@ RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/P # Store RISC-V LLVM for TorchSim ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin -ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include ENV TORCHSIM_DIR=/workspace/PyTorchSim -ENV LLVM_DIR=/riscv-llvm # Download Spike simulator RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/assets/${SPIKE_ASSET_ID} -o /tmp/spike-release.tar.gz && \ diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 index 9eaec15a..7633c048 100644 --- a/tutorial/jupyterhub/Dockerfile.ksc2025 +++ b/tutorial/jupyterhub/Dockerfile.ksc2025 @@ -52,9 +52,7 @@ RUN cd llvm-project && mkdir build && cd build && \ # Store RISC-V LLVM for TorchSim ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin -ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include ENV TORCHSIM_DIR=/workspace/PyTorchSim -ENV LLVM_DIR=/riscv-llvm # Download RISC-V tool chain RUN apt install -y wget && \ From 3ccfc113940def78366d773c4fe19a3d8bfe7232 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 6 Jan 2026 08:03:38 +0000 Subject: [PATCH 041/194] [CI] Add CI for pytorch2.8 --- .github/workflows/docker-base-image-2-8.yml | 71 +++++++++++++++++++++ .github/workflows/docker-base-image.yml | 10 ++- .github/workflows/docker-image-2-8.yml | 61 ++++++++++++++++++ Dockerfile | 3 +- Dockerfile.base | 3 +- 5 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 
.github/workflows/docker-base-image-2-8.yml create mode 100644 .github/workflows/docker-image-2-8.yml diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml new file mode 100644 index 00000000..f8649303 --- /dev/null +++ b/.github/workflows/docker-base-image-2-8.yml @@ -0,0 +1,71 @@ +name: Docker Base Image CI (PyTorch 2.8) + +on: + push: + branches: [ "base" ] + workflow_dispatch: + repository_dispatch: + types: [ build_base ] + +jobs: + build: + runs-on: ubuntu-latest + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set environment + env: + GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if [ -n "${{ github.event.pull_request.head.sha }}" ]; then + echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" + else + echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV + echo "GITHUB_SHA=${{ github.sha }}" + fi + + gem5_response_file=/tmp/releases-gem5-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} + GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file}) + echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" + echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV + + llvm_response_file=/tmp/releases-gem5-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} + LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file}) + echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" + echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV + + spike_response_file=/tmp/releases-spike-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file} + 
SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file}) + echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" + echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV + + - name: Build and Push Docker Image (PyTorch 2.8) + uses: docker/build-push-action@v4 + with: + context: . + file: ./Dockerfile.base + push: true + build-args: | + PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime + GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} + LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} + SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} + tags: | + ghcr.io/psal-postech/torchsim_base_2_8:latest diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml index bb79925c..2c29a11b 100644 --- a/.github/workflows/docker-base-image.yml +++ b/.github/workflows/docker-base-image.yml @@ -32,9 +32,13 @@ jobs: env: GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" + if [ -n "${{ github.event.pull_request.head.sha }}" ]; then + echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" + else + echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV + echo "GITHUB_SHA=${{ github.sha }}" + fi gem5_response_file=/tmp/releases-gem5-latest.json curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml new file mode 100644 index 00000000..cb5f73d1 --- /dev/null +++ b/.github/workflows/docker-image-2-8.yml @@ -0,0 +1,61 @@ +name: Docker image CI (PyTorch 2.8) + +on: + pull_request: + branches: [ "torch_v2.8" ] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: self-hosted + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Code + uses: 
actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + submodules: recursive + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and Push Docker Image (PyTorch 2.8) + uses: docker/build-push-action@v6 + with: + context: . + file: ./Dockerfile + push: true + no-cache: true + build-args: | + BASE_IMAGE=ghcr.io/psal-postech/torchsim_base_2_8:latest + tags: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} + + - name: Wait for GHCR propagation + run: | + for i in {1..30}; do + echo "Checking if image exists in GHCR (attempt $i)..." + if docker manifest inspect ghcr.io/psal-postech/torchsim-test-2-8:${GITHUB_SHA} > /dev/null 2>&1; then + echo "Image is now available in GHCR." + exit 0 + fi + echo "Image not yet available, retrying in 30 seconds..." + sleep 20 + done + echo "Image did not become available in GHCR within expected time." + exit 1 + + test-pytorchsim-wrapper: + needs: build-and-test + uses: ./.github/workflows/pytorchsim_test.yml + with: + image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} + vector_lane: 128 + spad_size: 128 diff --git a/Dockerfile b/Dockerfile index 37721940..088daa43 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # syntax=docker/dockerfile:1.4 -FROM ghcr.io/psal-postech/torchsim_base:latest +ARG BASE_IMAGE=ghcr.io/psal-postech/torchsim_base:latest +FROM ${BASE_IMAGE} # Prepare PyTorchSim project COPY . /workspace/PyTorchSim diff --git a/Dockerfile.base b/Dockerfile.base index f961859e..897b8195 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -23,7 +23,8 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime +ARG PYTORCH_IMAGE=pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime +FROM ${PYTORCH_IMAGE} # Copied from Gem5 Docker file ENV DEBIAN_FRONTEND=noninteractive From 0abfffefcef1cf09ba54be79ba3bc01a881c3d87 Mon Sep 17 00:00:00 2001 From: OkkyunWoo Date: Wed, 24 Sep 2025 12:59:04 +0000 Subject: [PATCH 042/194] PyTorch version upgrade: tested on single-operator tests --- PyTorchSimFrontend/extension_codecache.py | 3 +- PyTorchSimFrontend/extension_device.cpp | 6 +- .../extension_device_interface.py | 63 +++++++++++++++++++ .../extension_device_op_overrides.py | 25 ++++++++ PyTorchSimFrontend/extension_utils.py | 26 ++++++++ PyTorchSimFrontend/mlir/mlir_autotune.py | 8 ++- .../mlir/mlir_codegen_backend.py | 63 ++++++++++++++++--- PyTorchSimFrontend/mlir/mlir_common.py | 35 ++++++----- PyTorchSimFrontend/mlir/mlir_scheduling.py | 44 ++++++------- PyTorchSimFrontend/mlir/mlir_template.py | 21 +++---- Scheduler/scheduler.py | 17 +++-- 11 files changed, 243 insertions(+), 68 deletions(-) create mode 100644 PyTorchSimFrontend/extension_device_interface.py create mode 100644 PyTorchSimFrontend/extension_device_op_overrides.py create mode 100644 PyTorchSimFrontend/extension_utils.py diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 2e35220c..ef8c63e6 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -3,7 +3,8 @@ import shlex import subprocess -from torch._inductor.codecache import AsyncCompile, get_lock_dir, get_hash, write +from torch._inductor.codecache import get_lock_dir, get_hash, write +from torch._inductor.async_compile import AsyncCompile from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp index 
cfaecf2b..b8a6e092 100644 --- a/PyTorchSimFrontend/extension_device.cpp +++ b/PyTorchSimFrontend/extension_device.cpp @@ -159,7 +159,7 @@ at::Tensor custom_to_device( // A dummy allocator for our custom device, that secretly uses the CPU struct DummyCustomAllocator final : at::Allocator { DummyCustomAllocator() = default; - at::DataPtr allocate(size_t nbytes) const override { + at::DataPtr allocate(size_t nbytes) override { void* data = c10::alloc_cpu(nbytes); return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)}; } @@ -174,6 +174,10 @@ struct DummyCustomAllocator final : at::Allocator { at::DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } + + void copy_data(void* dest, const void* src, std::size_t count) const override { + std::memcpy(dest, src, count); + } }; // Register our dummy allocator diff --git a/PyTorchSimFrontend/extension_device_interface.py b/PyTorchSimFrontend/extension_device_interface.py new file mode 100644 index 00000000..e5875ab7 --- /dev/null +++ b/PyTorchSimFrontend/extension_device_interface.py @@ -0,0 +1,63 @@ +import torch +from torch._dynamo.device_interface import DeviceInterface, caching_worker_current_devices, caching_worker_device_properties + +class _ExtensionDeviceProperties: # FIXME: Dummy property values + name: str = "Extension_device" + platform_name: str + vendor: str + driver_version: str + version: str + max_compute_units: int + gpu_eu_count: int + max_work_group_size: int + max_num_sub_groups: int + sub_group_sizes: list[int] + has_fp16: bool + has_fp64: bool + has_atomic64: bool + has_bfloat16_conversions: bool + has_subgroup_matrix_multiply_accumulate: bool + has_subgroup_matrix_multiply_accumulate_tensor_float32: bool + has_subgroup_2d_block_io: bool + total_memory: int + multi_processor_count: int = 128 # gpu_subslice_count, num_sm + architecture: int + type: str + +_ExtensionDeviceProperties = _ExtensionDeviceProperties + +class ExtensionDeviceInterface(DeviceInterface): + 
class Worker: + @staticmethod + def set_device(device: int): + caching_worker_current_devices["extension_device"] = device + + @staticmethod + def current_device() -> int: + if "extension_device" in caching_worker_current_devices: + return caching_worker_current_devices["extension_device"] + return torch.xpu.current_device() + + @staticmethod + def get_device_properties(device: torch.types.Device = None) -> _ExtensionDeviceProperties: + if device is not None: + if isinstance(device, str): + device = torch.device(device) + assert device.type == "extension_device" + if isinstance(device, torch.device): + device = device.index + if device is None: + device = ExtensionDeviceInterface.Worker.current_device() + + if "extension_device" not in caching_worker_device_properties: + device_prop = [ + torch.cuda.get_device_properties(i) + for i in range(torch.cuda.device_count()) + ] + caching_worker_device_properties["extension_device"] = device_prop + + return _ExtensionDeviceProperties + + @staticmethod + def get_compute_capability(device: torch.types.Device = None): + return 36 \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_device_op_overrides.py b/PyTorchSimFrontend/extension_device_op_overrides.py new file mode 100644 index 00000000..b76dae0f --- /dev/null +++ b/PyTorchSimFrontend/extension_device_op_overrides.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from textwrap import dedent + +from torch._inductor.codegen.common import DeviceOpOverrides, register_device_op_overrides + +class ExtensionDeviceOpOverrides(DeviceOpOverrides): + def import_get_raw_stream_as(self, name: str) -> str: + return dedent( + """ + def get_raw_stream(_): + return 0 + """ + ) + + def set_device(self, device_idx: int) -> str: + return "pass" + + def synchronize(self) -> str: + return "pass" + + def device_guard(self, device_idx: int) -> str: + return "pass" + +register_device_op_overrides("extension_device", ExtensionDeviceOpOverrides()) \ No newline at end of 
file diff --git a/PyTorchSimFrontend/extension_utils.py b/PyTorchSimFrontend/extension_utils.py new file mode 100644 index 00000000..0418cacd --- /dev/null +++ b/PyTorchSimFrontend/extension_utils.py @@ -0,0 +1,26 @@ +import sympy +import torch + +""" +NOTE: Temporary File + +This file contains functions that were removed or changed in newer versions +of PyTorch. It is kept here only to temporarily enable compatibility while +upgrading to PyTorch 2.8 from PyTorch 2.2. + +These functions will eventually be integrated into the appropriate source files +or removed once no longer needed. + +This file is not intended to be permanent and should be deleted in the future. +""" + +def free_symbol_startswith(index: sympy.Expr, prefix: str): + return any(v.name.startswith(prefix) for v in index.free_symbols) + +def sympy_symbol(name: str) -> sympy.Symbol: + # This should never be used for creating shape/stride symbols, as those + # should all be allocated before Inductor. + assert name[0] != "s" + # NOTE: shape symbols are positive (> 0), but index variables are only + # non-negative (>= 0). 
+ return sympy.Symbol(name, integer=True, nonnegative=True) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 988408ea..138bec50 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -49,6 +49,9 @@ def __init__( self.extra_args = extra_args #self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so") + def __str__(self) -> str: + return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" + def make_run_fn( self, input_tensors: torch.Tensor, output_tensors: torch.Tensor ) -> Callable[[], None]: @@ -84,5 +87,6 @@ def cached_run_fn(*args, **kwargs): *args, ) - def __str__(self) -> str: - return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" \ No newline at end of file + def update_workspace_size(self) -> None: + # FIXME: Not implemented yet. Checkout torch/_inductor/codegen/rocm/rocm_benchmark_request.py + return \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 297ea162..9f5c0674 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -6,12 +6,14 @@ from functools import reduce from operator import mul import torch +from typing import Optional from collections import defaultdict from concurrent.futures import ThreadPoolExecutor from torch._dynamo.testing import rand_strided from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed from torch._inductor.codegen import cpp, wrapper, common, memory_planning +from torch._inductor.ir import GraphPartitionSignature from torch._inductor.virtualized import V, _ops as ops from torch._inductor.codecache import write_atomic from torch._inductor.utils import ( @@ -57,10 +59,25 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value): return 
ops.logical_and(vector_value, init_value) raise AssertionError(reduction_type) -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): +class ExtensionWrapperCodegen(wrapper.PythonWrapperCodegen): def __init__(self): super().__init__() + @classmethod + def create( + cls, + is_subgraph: bool, + subgraph_name: Optional[str], + parent_wrapper: Optional[wrapper.PythonWrapperCodegen], + partition_signatures: Optional[GraphPartitionSignature] = None, + ): + if is_subgraph: + assert subgraph_name is not None and parent_wrapper is not None + return wrapper.SubgraphPythonWrapperCodegen( + subgraph_name, parent_wrapper, partition_signatures + ) + return cls() + def write_header(self): self.header.splice( f""" @@ -89,6 +106,7 @@ def write_header(self): reinterpret_tensor = torch.ops.aten._reinterpret_tensor custom_async_compile = CustomAsyncCompile() os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__ + print(f\'Wrapper Codegen Path = {{__file__}}\') """ ) self.header.splice( @@ -132,7 +150,7 @@ def call(args): self.prefix.writeline(f"{lhs} = args") self.prefix.writeline("args.clear()") - self.codegen_inputs(self.prefix, V.graph.graph_inputs) + self.codegen_inputs() self.codegen_input_size_asserts() self.codegen_sram_plan_prefix() @@ -152,10 +170,27 @@ def codegen_sram_plan_postfix(self, outputs): continue self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})") - @dynamo_timed + def _generate_kernel_call_helper( + self, + kernel_name: str, + call_args, + *, + device=None, + triton=True, + arg_types=None, + raw_keys=None, + raw_args=None, + triton_meta=None, + graph_name="", + original_fxnode_name=None, + ): + device = device or V.graph.get_current_device_or_throw() + self.writeline(self.wrap_kernel_call(kernel_name, call_args)) + return + def generate(self, is_inference): result = IndentedBuffer() - result.splice(self.header) + # result.splice(self.header) with contextlib.ExitStack() as stack: stack.enter_context(self.wrapper_call.indent()) @@ -170,8 +205,13 @@ 
def generate(self, is_inference): if isinstance(line, wrapper.MemoryPlanningLine): line.codegen(self.wrapper_call) + elif isinstance(line, wrapper.KernelCallLine): + self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args)) else: - self.wrapper_call.writeline(line) + if isinstance(line, wrapper.WrapperLine): + line.codegen(self.wrapper_call) + else: + self.wrapper_call.writeline(line) # Add buffer plan hook for alloc if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine): self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})") @@ -180,7 +220,9 @@ def generate(self, is_inference): self.mark_output_type() self.generate_return(output_refs) - self.append_precomputed_sizes_to_prefix() + # self.append_precomputed_sizes_to_prefix() # FIXME: Need to replace append_precomputed_sizes_to_prefix() + result.splice(self.header) + self.finalize_prefix() result.splice(self.prefix) @@ -189,7 +231,10 @@ def generate(self, is_inference): self.generate_end(result) self.add_benchmark_harness(result) - return result.getvaluewithlinemap() + return ( + result.getvaluewithlinemap(), + self.kernel_declarations.getvaluewithlinemap(), + ) def memory_plan(self): self.lines = memory_planning.MemoryPlanner(self).plan(self.lines) @@ -964,13 +1009,13 @@ def _log_autotune_result(self, best_choice, best_cycle): ) def codegen_nodes(self, nodes, kernel_name): - src_code = super().codegen_nodes(nodes, kernel_name) + src_code, meta_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode: optimal_src_code = self.autotune(nodes, kernel_name)[0] if optimal_src_code is not None: return optimal_src_code - return src_code + return src_code, meta_code def _prepare_simulator_headers(self, src_code): write_path = extension_codecache.get_write_path(src_code) diff 
--git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index b86607ea..f98a2132 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -14,6 +14,7 @@ from torch._inductor.virtualized import V from torch._inductor.ir import MultiOutputLayout from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep +from torch._inductor.codegen.wrapper import KernelDefinitionLine from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod import sympy import contextlib @@ -25,15 +26,20 @@ import torch.fx from torch.utils._sympy.value_ranges import ValueRanges from torch._inductor.utils import ( - free_symbol_startswith, get_sympy_Expr_dtype, IndentedBuffer, sympy_subs, - sympy_symbol, unique, ) from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache +from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest + +from PyTorchSimFrontend.extension_utils import ( + free_symbol_startswith, + sympy_symbol +) + schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") DTYPE_TO_MLIR = { @@ -654,7 +660,7 @@ def call_kernel(self, kernel_name): wrapper = V.graph.wrapper_code _, call_args, _, _ = self.kernel_group.args.mlir_argdefs() # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) + wrapper.generate_kernel_call(kernel_name, call_args, triton=False) def is_modular_indexing(self, expr): return "ModularIndexing" in str(expr) @@ -778,8 +784,8 @@ def codegen_nodes(self, nodes, kernel_name): V.graph.removed_buffers |= self.removed_buffers # V.graph.inplaced_to_remove |= self.inplaced_to_remove src_code = self.codegen_kernel(kernel_name=kernel_name) - self.meta_kernel() - return src_code + meta_code = self.meta_kernel() + return src_code, meta_code def codegen_kernel(self, kernel_name): arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs() @@ -797,12 +803,9 @@ def 
codegen_kernel(self, kernel_name): return code.getvalue() def meta_kernel(self): - wrapper = V.graph.wrapper_code _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() - wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')') - # Dump loop and load/store information - wrapper.add_import_once(f"arg_attributes = {arg_attributes}") - return arg_attributes + meta_code = arg_attributes + return meta_code def get_constant_vector(self, expr): constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars] @@ -903,10 +906,10 @@ def load(name: str, index: sympy.Expr): if name in store_cache: return store_cache[name] key = name+str(index) - if key not in self.cse.cache: + if key not in self.cse._cache: result = self.load(name, index) - self.cse.cache[key] = result - return self.cse.cache[key] + self.cse._cache[key] = result + return self.cse._cache[key] @staticmethod def store(name, index, value, mode=None): @@ -914,7 +917,7 @@ def store(name, index, value, mode=None): if mode is None: self.cse.store_cache[name] = value if self.current_node: - for other_name in self.current_node.get_mutations(): + for other_name in self.current_node.get_output(name).get_mutations(): self.cse.store_cache[other_name] = value if name not in V.graph.removed_buffers: return self.store(name, index, value, mode=mode) @@ -924,7 +927,7 @@ def store_reduction(name, index, value): self.store_buffer_names.add(name) self.cse.store_cache[name] = value if self.current_node: - for other_name in self.current_node.get_mutations(): + for other_name in self.current_node.get_output(name).get_mutations(): self.cse.store_cache[other_name] = value if name not in V.graph.removed_buffers: @@ -970,7 +973,7 @@ def bucketize( super().__enter__() assert self.overrides - parent_handler = self.overrides(V.get_ops_handler()) + parent_handler = self.overrides() self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) self.exit_stack.enter_context(V.set_kernel_handler(self)) return 
self diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 23be941c..66155e9c 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -22,8 +22,6 @@ class MLIRScheduling(BaseScheduling): target_kernel = MLIRKernel def __init__(self, scheduler): self.scheduler = scheduler - self.scheduler.can_fuse_origin = self.scheduler.can_fuse - self.scheduler.can_fuse = self.can_fuse_with_exceptions #self.scheduler.enter_context = self.enter_context_fixed # FIXME. Monkey patch: For fixing the inductor bug self.kernel_group = mlir_common.MLIRWrapperKenrelGroup() self._ready_to_flush = False @@ -90,6 +88,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule def _set_flush_status(self, status: bool): self._ready_to_flush = status + def reset_kernel_group(self): + self.kernel_group = mlir_common.MLIRWrapperKenrelGroup() + def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) @@ -103,7 +104,7 @@ def can_fuse_horizontal(self, node1, node2): # Reduction is currently not supported if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION: - return vars1 == vars2 and reduce1 == reduce2 and node1.inverse_users == node2.inverse_users + return vars1 == vars2 and reduce1 == reduce2 # and node1.inverse_users == node2.inverse_users if node1.is_reduction() or node2.is_reduction(): return False @@ -180,7 +181,8 @@ def revert_group(self, act_nodes, args=None, var_ranges=None): def group_fn(self, sizes): return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - def codegen_nodes(self, nodes): + def codegen_node(self, _node): + nodes = _node.get_nodes() _, (group, reduction_group) = max( nodes, key=lambda x: int(x.is_reduction()) ).group @@ -210,8 +212,8 @@ def codegen_nodes(self, nodes): kernel_name_candidate = 
f"extension_kernel_{MLIRScheduling.count}" MLIRScheduling.count += 1 - src_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate) - kernel_name = self.define_kernel(src_code, kernel_name_candidate, ex_kernel.vector_lane, + src_code, meta_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate) + kernel_name = self.define_kernel(src_code, meta_code, kernel_name_candidate, ex_kernel.vector_lane, ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins}) ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() @@ -230,26 +232,30 @@ def codegen_sync(self): pass def flush(self): - self.kernel_group.codegen_define_and_call(V.graph.wrapper_code) - self.kernel_group = mlir_common.MLIRWrapperKenrelGroup() + src_code = self.kernel_group.codegen_group() + if src_code: + kernel_name = self.define_kernel( + src_code, self.kernel_group.scheduled_nodes + ) + self.kernel_group.call_kernel(V.graph.wrapper_code, kernel_name) + self.reset_kernel_group() self._set_flush_status(False) def define_function(self, kernel): partial_code, function_name = kernel.def_function() if partial_code is not None and function_name not in self.outer_function: with V.set_kernel_handler(kernel): - code = partial_code.finalize() + code = partial_code.finalize_all() wrapper = V.graph.wrapper_code wrapper.header.writeline(code) self.outer_function.add(function_name) - def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}): + def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}): wrapper = V.graph.wrapper_code if src_code in wrapper.src_to_kernel: kernel_name = wrapper.src_to_kernel[src_code] else: wrapper.src_to_kernel[src_code] = kernel_name - codecache_def = IndentedBuffer() codecache_def.writeline(f"custom_async_compile.mlir('''{src_code}''', ") codecache_def.writeline(f"vectorlane_size={vector_lane},") @@ -261,26 +267,16 @@ def define_kernel(self, src_code, 
kernel_name, vector_lane, spad_info, loop_size wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) return kernel_name - def codegen_template(self, template_node, epilogue_nodes): - # Handle prologue pattern - prologue_nodes = [] - if not template_node.is_template(): - epilogue_nodes = [template_node] + epilogue_nodes - for i, node in enumerate(epilogue_nodes): - if node.is_template(): - template_node = node - prologue_nodes = epilogue_nodes[:i] - epilogue_nodes = epilogue_nodes[i+1:] - break - + def codegen_template(self, template_node, prologue_nodes, epilogue_nodes): # Generate template code template_buffer = template_node.node kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + meta_code = kernel.meta_kernel() with V.set_kernel_handler(kernel): - kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, + kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, kernel.loop_size, origins={str(i) for i in template_node.node.origins}) self.define_function(kernel) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index a36bc907..4cfe71bf 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -13,8 +13,8 @@ from typing import List, Optional from unittest.mock import patch -from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer +from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine +from torch._inductor.ir import Buffer, IRNode, 
TemplateBuffer, ChoiceCaller from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import TensorMeta @@ -394,18 +394,14 @@ def meta_kernel(self): for idx in range(len(arg_attributes)): if arg_attributes[idx][0] == name: arg_attributes[idx][1] = attr - wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')') - # Dump loop and load/store information - wrapper.add_import_once(f"loop_info = {self.loop_info}") - wrapper.add_import_once(f"arg_attributes = {arg_attributes}") + return arg_attributes def call_kernel(self, kernel_name): wrapper = V.graph.wrapper_code _, call_args, _, _ = self.kernel_group.args.mlir_argdefs() # generate the code to call this wrapper.generate_kernel_call( - kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", - call_args, cuda=False) + kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args) def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): with self as kernel: @@ -479,7 +475,7 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ src_code = ( partial_code if isinstance(partial_code, str) - else partial_code.finalize() + else partial_code.finalize_all() ) # For consistency, white space could make wrong write_path @@ -753,7 +749,7 @@ def hook(): return "" def def_function(self): - _, call_args, _ = self.kernel_group.args.python_argdefs() + _, call_args, _, _ = self.kernel_group.args.python_argdefs() if self.outer_func_render is not None: partial_code, function_name = self.outer_func_render(input_args=call_args) return PartialRender( @@ -1153,7 +1149,7 @@ def __init__(self, name, input_nodes, layout, input_reorder = None): """ super().__init__(name) self.input_nodes = [node for node in input_nodes if node is not None] - self.output_node: Buffer = 
Buffer("buf_out", layout) + self.output_node: Buffer = Buffer(name="buf_out", layout=layout) self.input_reorder = input_reorder self.layout = layout @@ -1218,7 +1214,10 @@ def make_kernel_render( self.output_node.get_layout(), make_kernel_render, bmreq, + False, # supports_epilogue_fusion self, + kwargs, + "" # Currently Empty description ) def get_tile_candidates(self, **kwargs): diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 8aa849b1..04fa3c8d 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -1,5 +1,6 @@ from typing import List import os +import sys import numpy as np import torch from pathlib import Path @@ -7,6 +8,10 @@ from PyTorchSimFrontend.extension_codecache import hash_prefix from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config +from PyTorchSimFrontend.extension_device_interface import ExtensionDeviceInterface + +from torch._dynamo.device_interface import register_interface_for_device + def import_module_from_path(module_name, path): module_path = Path(path) # Convert to Path object for safety @@ -194,17 +199,21 @@ def setup_device(cls): from PyTorchSimFrontend.mlir.mlir_scheduling import ( MLIRScheduling ) + register_backend_for_device( - "npu", MLIRScheduling, ExtensionWrapperCodegen - ) - assert( - get_scheduling_for_device("npu") == MLIRScheduling + "npu", + lambda scheduling: MLIRScheduling(scheduling), + ExtensionWrapperCodegen ) + import PyTorchSimFrontend.extension_device_op_overrides + assert( get_wrapper_codegen_for_device("npu") == ExtensionWrapperCodegen ) cls.NPU_MODULE = module + sys.modules['torch.npu'] = module + register_interface_for_device(module.custom_device(), ExtensionDeviceInterface) return module def submit(self, batched_req, partition_idx) -> List[RequestReturn]: From b7a275e186ff24f68f91442c6b763e43cfceb2c1 Mon Sep 17 00:00:00 2001 From: OkkyunWoo Date: Wed, 24 Sep 2025 13:28:55 +0000 Subject: [PATCH 043/194] [Test] Add torch.no_grad(), change to use 
torch.nn.ReLU, fuion off --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 1 + tests/test_activation.py | 5 +++-- tests/test_conv2d.py | 25 +++++++++++----------- tests/test_layernorm.py | 5 +++-- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 66155e9c..b6b8dea5 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -97,6 +97,7 @@ def can_fuse_vertical(self, node1, node2): def can_fuse_horizontal(self, node1, node2): if not extension_config.CONFIG_FUSION: return False + if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False _, (vars1, reduce1) = node1.group diff --git a/tests/test_activation.py b/tests/test_activation.py index 575fc7e8..49a9467c 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -23,9 +23,10 @@ def test_ReLU(device, size=(128, 128)): input = torch.randn(size) x1 = input.to(device=device) x2 = input.to("cpu") - opt_fn = torch.compile(dynamic=False)(torch.nn.functional.relu) + ReLU = torch.nn.ReLU() + opt_fn = torch.compile(dynamic=False)(ReLU) y = opt_fn(x1) - cpu_y = torch.nn.functional.relu(x2) + cpu_y = ReLU(x2) test_result("ReLU", y, cpu_y) def test_GeLU(device, size=(128, 128), approximate='none'): diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index e964319d..97e5cdea 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -44,15 +44,16 @@ def custom_conv2d(a, b, bias): module = PyTorchSimRunner.setup_device() device = module.custom_device() torch._dynamo.config.cache_size_limit = 64 - test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) - test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3) - test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, 
kernel_size=7, stride=1, padding=3) - test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3) - test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3) - test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2) - test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, padding=3) - test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=3, stride=2, padding=1) - test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=7, kernel_size=3, stride=2, padding=1) - test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0) - test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0) - test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0) + with torch.no_grad(): + test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) + test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3) + test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3) + test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3) + test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3) + test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2) + test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, 
padding=3) + test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=3, stride=2, padding=1) + test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=7, kernel_size=3, stride=2, padding=1) + test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0) + test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0) + test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0) diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index 28e38d37..a2e842d0 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -44,5 +44,6 @@ def test_LayerNorm(device, size=(64, 64)): from Scheduler.scheduler import PyTorchSimRunner module = PyTorchSimRunner.setup_device() device = module.custom_device() - #test_LayerNorm(device) - test_LayerNorm(device, shape) + with torch.no_grad(): + #test_LayerNorm(device) + test_LayerNorm(device, shape) From 5c5e61c82b1482ec8b2eb48cf64e956bfccd4d94 Mon Sep 17 00:00:00 2001 From: OkkyunWoo Date: Thu, 6 Nov 2025 05:28:52 +0000 Subject: [PATCH 044/194] [Implement] Hook and GuardImpl for extension device --- PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp | 8 ++ PyTorchSimDevice/ExtensionDeviceGuardImpl.h | 127 ++++++++++++++++++ .../extension_device.cpp | 10 +- .../extension_device_interface.py | 0 .../extension_device_op_overrides.py | 0 PyTorchSimDevice/extension_hooks.cpp | 48 +++++++ PyTorchSimDevice/extension_hooks.h | 30 +++++ Scheduler/scheduler.py | 8 +- 8 files changed, 221 insertions(+), 10 deletions(-) create mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp create mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.h rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device.cpp (99%) rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device_interface.py 
(100%) rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device_op_overrides.py (100%) create mode 100644 PyTorchSimDevice/extension_hooks.cpp create mode 100644 PyTorchSimDevice/extension_hooks.h diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp new file mode 100644 index 00000000..a0b1395d --- /dev/null +++ b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp @@ -0,0 +1,8 @@ +#include "ExtensionDeviceGuardImpl.h" +#include + +namespace c10::extension_device::impl { + +C10_REGISTER_GUARD_IMPL(extension_device, ExtensionDeviceGuardImpl); + +} // namespace c10::extension_device::impl diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h new file mode 100644 index 00000000..6d35677b --- /dev/null +++ b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h @@ -0,0 +1,127 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10::extension_device::impl { + +struct ExtensionDeviceGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::PrivateUse1; // ✅ your backend type + + ExtensionDeviceGuardImpl() = default; + + explicit ExtensionDeviceGuardImpl(DeviceType t) { + TORCH_CHECK( + t == static_type, + "ExtensionDeviceGuardImpl initialized with non-extension_device DeviceType: ", + t); + } + + // -------------------------------------------------------------------------- + // 기본적인 device guard (CPU처럼 동작) + // -------------------------------------------------------------------------- + DeviceType type() const override { + return static_type; + } + + Device exchangeDevice(Device d) const override { + TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d); + return d; // nothing to exchange, CPU-like + } + + Device getDevice() const override { + return Device(static_type, 0); + } + + void setDevice(Device d) const override { + TORCH_CHECK(d.type() == 
static_type, "Expected extension_device but got ", d); + } + + void uncheckedSetDevice(Device d) const noexcept override {} + + DeviceIndex deviceCount() const noexcept override { + return 1; // pretend single device + } + + // -------------------------------------------------------------------------- + // Stream handling (동기식이므로 기본 stream만 사용) + // -------------------------------------------------------------------------- + Stream getStream(Device d) const override { + return Stream(Stream::DEFAULT, d); + } + + Stream getNewStream(Device d, int priority = 0) const override { + return Stream(Stream::DEFAULT, d); + } + + Stream getStreamFromGlobalPool(Device d, bool = false) const override { + return Stream(Stream::DEFAULT, d); + } + + Stream exchangeStream(Stream s) const override { + return s; + } + + bool queryStream(const Stream& stream) const override { + (void)stream; + return true; + } + + void synchronizeStream(const Stream& stream) const override { + (void)stream; + } + + void synchronizeDevice(DeviceIndex device_index) const override { + (void)device_index; + } + + // -------------------------------------------------------------------------- + // Event handling (전부 no-op) + // -------------------------------------------------------------------------- + void destroyEvent(void* event, const DeviceIndex device_index) const noexcept override { + (void)event; + (void)device_index; + } + + void record(void** event, const Stream& stream, const DeviceIndex device_index, const EventFlag flag) const override { + (void)event; + (void)stream; + (void)device_index; + (void)flag; + } + + void block(void* event, const Stream& stream) const override { + (void)event; + (void)stream; + } + + bool queryEvent(void* event) const override { + (void)event; + return true; + } + + void synchronizeEvent(void* event) const override { + (void)event; + } + + double elapsedTime(void* start_event, void* end_event, const DeviceIndex device_index) const override { + (void)start_event; + 
(void)end_event; + (void)device_index; + return 0.0; + } + + // -------------------------------------------------------------------------- + // Misc (allocator integration) + // -------------------------------------------------------------------------- + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override { + (void)data_ptr; + (void)stream; + } +}; + +} // namespace c10::extension_device::impl diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimDevice/extension_device.cpp similarity index 99% rename from PyTorchSimFrontend/extension_device.cpp rename to PyTorchSimDevice/extension_device.cpp index b8a6e092..a1dcfcf4 100644 --- a/PyTorchSimFrontend/extension_device.cpp +++ b/PyTorchSimDevice/extension_device.cpp @@ -55,16 +55,12 @@ static inline at::MemoryFormat fix_memory_format(c10::optional return mf; } +#include "ExtensionDeviceGuardImpl.h" + static uint64_t op_counter = 0; static uint64_t last_saved_value = 0; -// register guard -namespace at { -namespace detail { - -C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::impl::NoOpDeviceGuardImpl); - -}} // namespace at::detail +C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::extension_device::impl::ExtensionDeviceGuardImpl); // basic dummy add function at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) { diff --git a/PyTorchSimFrontend/extension_device_interface.py b/PyTorchSimDevice/extension_device_interface.py similarity index 100% rename from PyTorchSimFrontend/extension_device_interface.py rename to PyTorchSimDevice/extension_device_interface.py diff --git a/PyTorchSimFrontend/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py similarity index 100% rename from PyTorchSimFrontend/extension_device_op_overrides.py rename to PyTorchSimDevice/extension_device_op_overrides.py diff --git a/PyTorchSimDevice/extension_hooks.cpp b/PyTorchSimDevice/extension_hooks.cpp new file mode 100644 index 
00000000..aadd6d2a --- /dev/null +++ b/PyTorchSimDevice/extension_hooks.cpp @@ -0,0 +1,48 @@ +#include "extension_hooks.h" + +bool ExtensionPU1Hooks::isBuilt() const { return true; } +bool ExtensionPU1Hooks::isAvailable() const { return true; } + +const at::Generator& ExtensionPU1Hooks::getDefaultGenerator(c10::DeviceIndex idx) const { + if (idx < 0) idx = 0; + static std::vector gens; + static std::mutex m; + std::lock_guard g(m); + if (gens.size() <= (size_t)idx) gens.resize((size_t)idx + 1); + if (!gens[idx].defined()) gens[idx] = at::GetGeneratorForPrivateuse1(idx); + return gens[idx]; // 영속 객체 참조 반환 +} + +at::Generator ExtensionPU1Hooks::getNewGenerator(c10::DeviceIndex idx) const { + if (idx < 0) idx = 0; + return at::GetGeneratorForPrivateuse1(idx); +} + +at::Device ExtensionPU1Hooks::getDeviceFromPtr(void* data) const { + return at::Device(at::kPrivateUse1, 0); // MVP: 단일 디바이스 가정 +} + +bool ExtensionPU1Hooks::isPinnedPtr(const void* data) const { + return false; +} + +at::Allocator* ExtensionPU1Hooks::getPinnedMemoryAllocator() const { + return at::getHostAllocator(at::kPrivateUse1); +} + +bool ExtensionPU1Hooks::hasPrimaryContext(c10::DeviceIndex device_index) const { return true; } + +void ExtensionPU1Hooks::resizePrivateUse1Bytes(const c10::Storage&, size_t) const { + TORCH_CHECK(false, "resizePrivateUse1Bytes not implemented"); +} + +// REGISTER_EXTENSION_HOOKS(ExtensionPU1Hooks); + +namespace { +struct AutoRegistrar { + AutoRegistrar() { + at::RegisterPrivateUse1HooksInterface(new ExtensionPU1Hooks()); + } +}; +static AutoRegistrar _auto_registrar; +} diff --git a/PyTorchSimDevice/extension_hooks.h b/PyTorchSimDevice/extension_hooks.h new file mode 100644 index 00000000..fdf3505a --- /dev/null +++ b/PyTorchSimDevice/extension_hooks.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +struct ExtensionPU1Hooks final : public at::PrivateUse1HooksInterface { + ExtensionPU1Hooks() {} + bool isBuilt() 
const; + bool isAvailable() const; + + const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) const override; + + at::Generator getNewGenerator(c10::DeviceIndex device_index = -1) const override; + + at::Device getDeviceFromPtr(void* data) const override; + + bool isPinnedPtr(const void* data) const override; + + at::Allocator* getPinnedMemoryAllocator() const override; + + bool hasPrimaryContext(c10::DeviceIndex device_index) const override; + + void resizePrivateUse1Bytes(const c10::Storage& /*storage*/, size_t /*newsize*/) const override; +}; \ No newline at end of file diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 04fa3c8d..215700eb 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -8,7 +8,7 @@ from PyTorchSimFrontend.extension_codecache import hash_prefix from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config -from PyTorchSimFrontend.extension_device_interface import ExtensionDeviceInterface +from PyTorchSimDevice.extension_device_interface import ExtensionDeviceInterface from torch._dynamo.device_interface import register_interface_for_device @@ -173,14 +173,16 @@ def setup_device(cls): return cls.NPU_MODULE source_file_path = os.path.dirname(os.path.abspath(__file__)) source_file = os.path.join( - source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp" + source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_device.cpp" ) + hook_file = os.path.join(source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_hooks.cpp") import torch.utils.cpp_extension module = torch.utils.cpp_extension.load( name="npu", sources=[ str(source_file), + str(hook_file), ], extra_cflags=["-g"], verbose=True, @@ -205,7 +207,7 @@ def setup_device(cls): lambda scheduling: MLIRScheduling(scheduling), ExtensionWrapperCodegen ) - import PyTorchSimFrontend.extension_device_op_overrides + import 
PyTorchSimDevice.extension_device_op_overrides assert( get_wrapper_codegen_for_device("npu") From 74704b8fbbc38763b7214c1ca4dd0679623c5b98 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 6 Jan 2026 11:21:20 +0000 Subject: [PATCH 045/194] [CI] Change the trigger condition --- .github/workflows/docker-image-2-8.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml index cb5f73d1..4d511a1a 100644 --- a/.github/workflows/docker-image-2-8.yml +++ b/.github/workflows/docker-image-2-8.yml @@ -1,7 +1,7 @@ name: Docker image CI (PyTorch 2.8) on: - pull_request: + push: branches: [ "torch_v2.8" ] workflow_dispatch: From d3f32988da41de1334159d2d8c783a4a1fdd059a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 6 Jan 2026 12:42:26 +0000 Subject: [PATCH 046/194] [CI] Use CMake 3 to build pytorchsim --- Dockerfile.base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.base b/Dockerfile.base index 897b8195..c5f200bc 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -34,7 +34,7 @@ RUN apt -y update && \ python3-dev python-is-python3 libboost-all-dev \ libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \ - pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/* + pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/* # Download RISC-V tool chain RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ From 07633630d7c008bddb1bdbee3c288e7d8b771aae Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 6 Jan 2026 12:44:55 +0000 Subject: [PATCH 047/194] [CI] Seperate base image --- 
.github/workflows/docker-base-image-2-8.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml index f8649303..3a1d97a1 100644 --- a/.github/workflows/docker-base-image-2-8.yml +++ b/.github/workflows/docker-base-image-2-8.yml @@ -2,7 +2,7 @@ name: Docker Base Image CI (PyTorch 2.8) on: push: - branches: [ "base" ] + branches: [ "base_v2.8" ] workflow_dispatch: repository_dispatch: types: [ build_base ] From 45914036118126799b762e06d990115f4372fde5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 04:43:59 +0000 Subject: [PATCH 048/194] [Fix] PyTorch2.8 support (WIP) --- PyTorchSimDevice/extension_device_op_overrides.py | 2 +- PyTorchSimFrontend/mlir/mlir_common.py | 2 -- PyTorchSimFrontend/mlir/mlir_scheduling.py | 8 +++++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py index b76dae0f..17439b95 100644 --- a/PyTorchSimDevice/extension_device_op_overrides.py +++ b/PyTorchSimDevice/extension_device_op_overrides.py @@ -22,4 +22,4 @@ def synchronize(self) -> str: def device_guard(self, device_idx: int) -> str: return "pass" -register_device_op_overrides("extension_device", ExtensionDeviceOpOverrides()) \ No newline at end of file +register_device_op_overrides("npu", ExtensionDeviceOpOverrides()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index f98a2132..6888f9a1 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -23,7 +23,6 @@ import sympy -import torch.fx from torch.utils._sympy.value_ranges import ValueRanges from torch._inductor.utils import ( get_sympy_Expr_dtype, @@ -33,7 +32,6 @@ ) from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache -from 
PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest from PyTorchSimFrontend.extension_utils import ( free_symbol_startswith, diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index b6b8dea5..2d578c61 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -167,6 +167,8 @@ def revert_group(self, act_nodes, args=None, var_ranges=None): act_node.node.get_store_function(), (args if act_node.node.get_reduction_type() else args[:1]), var_ranges, + args[0], + args[1] ) index_size = [] reduce_size = [] @@ -188,7 +190,7 @@ def codegen_node(self, _node): nodes, key=lambda x: int(x.is_reduction()) ).group - # Note: We assume that ther is at least one loop in the nodes + # Note: We assume that there is at least one loop in the nodes # But, inductor simplifies the group, there could be no loop # In that case, we add dummy loop(size=1) to the group if len(group) == 0: @@ -263,9 +265,9 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info codecache_def.writeline(f"loop_size={loop_size},") codecache_def.writeline(f"spad_info={spad_info},") codecache_def.writeline(f"origins={origins},") - codecache_def.writeline("arg_attributes=arg_attributes,") + codecache_def.writeline(f"arg_attributes={meta_code},") codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})") - wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) + wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False) return kernel_name def codegen_template(self, template_node, prologue_nodes, epilogue_nodes): From b9d4144bdba3c4007079c180934570eac245f61c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 07:51:11 +0000 Subject: [PATCH 049/194] [Fix] Use official prologue fusion path --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 150 ++++++++++----------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git 
a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 2d578c61..3799633c 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -29,62 +29,6 @@ def __init__(self, scheduler): config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it! self.max_fusion_size = 5 - def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool: - # Extract base template node - base_template_node1 = [node for node in node1.get_nodes() if node.is_template()] - base_template_node2 = [node for node in node2.get_nodes() if node.is_template()] - if node1.get_device() != node2.get_device(): - return False - if not (isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance(node2, (SchedulerNode, FusedSchedulerNode))): - return False - - if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate - if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction(): - # For matmul/bmm+reduction case - size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1) - target_symbol = symbols("r0") - try: - stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1] - stride = int(sympify(stride).coeff(target_symbol)) - except: - return False - - # We can't fuse dim=-1 - layout_possible = stride != 1 - # Directed linked? 
- dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1 - dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads]) - return size_match and layout_possible and dependency_check and dependency_size - - # For prologue fusion case - if extension_config.CONFIG_FUSION_PROLOGUE and len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1: - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate - target_node = base_template_node2[0].node - if target_node.origin_node is not None and hasattr(target_node.origin_node.target, "_name") and target_node.origin_node.target._name == 'aten::convolution': - return False - if node1.is_reduction(): - return False - if len(node1.read_writes.writes) != 1: - return False - if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME - return False - - # Currently only BMM, MM support prologue fusion - if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): - return False - # We don't fuse this edge case... 
- if base_template_node2[0].group[1][0][0] == 1: - return False - - if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: - node1 = self.revert_group(node1) - return True - - return self.scheduler.can_fuse_origin(node1, node2) - def _set_flush_status(self, status: bool): self._ready_to_flush = status @@ -100,15 +44,10 @@ def can_fuse_horizontal(self, node1, node2): if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size: return False + _, (vars1, reduce1) = node1.group _, (vars2, reduce2) = node2.group - # Reduction is currently not supported - if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION: - return vars1 == vars2 and reduce1 == reduce2 # and node1.inverse_users == node2.inverse_users - if node1.is_reduction() or node2.is_reduction(): - return False - # Can't fuse two template node if node1.is_template() and node2.is_template(): return False @@ -116,17 +55,25 @@ def can_fuse_horizontal(self, node1, node2): if '_unsafe_index' in node1.get_nodes()[0].node.origins or "_unsafe_index" in node2.get_nodes()[0].node.origins: return False - # Check template node fusion - if node1.is_template() or node2.is_template(): + # Extract base template node + base_template_node1 = [node for node in node1.get_nodes() if node.is_template()] + base_template_node2 = [node for node in node2.get_nodes() if node.is_template()] + + # Case 0: Reduction fusion + if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION: + return vars1 == vars2 and reduce1 == reduce2 + + # Case 1: Template + Pointwise fusion + if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction(): # Don't fuse maxpool template code from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate from 
PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - template_node1 = next((n for n in node1.get_nodes() if n.is_template()), None) - template_node2 = next((n for n in node2.get_nodes() if n.is_template()), None) - if template_node1 and len(node1.get_nodes()) == 1 and isinstance(template_node1.node.template, MLIRMaxPoolTemplate) or \ - template_node2 and len(node2.get_nodes()) == 1 and isinstance(template_node2.node.template, MLIRMaxPoolTemplate): + template_node = base_template_node1[0] + epilogue_node = node2 + + if isinstance(template_node.node.template, MLIRMaxPoolTemplate): return False # Pointwise check @@ -135,23 +82,76 @@ def can_fuse_horizontal(self, node1, node2): if v1_total != v2_total: return False - # Pattern check - template_node, act_node = (template_node1, node2) if template_node1 else (template_node2, node1) - has_depedency = set(act_node.inverse_users) <= set(template_node.get_nodes()) + # Pattern check: check data dependency between act_node and template_node + template_sched_nodes = list(template_node.get_nodes()) + # Buffers produced by the template (its outputs) + template_writes = { + dep + for n in template_sched_nodes + for dep in n.read_writes.writes + } + # Buffers still required by the activation node (unmet) or read by it + epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies } + has_depedency = bool(template_writes) and template_writes.issubset(epilogue_unmet) if not has_depedency: return False # Revert act_node.group : simplify_and_reorder() modified _body, _size, group - if template_node.group != act_node.group: + if template_node.group != epilogue_node.group: # We don't fuse this case... 
if (isinstance(template_node.node.template, MLIRBMMTemplate) or isinstance(template_node.node.template, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1: return False - if list(template_node.group[1][0]) != list(act_node.get_nodes()[0].node.data.get_size()): + if list(template_node.group[1][0]) != list(epilogue_node.get_nodes()[0].node.data.get_size()): return False - self.revert_group(act_node) + self.revert_group(epilogue_node) return True + # Case 2: Tempalte + Reduction fusion + if len(base_template_node1) == 1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: + from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate + from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate + if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + return False + + size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1) + target_symbol = symbols("r0") + try: + stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1] + stride = int(sympify(stride).coeff(target_symbol)) + except: + return False + + # We can't fuse dim=-1 + layout_possible = stride != 1 + # Directed linked? 
+ dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1 + dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads]) + return size_match and layout_possible and dependency_check and dependency_size + + # Case 3: Prologue(Pointwise) + Tempalte + if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: + from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate + from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate + + target_node = base_template_node2[0].node + # Currently only BMM, MM support prologue fusion + if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + return False + + if len(node1.read_writes.writes) != 1: + return False + if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME + return False + + # We don't fuse this edge case... 
+ if base_template_node2[0].group[1][0][0] == 1: + return False + + if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: + node1 = self.revert_group(node1) + return True + # Check elementwise fusion if vars1 == vars2 and reduce1 == reduce2: return True @@ -270,7 +270,7 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False) return kernel_name - def codegen_template(self, template_node, prologue_nodes, epilogue_nodes): + def codegen_template(self, template_node, epilogue_nodes, prologue_nodes): # Generate template code template_buffer = template_node.node kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) From 9abc0602b7b279e064ec2a4ec3ac921fae658d64 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 10:25:51 +0000 Subject: [PATCH 050/194] [Fix] Don't split a reduce kernel --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 3799633c..640a00be 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -7,12 +7,14 @@ from PyTorchSimFrontend import extension_config from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel +from torch.utils._ordered_set import OrderedSet from torch._inductor import config from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode from torch._inductor.utils import IndentedBuffer from torch._inductor.virtualized import V from torch._inductor.ir import LoopBody from torch._inductor import dependencies +from torch._inductor.codegen.common import BackendFeature from . import mlir_common from . 
import mlir_lowering # DO NOT REMOVE THIS LINE, it is used for lowering @@ -35,6 +37,10 @@ def _set_flush_status(self, status: bool): def reset_kernel_group(self): self.kernel_group = mlir_common.MLIRWrapperKenrelGroup() + def get_backend_features(self, device): + """Return a set of .codegen.common.BackendFeature()""" + return OrderedSet([BackendFeature.REDUCE_TO_SINGLE_ELEMENT]) + def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) From 2c7264b903bc2aae7d215a2c1f9de592c2ac94a3 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 10:43:44 +0000 Subject: [PATCH 051/194] [Fix] Add a missing reduction fusion condition --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 640a00be..35ccfee8 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -53,6 +53,11 @@ def can_fuse_horizontal(self, node1, node2): _, (vars1, reduce1) = node1.group _, (vars2, reduce2) = node2.group + # For input/dependency checks + reads1 = {dep.name for dep in node1.read_writes.reads} + reads2 = {dep.name for dep in node2.read_writes.reads} + writes1 = {dep.name for dep in node1.read_writes.writes} + writes2 = {dep.name for dep in node2.read_writes.writes} # Can't fuse two template node if node1.is_template() and node2.is_template(): @@ -66,8 +71,20 @@ def can_fuse_horizontal(self, node1, node2): base_template_node2 = [node for node in node2.get_nodes() if node.is_template()] # Case 0: Reduction fusion - if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION: - return vars1 == vars2 and reduce1 == reduce2 + if ( + node1.is_reduction() + and node2.is_reduction() + and not node1.is_template() + and not 
node2.is_template() + and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION + ): + # 1) Same loop/iteration domain + same_iter = vars1 == vars2 and reduce1 == reduce2 + # 2) No data dependency between the two reductions + no_dependency = not ( + writes1 & (reads2 | writes2) or writes2 & (reads1 | writes1) + ) + return same_iter and no_dependency # Case 1: Template + Pointwise fusion if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction(): From b951b95ac596692a83fca926d0a44de3776d5e30 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 11:20:55 +0000 Subject: [PATCH 052/194] [Fix] update indirect_index interface for v2.8 --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- PyTorchSimFrontend/mlir/mlir_common.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 9f5c0674..bc4592b4 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -667,7 +667,7 @@ def store_reduction(self, name, index, value): dram_shape, tile_shape, attribute) self.reductions_suffix.writeline(common.DeferredLine(name, code)) - def indirect_indexing(self, index_var, size, check=True): + def indirect_indexing(self, index_var, size, check=True, wrap_neg=True): return str(index_var) def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 6888f9a1..468f1a47 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -645,7 +645,7 @@ def store(self, name, index, value, mode=None): def reduction(self, dtype, src_dtype, reduction_type, value): raise NotImplementedError() - def indirect_indexing(self, index_var, size, check): + def indirect_indexing(self, index_var, size, check, wrap_neg): raise 
NotImplementedError() def codegen_global_init(self): @@ -888,9 +888,9 @@ def inner(*args, **kwargs): return inner @staticmethod - def indirect_indexing(index_var, size, check=True): + def indirect_indexing(index_var, size, check=True, wrap_neg=True): # Skip CSE since this doesn't return an expression - return self.indirect_indexing(index_var, size, check) + return self.indirect_indexing(index_var, size, check, wrap_neg) @staticmethod def load(name: str, index: sympy.Expr): From c6ba98c6e0d82bdadc013918c54aeaa56a0520df Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 7 Jan 2026 13:53:31 +0000 Subject: [PATCH 053/194] [Fix] Allow cpp kernel code in the wrapper function --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index bc4592b4..654099c1 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -91,6 +91,7 @@ def write_header(self): from torch._inductor.hooks import run_intermediate_hooks from torch._inductor.utils import maybe_profile from torch._inductor.codegen.memory_planning import _align as align + from torch._inductor.async_compile import AsyncCompile from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile @@ -105,6 +106,7 @@ def write_header(self): alloc_from_pool = torch.ops.inductor._alloc_from_pool reinterpret_tensor = torch.ops.aten._reinterpret_tensor custom_async_compile = CustomAsyncCompile() + async_compile = AsyncCompile() os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__ print(f\'Wrapper Codegen Path = {{__file__}}\') """ @@ -138,6 +140,7 @@ def device2host_memcpy(buffer): ) def write_prefix(self): + self.write_async_compile_wait() self.prefix.splice( """ def call(args): From fd07eda99e4f8ceea01d0388a39d3d8952f0c139 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 
Jan 2026 05:42:25 +0000 Subject: [PATCH 054/194] [Ops] Use V.kernel instead of argument passing --- PyTorchSimFrontend/mlir/mlir_common.py | 2 +- PyTorchSimFrontend/mlir/mlir_ops.py | 413 +++++++++++------------ PyTorchSimFrontend/mlir/mlir_template.py | 2 +- 3 files changed, 208 insertions(+), 209 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 468f1a47..7b6ee11c 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -867,7 +867,7 @@ class CSEProxy: @staticmethod def __getattr__(name: str) -> Callable[..., common.CSEVariable]: # type: ignore[misc] def inner(*args, **kwargs): - code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, **kwargs) + code, ret_info = getattr(parent_handler, name)(*args, **kwargs) target_buffer = self.target_buffer_override.get() target_cse = self.target_cse_override.get() if isinstance(code, common.DeferredLine): diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 21995512..2b964c55 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -20,7 +20,7 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, class ExtensionOverrides(common.OpOverrides): @staticmethod - def constant(value, src_type, *args, var_info=None, **kwargs): + def constant(value, src_type, *args, **kwargs): if isinstance(src_type, torch.dtype): src_type = mlir_common.DTYPE_TO_MLIR[src_type] @@ -37,8 +37,8 @@ def constant(value, src_type, *args, var_info=None, **kwargs): return f'arith.constant {value} : {src_type}', [1, src_type] @staticmethod - def broadcast(operand, target_size, *args, var_info=None, **kwargs): - src_size, dtype = var_info[operand] + def broadcast(operand, target_size, *args, **kwargs): + src_size, dtype = V.kernel.var_info[operand] src_shape = f"vector<{src_size}x{dtype}>" if src_size > 1 else dtype dst_shape = 
f"vector<{target_size}x{dtype}>" @@ -63,8 +63,8 @@ def broadcast(operand, target_size, *args, var_info=None, **kwargs): return op_str, [target_size, dtype] @staticmethod - def broadcast_unflat(operand, target_size, *args, var_info=None, **kwargs): - src_size, dtype = var_info[operand] + def broadcast_unflat(operand, target_size, *args, **kwargs): + src_size, dtype = V.kernel.var_info[operand] outer_dim = target_size // src_size src_shape = f"vector<{src_size}x{dtype}>" @@ -87,33 +87,33 @@ def randint64(self, *args, **kwargs): # Special operaitons @staticmethod - def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs): + def masked(mask, body, other, *args, tile_size=16, dtype="f32", ninf_declared=False, **kwargs): result = body() val = ops.constant(other, dtype, *args, **kwargs) result = ops.where(mask, result, val) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def where(condition, operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) - cond_type = var_info[condition] - operand_type = var_info[operand1] + def where(condition, operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) + cond_type = V.kernel.var_info[condition] + operand_type = V.kernel.var_info[operand1] condition = ops.to_bool(condition) if cond_type[0] < tile_size: condition = ops.broadcast(condition, tile_size) elif cond_type[0] > tile_size: operand1 = ops.broadcast(operand1, cond_type[0]) operand2 = ops.broadcast(operand2, cond_type[0]) - tile_size, ret_type = var_info[operand1] + tile_size, ret_type = V.kernel.var_info[operand1] shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else "" return f"arith.select 
%{condition}, %{operand1}, %{operand2} : {cond_shape}, {shape}", [tile_size, ret_type] @staticmethod - def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): + def to_dtype(operand, dst_mlir_dtype, *args, **kwargs): # Extract source information - src_mlir_dtype = var_info[operand][1] - tile_size = var_info[operand][0] + src_mlir_dtype = V.kernel.var_info[operand][1] + tile_size = V.kernel.var_info[operand][0] # Normalize destination type (Torch dtype -> MLIR string) if isinstance(dst_mlir_dtype, torch.dtype): @@ -172,13 +172,13 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs): return op_str, [tile_size, dst_mlir_dtype] @staticmethod - def identity(operand, *args, var_info=None, **kwargs): - operand_info = var_info[operand] + def identity(operand, *args, **kwargs): + operand_info = V.kernel.var_info[operand] return operand, operand_info @staticmethod - def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs): - tile_size, current_src_type = var_info[operand] + def to_dtype_bitcast(operand, dtype, *args, **kwargs): + tile_size, current_src_type = V.kernel.var_info[operand] if isinstance(dtype, torch.dtype): dst_mlir_type = mlir_common.DTYPE_TO_MLIR[dtype] @@ -201,11 +201,12 @@ def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs): # Binary element wise operations @staticmethod - def binary_elementwise_common(operand1, operand2, var_info): + def binary_elementwise_common(operand1, operand2): + V.kernel.var_info = V.kernel.var_info operand1.bounds = operand1.bounds.unknown() operand2.bounds = operand2.bounds.unknown() - op_type1 = var_info[operand1] - op_type2 = var_info[operand2] + op_type1 = V.kernel.var_info[operand1] + op_type2 = V.kernel.var_info[operand2] # Tile size check if op_type1[0] != op_type2[0]: # Try to broad cast @@ -213,33 +214,33 @@ def binary_elementwise_common(operand1, operand2, var_info): rhs_tile_size, rhs_dtype = op_type2 if lhs_tile_size > rhs_tile_size: operand2 = 
ops.broadcast(operand2, lhs_tile_size) - op_type2 = var_info[operand2] + op_type2 = V.kernel.var_info[operand2] elif lhs_tile_size < rhs_tile_size: operand1 = ops.broadcast(operand1, rhs_tile_size) - op_type1 = var_info[operand1] + op_type1 = V.kernel.var_info[operand1] # Data type check if op_type1[1] != op_type2[1]: if op_type1[1] == "index" or op_type1 == "index": if op_type1[1] == "index": operand1 = ops.index_cast(operand1, op_type2[1]) - op_type1 = var_info[operand1] + op_type1 = V.kernel.var_info[operand1] if op_type2[1] == "index": operand2 = ops.index_cast(operand2, op_type1[1]) - op_type2 = var_info[operand2] + op_type2 = V.kernel.var_info[operand2] elif op_type1[1][0] == "i" and op_type2[1][0] == "f": operand1 = ops.to_dtype(operand1, op_type2[1]) - op_type1 = var_info[operand1] + op_type1 = V.kernel.var_info[operand1] elif op_type1[1][0] == "f" and op_type2[1][0] == "i": operand2 = ops.to_dtype(operand2, op_type1[1]) - op_type2 = var_info[operand2] + op_type2 = V.kernel.var_info[operand2] elif op_type1[1][0] == op_type2[1][0]: if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]: operand2 = ops.ext(operand2, op_type1[1]) - op_type2 = var_info[operand2] + op_type2 = V.kernel.var_info[operand2] elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]: operand1 = ops.ext(operand1, op_type2[1]) - op_type1 = var_info[operand1] + op_type1 = V.kernel.var_info[operand1] else: raise NotImplementedError("Unsupported type converting") @@ -249,45 +250,45 @@ def binary_elementwise_common(operand1, operand2, var_info): return tile_size, ret_type, operand1, operand2 @staticmethod - def abs(operand, *args, var_info=None, **kwargs): + def abs(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def exp(operand, *args, var_info=None, **kwargs): + def exp(operand, *args, **kwargs): # Check scalar - op_type = var_info[operand] + op_type = V.kernel.var_info[operand] if op_type[0] == 1: operand = 
ops.broadcast(operand, 4) val = ops.exp(operand) result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] + return result, V.kernel.var_info[result] + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.exp %{operand} : {shape}', [tile_size, dtype] @staticmethod - def exp2(operand, *args, var_info=None, **kwargs): + def exp2(operand, *args, **kwargs): # Hands-on part: implement exp2 using math.exp2 - # var_info = {operand: [tile_size, dtype]} - # Ex) var_info[operand] = [8, "f32"] + # V.kernel.var_info = {operand: [tile_size, dtype]} + # Ex) V.kernel.var_info[operand] = [8, "f32"] ln2 = math.log(2) coeff = ops.constant(ln2, "f32") operand = ops.mul(operand, coeff) - return ops.exp(operand), var_info[operand] + return ops.exp(operand), V.kernel.var_info[operand] @staticmethod - def expm1(operand, *args, var_info=None, **kwargs): + def expm1(operand, *args, **kwargs): coeff = ops.constant(1.0, "f32") operand = ops.exp(operand) operand = ops.sub(operand, coeff) - return operand, var_info[operand] + return operand, V.kernel.var_info[operand] @staticmethod - def sqrt(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def sqrt(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -300,14 +301,14 @@ def sqrt(operand, *args, var_info=None, **kwargs): return f'math.sqrt %{operand} : {shape}', [tile_size, dtype] @staticmethod - def relu(operand, *args, var_info=None, **kwargs): - src_mlir_dtype = var_info[operand][1] - tile_size = var_info[operand][0] + def relu(operand, *args, **kwargs): + src_mlir_dtype = V.kernel.var_info[operand][1] + tile_size = V.kernel.var_info[operand][0] return ops.maximum(operand, ops.constant(0, src_mlir_dtype)), [tile_size, src_mlir_dtype] @staticmethod - def minimum(operand1, operand2, *args, var_info=None, **kwargs): 
- tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def minimum(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type[0] == "f": opcode = f'arith.minimumf' @@ -316,8 +317,8 @@ def minimum(operand1, operand2, *args, var_info=None, **kwargs): return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def maximum(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def maximum(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type[0] == "f": opcode = f'arith.maximumf' @@ -326,17 +327,17 @@ def maximum(operand1, operand2, *args, var_info=None, **kwargs): return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def cos(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def cos(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] # Check scalar - op_type = var_info[operand] + op_type = V.kernel.var_info[operand] if op_type[0] == 1: operand = ops.broadcast(operand, 4) val = ops.cos(operand) result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] + return result, V.kernel.var_info[result] + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -347,17 +348,17 @@ def cos(operand, *args, var_info=None, **kwargs): return f'math.cos %{operand} : {shape}', [tile_size, dtype] @staticmethod - def sin(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] 
+ def sin(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] # Check scalar - op_type = var_info[operand] + op_type = V.kernel.var_info[operand] if op_type[0] == 1: operand = ops.broadcast(operand, 4) val = ops.sin(operand) result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] + return result, V.kernel.var_info[result] + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -368,51 +369,51 @@ def sin(operand, *args, var_info=None, **kwargs): return f'math.sin %{operand} : {shape}', [tile_size, dtype] @staticmethod - def tan(operand, *args, var_info=None, **kwargs): + def tan(operand, *args, **kwargs): sin_res = ops.sin(operand) cos_res = ops.cos(operand) operand = ops.truediv(sin_res, cos_res) - return operand, var_info[operand] + return operand, V.kernel.var_info[operand] @staticmethod - def lgamma(operand, *args, var_info=None, **kwargs): + def lgamma(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def erf(operand, *args, var_info=None, **kwargs): + def erf(operand, *args, **kwargs): # Check scalar - op_type = var_info[operand] + op_type = V.kernel.var_info[operand] if op_type[0] == 1: operand = ops.broadcast(operand, 4) val = ops.erf(operand) result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] + return result, V.kernel.var_info[result] + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f'math.erf %{operand} : {shape}', [tile_size, dtype] @staticmethod - def cosh(operand, *args, var_info=None, **kwargs): + def cosh(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def sinh(operand, *args, var_info=None, **kwargs): + def sinh(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def tanh(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def 
tanh(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] # Check scalar - op_type = var_info[operand] + op_type = V.kernel.var_info[operand] if op_type[0] == 1: operand = ops.broadcast(operand, 4) val = ops.tanh(operand) result = ops.extractelement(val, 0) - return result, var_info[result] - op_type = var_info[operand] + return result, V.kernel.var_info[result] + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -423,80 +424,80 @@ def tanh(operand, *args, var_info=None, **kwargs): return f'math.tanh %{operand} : {shape}', [tile_size, dtype] @staticmethod - def acos(operand, *args, var_info=None, **kwargs): + def acos(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def acosh(operand, *args, var_info=None, **kwargs): + def acosh(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def asin(operand, *args, var_info=None, **kwargs): + def asin(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def asinh(operand, *args, var_info=None, **kwargs): + def asinh(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def atan2(operand1, operand2, *args, var_info=None, **kwargs): + def atan2(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def atan(operand, *args, var_info=None, **kwargs): + def atan(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def atanh(operand, *args, var_info=None, **kwargs): + def atanh(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def copysign(operand1, operand2, *args, var_info=None, **kwargs): + def copysign(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def erfc(operand, *args, var_info=None, **kwargs): + def erfc(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def erfinv(operand, *args, var_info=None, **kwargs): + def erfinv(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def 
frexp(operand, *args, var_info=None, **kwargs): + def frexp(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def hypot(operand1, operand2, *args, var_info=None, **kwargs): + def hypot(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def log10(operand, *args, var_info=None, **kwargs): + def log10(operand, *args, **kwargs): val_ln = ops.log(operand) - tile_size, dtype = var_info[val_ln] + tile_size, dtype = V.kernel.var_info[val_ln] inv_ln10 = 1/math.log(10) const_op = ops.constant(inv_ln10, dtype) # Multiply: ln(x) * (1/ln(10)) result = ops.mul(val_ln, const_op) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def log2(operand, *args, var_info=None, **kwargs): + def log2(operand, *args, **kwargs): val_ln = ops.log(operand) - tile_size, dtype = var_info[val_ln] + tile_size, dtype = V.kernel.var_info[val_ln] inv_ln10 = 1/math.log(2) const_op = ops.constant(inv_ln10, dtype) # Multiply: ln(x) * (1/ln(10)) result = ops.mul(val_ln, const_op) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def log(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def log(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -508,109 +509,107 @@ def log(operand, *args, var_info=None, **kwargs): return f'math.log %{operand} : {shape}', [tile_size, dtype] @staticmethod - def log1p(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def log1p(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] const_one = ops.constant(1, dtype) - # 3. 
덧셈 연산: (x + 1) - # ops.add가 (result_ssa, result_info)를 반환한다고 가정 val_add = ops.add(operand, const_one) result = ops.log(val_add) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def nextafter(operand1, operand2, *args, var_info=None, **kwargs): + def nextafter(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def logical_and(operand1, operand2, *args, var_info=None, **kwargs): - if var_info[operand1][1] != "i1": + def logical_and(operand1, operand2, *args, **kwargs): + if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if var_info[operand2][1] != "i1": + if V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.and_(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def logical_or(operand1, operand2, *args, var_info=None, **kwargs): - if var_info[operand1][1] != "i1": + def logical_or(operand1, operand2, *args, **kwargs): + if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if var_info[operand2][1] != "i1": + if V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.or_(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def logical_xor(operand1, operand2, *args, var_info=None, **kwargs): - if var_info[operand1][1] != "i1": + def logical_xor(operand1, operand2, *args, **kwargs): + if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if var_info[operand2][1] != "i1": + if V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.xor(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def logical_not(operand, *args, var_info=None, **kwargs): - op_info = var_info[operand] + def logical_not(operand, *args, **kwargs): + op_info = 
V.kernel.var_info[operand] tile_size = op_info[0] dtype = op_info[1] zero_const = ops.constant(0, dtype) result = ops.eq(operand, zero_const) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def bitwise_and(operand1, operand2, *args, var_info=None, **kwargs): + def bitwise_and(operand1, operand2, *args, **kwargs): # Float check - if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"): + if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") result = ops.and_(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def bitwise_not(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def bitwise_not(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] # Float check - if var_info[operand][1].startswith("f"): + if V.kernel.var_info[operand][1].startswith("f"): raise ValueError("Bitwise NOT not supported for floats") neg_one = ops.constant(-1, dtype) result = ops.xor(operand, neg_one) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def bitwise_or(operand1, operand2, *args, var_info=None, **kwargs): + def bitwise_or(operand1, operand2, *args, **kwargs): # Float check - if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"): + if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") result = ops.or_(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def bitwise_xor(operand1, operand2, *args, var_info=None, **kwargs): + def bitwise_xor(operand1, operand2, *args, **kwargs): # Float check - if var_info[operand1][1].startswith("f") or 
var_info[operand2][1].startswith("f"): + if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") result = ops.xor(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def bitwise_left_shift(operand1, operand2, *args, var_info=None, **kwargs): + def bitwise_left_shift(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def bitwise_right_shift(operand1, operand2, *args, var_info=None, **kwargs): + def bitwise_right_shift(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def rsqrt(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def rsqrt(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -622,28 +621,28 @@ def rsqrt(operand, *args, var_info=None, **kwargs): return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype] @staticmethod - def sigmoid(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def sigmoid(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] one = ops.constant(1, dtype) return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, dtype] @staticmethod - def fmod(operand1, operand2, *args, var_info=None, **kwargs): + def fmod(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def isinf(operand, *args, var_info=None, **kwargs): + def isinf(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def isnan(operand, *args, var_info=None, **kwargs): + def isnan(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def round(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def round(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] shape = 
f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): @@ -652,8 +651,8 @@ def round(operand, *args, var_info=None, **kwargs): return operand, [tile_size, dtype] @staticmethod - def floor(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def floor(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): @@ -662,12 +661,12 @@ def floor(operand, *args, var_info=None, **kwargs): return operand, [tile_size, dtype] @staticmethod - def sign(operand, *args, var_info=None, **kwargs): + def sign(operand, *args, **kwargs): raise NotImplementedError @staticmethod - def trunc(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def trunc(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): @@ -676,8 +675,8 @@ def trunc(operand, *args, var_info=None, **kwargs): return operand, [tile_size, dtype] @staticmethod - def ceil(operand, *args, var_info=None, **kwargs): - tile_size, dtype = var_info[operand] + def ceil(operand, *args, **kwargs): + tile_size, dtype = V.kernel.var_info[operand] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): @@ -687,8 +686,8 @@ def ceil(operand, *args, var_info=None, **kwargs): # Logical operations @staticmethod - def neg(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def neg(operand, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -700,8 +699,8 @@ def neg(operand, *args, var_info=None, **kwargs): return f'arith.negf %{operand} : {shape}', [tile_size, dtype] @staticmethod - def reciprocal(operand, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def reciprocal(operand, *args, **kwargs): + op_type = 
V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] @@ -712,8 +711,8 @@ def reciprocal(operand, *args, var_info=None, **kwargs): return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype] @staticmethod - def eq(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def eq(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "oeq" @@ -727,8 +726,8 @@ def eq(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def ne(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def ne(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "one" @@ -742,8 +741,8 @@ def ne(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def lt(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def lt(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "olt" @@ -757,8 +756,8 @@ def lt(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def gt(operand1, 
operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def gt(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "ogt" @@ -772,8 +771,8 @@ def gt(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def le(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def le(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "ole" @@ -787,8 +786,8 @@ def le(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def ge(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def ge(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if ret_type[0] == "f": op_type = "arith.cmpf" attribute = "oge" @@ -802,29 +801,29 @@ def ge(operand1, operand2, *args, var_info=None, **kwargs): return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] @staticmethod - def add(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def add(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, 
operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.add{ret_type[0]}' return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def sub(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def sub(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.sub{ret_type[0]}' return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def mul(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def mul(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.mul{ret_type[0]}' return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def pow(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def pow(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) # Type check & auto cast if ret_type.startswith("f"): operand1 = ops.to_dtype(operand1, "f32") @@ -837,37 +836,37 @@ def pow(operand1, operand2, *args, var_info=None, **kwargs): return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type] @staticmethod - def and_(operand1, 
operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def and_(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def or_(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def or_(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def xor(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def xor(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def lshift(operand1, operand2, *args, var_info=None, **kwargs): + def lshift(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def rshift(operand1, operand2, *args, var_info=None, **kwargs): + def rshift(operand1, operand2, *args, **kwargs): raise NotImplementedError @staticmethod - def truncdiv(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + 
def truncdiv(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type.startswith("f"): @@ -877,8 +876,8 @@ def truncdiv(operand1, operand2, *args, var_info=None, **kwargs): return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def floordiv(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def floordiv(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type.startswith("f"): @@ -889,8 +888,8 @@ def floordiv(operand1, operand2, *args, var_info=None, **kwargs): return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def truediv(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def truediv(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if not ret_type.startswith("f"): @@ -899,12 +898,12 @@ def truediv(operand1, operand2, *args, var_info=None, **kwargs): return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def int_truediv(operand1, operand2, *args, var_info=None, **kwargs): + def int_truediv(operand1, operand2, *args, **kwargs): """ True division for Integers (Int -> Float). Promotes integers to floats, then performs floating-point division. 
""" - tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) if not src_type.startswith("f"): target_float_type = "f32" operand1 = ops.to_dtype(operand1, target_float_type) @@ -912,11 +911,11 @@ def int_truediv(operand1, operand2, *args, var_info=None, **kwargs): src_type = target_float_type result = ops.truediv(operand1, operand2) - return result, var_info[result] + return result, V.kernel.var_info[result] @staticmethod - def mod(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def mod(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type[0] == "f": raise NotImplementedError("Not support remainder operation for floating point") @@ -925,8 +924,8 @@ def mod(operand1, operand2, *args, var_info=None, **kwargs): return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def remainder(operand1, operand2, *args, var_info=None, **kwargs): - tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info) + def remainder(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type if ret_type.startswith("f"): @@ -937,28 +936,28 @@ def remainder(operand1, operand2, *args, var_info=None, **kwargs): return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] @staticmethod - def square(operand, *args, var_info=None, **kwargs): + def square(operand, *args, 
**kwargs): result = ops.mul(operand, operand) - return result, var_info[result] + return result, V.kernel.var_info[result] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # PyTorchSim specific operations @staticmethod - def alloc(size, src_type, *args, var_info=None, **kwargs): + def alloc(size, src_type, *args, **kwargs): return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type] @staticmethod - def extractelement(operand, idx, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def extractelement(operand, idx, *args, **kwargs): + op_type = V.kernel.var_info[operand] tile_size = op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype] @staticmethod - def ext(operand, dtype, *args, var_info=None, **kwargs): - op_type = var_info[operand] + def ext(operand, dtype, *args, **kwargs): + op_type = V.kernel.var_info[operand] shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}" target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}" if op_type[0] == "f": @@ -968,8 +967,8 @@ def ext(operand, dtype, *args, var_info=None, **kwargs): return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype] @staticmethod - def to_bool(operand, *args, var_info=None, **kwargs): - tile_size, ret_type = var_info[operand] + def to_bool(operand, *args, **kwargs): + tile_size, ret_type = V.kernel.var_info[operand] if ret_type == "i1": return operand, [tile_size, ret_type] @@ -984,15 +983,15 @@ def step(size, dtype, *args, **kwargs): return f"vector.step : {index_shape}", [size, dtype] @staticmethod - def index_cast(operand, target_type, *args, var_info=None, **kwrags): - op_type = var_info[operand] + def index_cast(operand, target_type, *args, **kwrags): + op_type = V.kernel.var_info[operand] src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else 
op_type[1] des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type] @staticmethod - def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs): - operand_type = var_info[operand] + def shape_cast(operand, src_shape, dst_shape, *args, **kwargs): + operand_type = V.kernel.var_info[operand] return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type @staticmethod @@ -1008,7 +1007,7 @@ def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_nam return line, [red_size, type_name] @staticmethod - def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs): + def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, **kwargs): if compute_vec_size == 1: vshape = f"{mlir_dtype}" operation = "affine.load" @@ -1020,8 +1019,8 @@ def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, va return line, [compute_vec_size, mlir_dtype] @staticmethod - def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs): - compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1] + def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kwargs): + compute_vec_size, mlir_dtype = V.kernel.var_info[operand][0], V.kernel.var_info[operand][1] if compute_vec_size == 1: vshape = f"{mlir_dtype}" diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 4cfe71bf..8f92554c 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -925,7 +925,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): _, operand_type = self.var_info[value] if mlir_dtype != operand_type: - value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info) + value = 
ops.to_dtype(value, mlir_dtype) compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"]) # Generate vector load instruction buffer_name = name if not store_force else None From 4bed31b4e48031ac4dacfeb4062180c34b166ca8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 07:52:40 +0000 Subject: [PATCH 055/194] [Fix] Set epilogue fusion condition --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 35ccfee8..f5fadbc3 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -115,7 +115,7 @@ def can_fuse_horizontal(self, node1, node2): } # Buffers still required by the activation node (unmet) or read by it epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies } - has_depedency = bool(template_writes) and template_writes.issubset(epilogue_unmet) + has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes) if not has_depedency: return False From 758b5b379b5880c3214b5d2a3356f5603850d9f6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 07:53:39 +0000 Subject: [PATCH 056/194] [Fix] Support Identity indexing + Fix wrapper codegen --- .../mlir/mlir_codegen_backend.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 654099c1..72cd691e 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -21,7 +21,7 @@ is_welford_reduction, sympy_product ) -from torch.utils._sympy.functions import ModularIndexing, FloorDiv +from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Identity from PyTorchSimFrontend import extension_codecache from
PyTorchSimFrontend import extension_config from . import mlir_common @@ -198,26 +198,27 @@ def generate(self, is_inference): with contextlib.ExitStack() as stack: stack.enter_context(self.wrapper_call.indent()) self.memory_plan_reuse() - for line in self.lines: - # Add buffer plan hook for dealloc - if isinstance(line, memory_planning.DeallocFromPoolLine): - self.wrapper_call.writeline(f"sram_plan_postfix('{line.node.get_name()}', {line.node.get_name()})") - elif isinstance(line, str) and "del" in line: - name = line.split(" ")[1] - self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})") - - if isinstance(line, wrapper.MemoryPlanningLine): - line.codegen(self.wrapper_call) - elif isinstance(line, wrapper.KernelCallLine): - self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args)) - else: - if isinstance(line, wrapper.WrapperLine): + with self.set_writeline(self.wrapper_call.writeline): + for line in self.lines: + # Add buffer plan hook for dealloc + if isinstance(line, memory_planning.DeallocFromPoolLine): + self.wrapper_call.writeline(f"sram_plan_postfix('{line.node.get_name()}', {line.node.get_name()})") + elif isinstance(line, str) and "del" in line: + name = line.split(" ")[1] + self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})") + + if isinstance(line, wrapper.MemoryPlanningLine): line.codegen(self.wrapper_call) + elif isinstance(line, wrapper.KernelCallLine): + self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args)) else: - self.wrapper_call.writeline(line) - # Add buffer plan hook for alloc - if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine): - self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})") + if isinstance(line, wrapper.WrapperLine): + line.codegen(self.wrapper_call) + else: + self.wrapper_call.writeline(line) + # Add buffer plan hook for alloc + if isinstance(line, 
memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine): + self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})") output_refs = self.get_output_refs() self.codegen_sram_plan_postfix(output_refs) self.mark_output_type() @@ -334,6 +335,7 @@ def convert_index(self, expr, buffer): expr_str = expr_str.replace("//", " floordiv ") else: raise NotImplementedError("What is this case?") + first_arg = expr.args[0] if len(first_arg.free_symbols) != 1: raise NotImplementedError("What is this case?") @@ -356,6 +358,11 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com if len(expr.args) == 0 and len(indirect_dims) == 0: return expr + # Replace Identity arguments with Identity.args[0] + for arg in expr.args: + if isinstance(arg, Identity): + expr = expr.replace(arg, arg.args[0] if arg.args else arg) + if len(expr.args) == 0: args = [expr] else: @@ -677,9 +684,10 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): # In case of index expr, dimension size should be divisible by tile size if not self.kernel_group.tile_desc.is_dim_dividable(self.ranges): new_tile_size = self.kernel_group.tile_desc.adjust_tile_to_divisible(self.ranges) + prior_tile_size, prior_ranges = self.kernel_group.tile_desc.get_tile_size(), self.ranges self.kernel_group.tile_desc.set_tile_size(new_tile_size) self.reset("recompile") - raise mlir_common.RecompileSignal(f"Index access (tile size {self.kernel_group.tile_desc.get_tile_size()} is not divisible by {self.ranges})") + raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})") tile_size = tile_desc.get_tile_size_per_lane() compute_vec_size = tile_desc.get_compute_vec_size() From a7ab604788e84f2ddfef55cd46e370deac5bc44d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 07:54:31 +0000 Subject: [PATCH 057/194] [Fix] Keep contextvar after reset() --- 
PyTorchSimFrontend/mlir/mlir_common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 7b6ee11c..3bbf3db7 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -609,9 +609,14 @@ def __init__(self, kernel_group, reason=None): self.recodegen = reason # spad overflow, tile size, vlane stride self.stop_autotune = False - # Context var for codegen - self.target_buffer_override = contextvars.ContextVar("Handler_compute_override", default=self.compute) - self.target_cse_override = contextvars.ContextVar("Handler_cse_override", default=self.cse) + # Context var for codegen - preserve existing ContextVar on reset to avoid Token mismatch + # Don't recreate if already exists (e.g., when reset() is called during active context manager) + if not hasattr(self, 'target_buffer_override'): + instance_id = id(self) + self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute) + self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse) + else: + pass def set_ranges(self, lengths, reduction_lengths): if self.call_ranges: From cd52f57713e2ec18439d28eae47d8f8346aaa4f9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 11:19:26 +0000 Subject: [PATCH 058/194] [Frontend] Add decomposition of default attention --- .../extension_device_op_overrides.py | 4 +- PyTorchSimFrontend/mlir/mlir_decomposition.py | 146 ++++++++++++++++++ PyTorchSimFrontend/mlir/mlir_scheduling.py | 1 + 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 PyTorchSimFrontend/mlir/mlir_decomposition.py diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py index 17439b95..27a47357 100644 --- a/PyTorchSimDevice/extension_device_op_overrides.py +++
b/PyTorchSimDevice/extension_device_op_overrides.py @@ -3,6 +3,7 @@ from textwrap import dedent from torch._inductor.codegen.common import DeviceOpOverrides, register_device_op_overrides +from torch._inductor.codegen.cpu_device_op_overrides import CpuDeviceOpOverrides class ExtensionDeviceOpOverrides(DeviceOpOverrides): def import_get_raw_stream_as(self, name: str) -> str: @@ -22,4 +23,5 @@ def synchronize(self) -> str: def device_guard(self, device_idx: int) -> str: return "pass" -register_device_op_overrides("npu", ExtensionDeviceOpOverrides()) \ No newline at end of file +register_device_op_overrides("npu", ExtensionDeviceOpOverrides()) +register_device_op_overrides("cpu", CpuDeviceOpOverrides()) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py new file mode 100644 index 00000000..33389a91 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py @@ -0,0 +1,146 @@ +import math +import torch +import torch.nn.functional as F +from torch._inductor.decomposition import register_decomposition + +aten = torch.ops.aten + +@register_decomposition(aten._native_multi_head_attention.default) +def decompose_native_multi_head_attention( + query, + key, + value, + embed_dim: int, + num_heads: int, + qkv_weight, + qkv_bias, + proj_weight, + proj_bias, + mask=None, + need_weights: bool = False, +): + """ + Decompose _native_multi_head_attention into scaled_dot_product_attention operations. + + Based on F.scaled_dot_product_attention and nn.MultiheadAttention implementation: + 1. QKV projection (if needed - but query/key/value may already be projected) + 2. Reshape to multi-head format + 3. Scaled dot product: Q @ K^T / sqrt(head_dim) + 4. Softmax + 5. Attention @ V + 6. 
Reshape back and output projection + """ + head_dim = embed_dim // num_heads + scale_factor = 1.0 / math.sqrt(head_dim) + + # Get input shapes - assuming [batch, seq_len, embed_dim] format + query_shape = query.shape + if len(query_shape) == 3: + # [batch, seq_len, embed_dim] format + batch_size = query_shape[0] + seq_len = query_shape[1] + elif len(query_shape) == 2: + # [seq_len, embed_dim] -> add batch dimension + batch_size = 1 + seq_len = query_shape[0] + query = query.unsqueeze(0) # [1, seq_len, embed_dim] + key = key.unsqueeze(0) + value = value.unsqueeze(0) + else: + # Fallback: assume first dim is batch, second is seq_len + batch_size = query_shape[0] if len(query_shape) > 0 else 1 + seq_len = query_shape[1] if len(query_shape) > 1 else query_shape[0] + + # Step 1: QKV projection (if query/key/value are not already projected) + # In many cases, query/key/value are already projected, so we check if qkv_weight is used + # For now, assume they might need projection + # Note: In practice, _native_multi_head_attention often receives already projected inputs + + # Reshape for projection: [batch, seq_len, embed_dim] -> [batch*seq_len, embed_dim] + if len(query.shape) == 3: + query_flat = query.view(-1, embed_dim) + key_flat = key.view(-1, embed_dim) + value_flat = value.view(-1, embed_dim) + else: + query_flat = query + key_flat = key + value_flat = value + + # QKV projection using qkv_weight and qkv_bias + # qkv_weight shape: [3*embed_dim, embed_dim] -> split into 3 parts + # Split qkv_weight into Q, K, V weights + qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0) + if qkv_bias is not None: + # qkv_bias shape: [3*embed_dim] -> split into 3 parts + qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0) + else: + qkv_bias_q = qkv_bias_k = qkv_bias_v = None + + # Project Q, K, V + q = torch.nn.functional.linear(query_flat, qkv_weight_q, qkv_bias_q) + k = torch.nn.functional.linear(key_flat, qkv_weight_k, 
qkv_bias_k) + v = torch.nn.functional.linear(value_flat, qkv_weight_v, qkv_bias_v) + + # Reshape back: [batch*seq_len, embed_dim] -> [batch, seq_len, embed_dim] + q = q.view(batch_size, seq_len, embed_dim) + k = k.view(batch_size, seq_len, embed_dim) + v = v.view(batch_size, seq_len, embed_dim) + + # Step 2: Reshape to multi-head format + # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim] + q = q.view(batch_size, seq_len, num_heads, head_dim) + k = k.view(batch_size, seq_len, num_heads, head_dim) + v = v.view(batch_size, seq_len, num_heads, head_dim) + + # Transpose to [batch, num_heads, seq_len, head_dim] for bmm + # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim] + q = q.view(batch_size, seq_len, num_heads, head_dim) + k = k.view(batch_size, seq_len, num_heads, head_dim) + v = v.view(batch_size, seq_len, num_heads, head_dim) + + # Transpose to [batch, num_heads, seq_len, head_dim] for bmm + q = q.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] + k = k.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] + v = v.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] + + # Step 3: Scaled dot product attention + # Scale Q + q_scaled = q * scale_factor + + # Q @ K^T: [batch, num_heads, seq_len, head_dim] @ [batch, num_heads, head_dim, seq_len] + # -> [batch, num_heads, seq_len, seq_len] + k_transposed = k.transpose(-2, -1) # [batch, num_heads, head_dim, seq_len] + scores = torch.matmul(q_scaled, k_transposed) # [batch, num_heads, seq_len, seq_len] + + # Step 4: Apply mask if provided + if mask is not None: + scores = scores + mask + + # Step 5: Softmax along the last dimension (seq_len dimension) + # Stable softmax: subtract max, exp, divide by sum + scores_max = scores.amax(dim=-1, keepdim=True) # [batch, num_heads, seq_len, 1] + scores_shifted = scores - scores_max + scores_exp = scores_shifted.exp() + scores_sum = scores_exp.sum(dim=-1, keepdim=True) # [batch, num_heads, seq_len, 1] + attn_weights = scores_exp / 
scores_sum # [batch, num_heads, seq_len, seq_len] + + # Step 6: Attention @ V + # [batch, num_heads, seq_len, seq_len] @ [batch, num_heads, seq_len, head_dim] + # -> [batch, num_heads, seq_len, head_dim] + attn_output = torch.matmul(attn_weights, v) + + # Step 7: Reshape back to [batch, seq_len, embed_dim] + attn_output = attn_output.transpose(1, 2) # [batch, seq_len, num_heads, head_dim] + attn_output = attn_output.contiguous().view(batch_size, seq_len, embed_dim) + + # Step 8: Output projection + attn_output_flat = attn_output.view(-1, embed_dim) + output = torch.nn.functional.linear(attn_output_flat, proj_weight, proj_bias) + output = output.view(batch_size, seq_len, embed_dim) + + if need_weights: + # Return attention weights: [batch, num_heads, seq_len, seq_len] -> [batch, seq_len, seq_len] + attn_weights_mean = attn_weights.mean(dim=1) # Average over heads + return output, attn_weights_mean + else: + return (output, None) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index f5fadbc3..bfcda258 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -18,6 +18,7 @@ from . import mlir_common from . import mlir_lowering # DO NOT REMOVE THIS LINE, it is used for lowering +from . 
import mlir_decomposition # DO NOT REMOVE THIS LINE, it is used for decomposition class MLIRScheduling(BaseScheduling): count = 0 From 08e0c8be825a8c41633ca02da5972c1f8089d053 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 12:54:16 +0000 Subject: [PATCH 059/194] [Fix] Add missing case --- .../mlir/mlir_codegen_backend.py | 2 ++ PyTorchSimFrontend/mlir/mlir_common.py | 25 +++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 72cd691e..27fdf757 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -360,6 +360,8 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com # Replace Identity arguments with Identity.args[0] for arg in expr.args: + if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity): + expr = expr.replace(arg.args[1], arg.args[1].args[0]) if isinstance(arg, Identity): expr = expr.replace(arg, arg.args[0] if arg.args else arg) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 3bbf3db7..d96eb452 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -609,14 +609,9 @@ def __init__(self, kernel_group, reason=None): self.recodegen = reason # spad overflow, tile size, vlane stride self.stop_autotune = False - # Context var for codegen - preserve existing ContextVar on reset to avoid Token mismatch - # Don't recreate if already exists (e.g., when reset() is called during active context manager) - if not hasattr(self, 'target_buffer_override'): - instance_id = id(self) - self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute) - self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse) - else: - pass 
+ instance_id = id(self) + self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute) + self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse) def set_ranges(self, lengths, reduction_lengths): if self.call_ranges: @@ -697,7 +692,9 @@ def extract_dividers(self, implicit_ops): } new_index = operand.index.subs(subs_map) for arg in new_index.args: - if len(arg.free_symbols) != 1: + if arg.is_number: + continue + if len(arg.free_symbols) > 1: raise NotImplementedError("Not supporting this view operation...!") if arg.is_Mul and arg.args[0].is_number: arg = arg.args[1] @@ -852,18 +849,20 @@ def rename_indexing(self, index) -> sympy.Expr: @contextmanager def override_buffer_cse(self, *, buffer=None, cse=None): + buffer_override = self.target_buffer_override + cse_override = self.target_cse_override target_buffer = target_cse = None try: if buffer is not None: - target_buffer = self.target_buffer_override.set(buffer) + target_buffer = buffer_override.set(buffer) if cse is not None: - target_cse = self.target_cse_override.set(cse) + target_cse = cse_override.set(cse) yield self finally: if target_cse is not None: - self.target_cse_override.reset(target_cse) + cse_override.reset(target_cse) if target_buffer is not None: - self.target_buffer_override.reset(target_buffer) + buffer_override.reset(target_buffer) def __enter__(self): class CSEProxy: From 1d1508acc3be5623c0a3672b03e3c63e9d664414 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 8 Jan 2026 12:54:47 +0000 Subject: [PATCH 060/194] [Test] Add GQA test file --- PyTorchSimFrontend/mlir/mlir_decomposition.py | 61 +++- tests/test_gqa.py | 335 ++++++++++++++++++ 2 files changed, 377 insertions(+), 19 deletions(-) create mode 100644 tests/test_gqa.py diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py index 33389a91..141fa9e4 100644 --- 
a/PyTorchSimFrontend/mlir/mlir_decomposition.py +++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py @@ -67,14 +67,37 @@ def decompose_native_multi_head_attention( value_flat = value # QKV projection using qkv_weight and qkv_bias - # qkv_weight shape: [3*embed_dim, embed_dim] -> split into 3 parts - # Split qkv_weight into Q, K, V weights - qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0) - if qkv_bias is not None: - # qkv_bias shape: [3*embed_dim] -> split into 3 parts - qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0) + # Check if GQA (Grouped Query Attention) is used + # Standard MHA: qkv_weight shape = [3*embed_dim, embed_dim] + # GQA: qkv_weight shape = [embed_dim + 2*kv_embed_dim, embed_dim] where kv_embed_dim < embed_dim + qkv_weight_total = qkv_weight.shape[0] + + # Determine if GQA: if qkv_weight is not exactly 3*embed_dim, it might be GQA + if qkv_weight_total == 3 * embed_dim: + # Standard MHA: split equally + qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0) + if qkv_bias is not None: + qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0) + else: + qkv_bias_q = qkv_bias_k = qkv_bias_v = None + kv_embed_dim = embed_dim + kv_heads = num_heads else: - qkv_bias_q = qkv_bias_k = qkv_bias_v = None + # GQA: Q has embed_dim, K and V share the rest + # Assume Q = embed_dim, K = V = (qkv_weight_total - embed_dim) / 2 + q_dim = embed_dim + kv_dim = (qkv_weight_total - embed_dim) // 2 + qkv_weight_q = qkv_weight[:q_dim] + qkv_weight_k = qkv_weight[q_dim:q_dim + kv_dim] + qkv_weight_v = qkv_weight[q_dim + kv_dim:] + if qkv_bias is not None: + qkv_bias_q = qkv_bias[:q_dim] + qkv_bias_k = qkv_bias[q_dim:q_dim + kv_dim] + qkv_bias_v = qkv_bias[q_dim + kv_dim:] + else: + qkv_bias_q = qkv_bias_k = qkv_bias_v = None + kv_embed_dim = kv_dim + kv_heads = kv_embed_dim // head_dim # Number of KV heads # Project Q, K, V q = 
torch.nn.functional.linear(query_flat, qkv_weight_q, qkv_bias_q) @@ -83,25 +106,25 @@ def decompose_native_multi_head_attention( # Reshape back: [batch*seq_len, embed_dim] -> [batch, seq_len, embed_dim] q = q.view(batch_size, seq_len, embed_dim) - k = k.view(batch_size, seq_len, embed_dim) - v = v.view(batch_size, seq_len, embed_dim) + k = k.view(batch_size, seq_len, kv_embed_dim) + v = v.view(batch_size, seq_len, kv_embed_dim) # Step 2: Reshape to multi-head format # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim] q = q.view(batch_size, seq_len, num_heads, head_dim) - k = k.view(batch_size, seq_len, num_heads, head_dim) - v = v.view(batch_size, seq_len, num_heads, head_dim) - - # Transpose to [batch, num_heads, seq_len, head_dim] for bmm - # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim] - q = q.view(batch_size, seq_len, num_heads, head_dim) - k = k.view(batch_size, seq_len, num_heads, head_dim) - v = v.view(batch_size, seq_len, num_heads, head_dim) + k = k.view(batch_size, seq_len, kv_heads, head_dim) + v = v.view(batch_size, seq_len, kv_heads, head_dim) # Transpose to [batch, num_heads, seq_len, head_dim] for bmm q = q.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] - k = k.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] - v = v.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] + k = k.transpose(1, 2) # [batch, kv_heads, seq_len, head_dim] + v = v.transpose(1, 2) # [batch, kv_heads, seq_len, head_dim] + + # GQA: If key/value have fewer heads, repeat them to match query heads + if kv_heads < num_heads: + repeat_factor = num_heads // kv_heads + k = k.repeat_interleave(repeat_factor, dim=1) # [batch, num_heads, seq_len, head_dim] + v = v.repeat_interleave(repeat_factor, dim=1) # [batch, num_heads, seq_len, head_dim] # Step 3: Scaled dot product attention # Scale Q diff --git a/tests/test_gqa.py b/tests/test_gqa.py new file mode 100644 index 00000000..c5f2f6f6 --- /dev/null +++ b/tests/test_gqa.py @@ 
-0,0 +1,335 @@ +import sys +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch._dynamo +import argparse + + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + + +class GQAMultiheadAttention(nn.Module): + """ + Grouped Query Attention (GQA) implementation. + Query has num_heads, but key/value have num_kv_heads (num_kv_heads < num_heads). + """ + def __init__(self, embed_dim, num_heads, num_kv_heads=None, head_dim=None, bias=True, dropout=0.0): + super().__init__() + assert embed_dim % num_heads == 0 + if head_dim is None: + head_dim = embed_dim // num_heads + assert embed_dim == num_heads * head_dim + + # If num_kv_heads is not specified, use num_heads (standard MHA) + if num_kv_heads is None: + num_kv_heads = num_heads + + assert num_kv_heads <= num_heads + assert embed_dim % num_kv_heads == 0 + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.dropout = dropout + + # QKV projection: Q has embed_dim, K and V have kv_embed_dim each + kv_embed_dim = num_kv_heads * head_dim + total_qkv_dim = embed_dim + 2 * kv_embed_dim + + self.qkv_proj = nn.Linear(embed_dim, total_qkv_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def forward(self, query, key=None, value=None, attn_mask=None, need_weights=False): + """ + Args: + query: [batch, seq_len, embed_dim] or [seq_len, batch, embed_dim] + key: optional, same shape as query + value: optional, same shape as query + attn_mask: optional attention mask + need_weights: whether to return attention weights + """ + 
# For compatibility with nn.MultiheadAttention API + if key is None: + key = query + if value is None: + value = query + + # Handle batch_first vs batch_second + if query.dim() == 3: + batch_first = True + batch_size, seq_len, _ = query.shape + else: + batch_first = False + seq_len, batch_size, _ = query.shape + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + # Project QKV + # Use query for QKV projection (standard MHA/GQA pattern) + qkv = self.qkv_proj(query) # [batch, seq_len, total_qkv_dim] + + # Split into Q, K, V + kv_embed_dim = self.num_kv_heads * self.head_dim + q = qkv[:, :, :self.embed_dim] # [batch, seq_len, embed_dim] + k = qkv[:, :, self.embed_dim:self.embed_dim + kv_embed_dim] # [batch, seq_len, kv_embed_dim] + v = qkv[:, :, self.embed_dim + kv_embed_dim:] # [batch, seq_len, kv_embed_dim] + + # Reshape to multi-head format + q = q.view(batch_size, seq_len, self.num_heads, self.head_dim) # [batch, seq_len, num_heads, head_dim] + k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim) # [batch, seq_len, num_kv_heads, head_dim] + v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim) # [batch, seq_len, num_kv_heads, head_dim] + + # Transpose for attention: [batch, num_heads, seq_len, head_dim] + q = q.transpose(1, 2) # [batch, num_heads, seq_len, head_dim] + k = k.transpose(1, 2) # [batch, num_kv_heads, seq_len, head_dim] + v = v.transpose(1, 2) # [batch, num_kv_heads, seq_len, head_dim] + + # Scaled dot product attention with GQA support + # enable_gqa=True allows different number of heads for Q vs K/V + attn_output = F.scaled_dot_product_attention( + q, k, v, + attn_mask=attn_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=False, + enable_gqa=(self.num_kv_heads < self.num_heads) + ) # [batch, num_heads, seq_len, head_dim] + + # Reshape back: [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, embed_dim] + attn_output = attn_output.transpose(1, 2) # [batch, 
seq_len, num_heads, head_dim] + attn_output = attn_output.contiguous().view(batch_size, seq_len, self.embed_dim) + + # Output projection + output = self.out_proj(attn_output) # [batch, seq_len, embed_dim] + + if not batch_first: + output = output.transpose(0, 1) # [seq_len, batch, embed_dim] + + if need_weights: + # Compute attention weights for return + # This is simplified - in practice you'd want the actual attention weights + attn_weights = None + return output, attn_weights + else: + return output + + +def test_gqa_attention(device, batch=1, seq_len=32, embed_dim=768, num_heads=12, num_kv_heads=4): + """ + Test Grouped Query Attention (GQA) where num_kv_heads < num_heads. + + Args: + device: target device + batch: batch size + seq_len: sequence length + embed_dim: embedding dimension + num_heads: number of query heads + num_kv_heads: number of key/value heads (should be <= num_heads) + """ + print(f"Testing GQA Attention (batch={batch}, seq_len={seq_len}, embed_dim={embed_dim}, " + f"num_heads={num_heads}, num_kv_heads={num_kv_heads})") + + # Create GQA model + gqa = GQAMultiheadAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + bias=True, + dropout=0.0 + ).eval() + + # Initialize weights + torch.nn.init.normal_(gqa.qkv_proj.weight, mean=0.0, std=0.02) + torch.nn.init.normal_(gqa.qkv_proj.bias, mean=0.0, std=0.02) + torch.nn.init.normal_(gqa.out_proj.weight, mean=0.0, std=0.02) + torch.nn.init.normal_(gqa.out_proj.bias, mean=0.0, std=0.02) + + # Create input + x = torch.randn(batch, seq_len, embed_dim) + query = x.clone() + key = x.clone() + value = x.clone() + + # Run on custom device + gqa_device = gqa.to(device) + q1, k1, v1 = query.to(device), key.to(device), value.to(device) + + compiled_gqa = torch.compile(gqa_device, dynamic=False) + with torch.no_grad(): + out_device = compiled_gqa(q1, k1, v1) + + # Run on CPU + gqa_cpu = gqa.cpu() + q2, k2, v2 = query.cpu(), key.cpu(), value.cpu() + with torch.no_grad(): + out_cpu 
= gqa_cpu(q2, k2, v2) + + test_result("GQA Attention", out_device, out_cpu) + print("Max diff > ", torch.max(torch.abs(out_device.cpu() - out_cpu))) + print("GQA Attention Simulation Done") + + +def test_standard_mha_via_gqa(device, batch=1, seq_len=32, embed_dim=768, num_heads=12): + """ + Test standard Multi-Head Attention using GQA with num_kv_heads == num_heads. + This should behave the same as standard MHA. + """ + print(f"Testing Standard MHA via GQA (batch={batch}, seq_len={seq_len}, " + f"embed_dim={embed_dim}, num_heads={num_heads})") + + test_gqa_attention(device, batch, seq_len, embed_dim, num_heads, num_kv_heads=num_heads) + + +def test_repeat_interleave_compilation(device, batch=1, seq_len=32, embed_dim=768, num_heads=12, num_kv_heads=4): + """ + Test that repeat_interleave operation compiles and works correctly using scaled_dot_product_attention implementation. + + This test uses the exact implementation from F.scaled_dot_product_attention to verify + that repeat_interleave works correctly when enable_gqa=True. 
+ + Args: + device: target device + batch: batch size + seq_len: sequence length + embed_dim: embedding dimension + num_heads: number of query heads + num_kv_heads: number of key/value heads (should be < num_heads) + """ + import math + + print(f"Testing repeat_interleave compilation using scaled_dot_product_attention implementation " + f"(batch={batch}, seq_len={seq_len}, embed_dim={embed_dim}, " + f"num_heads={num_heads}, num_kv_heads={num_kv_heads})") + + head_dim = embed_dim // num_heads + assert num_kv_heads < num_heads, "num_kv_heads must be less than num_heads for GQA" + + # Create Q, K, V tensors + # Q: [batch, num_heads, seq_len, head_dim] + # K, V: [batch, num_kv_heads, seq_len, head_dim] + q = torch.randn(batch, num_heads, seq_len, head_dim) + k = torch.randn(batch, num_kv_heads, seq_len, head_dim) + v = torch.randn(batch, num_kv_heads, seq_len, head_dim) + + # Move to device + q_device = q.to(device) + k_device = k.to(device) + v_device = v.to(device) + + # Implementation from F.scaled_dot_product_attention + def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, + is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor: + L, S = query.size(-2), key.size(-2) + scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale + attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device) + if is_causal: + assert attn_mask is None + temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf")) + else: + attn_bias = attn_mask + attn_bias + + if enable_gqa: + key = key.repeat_interleave(query.size(-3)//key.size(-3), -3) + value = value.repeat_interleave(query.size(-3)//value.size(-3), -3) + + attn_weight = query @ key.transpose(-2, -1) * scale_factor + attn_weight += attn_bias + 
attn_weight = torch.softmax(attn_weight, dim=-1) + return attn_weight, value, attn_weight @ value + + # Compile the function + compiled_attn = torch.compile(scaled_dot_product_attention, dynamic=False) + + # Run on custom device with enable_gqa=True + with torch.no_grad(): + output_device = compiled_attn(q_device, k_device, v_device, + attn_mask=None, dropout_p=0.0, + is_causal=False, scale=None, enable_gqa=True) + + # Run on CPU for comparison + q_cpu = q.cpu() + k_cpu = k.cpu() + v_cpu = v.cpu() + with torch.no_grad(): + output_cpu = scaled_dot_product_attention(q_cpu, k_cpu, v_cpu, + attn_mask=None, dropout_p=0.0, + is_causal=False, scale=None, enable_gqa=True) + + # Compare results + test_result("repeat_interleave in scaled_dot_product_attention", output_device[0], output_cpu[0]) + print("Max diff > ", torch.max(torch.abs(output_device[0].cpu() - output_cpu[0]))) + test_result("repeat_interleave in scaled_dot_product_attention", output_device[1], output_cpu[1]) + print("Max diff > ", torch.max(torch.abs(output_device[1].cpu() - output_cpu[1]))) + test_result("repeat_interleave in scaled_dot_product_attention", output_device[2], output_cpu[2]) + print("Max diff > ", torch.max(torch.abs(output_device[2].cpu() - output_cpu[2]))) + print("repeat_interleave compilation test Done") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--device", type=str, default="npu", help="Device to use") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--seq_len", type=int, default=32, help="Sequence length") + parser.add_argument("--embed_dim", type=int, default=768, help="Embedding dimension") + parser.add_argument("--num_heads", type=int, default=8, help="Number of query heads") + parser.add_argument("--num_kv_heads", type=int, default=4, help="Number of key/value heads") + parser.add_argument("--test_standard", action="store_true", help="Also test standard MHA via GQA") + 
parser.add_argument("--test_repeat_interleave", action="store_true", help="Test repeat_interleave compilation") + + args = parser.parse_args() + + sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) + from Scheduler.scheduler import PyTorchSimRunner + module = PyTorchSimRunner.setup_device() + device = module.custom_device() + + test_repeat_interleave_compilation( + device=device, + batch=args.batch, + seq_len=args.seq_len, + embed_dim=args.embed_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads + ) + + # Test GQA + test_gqa_attention( + device=device, + batch=args.batch, + seq_len=args.seq_len, + embed_dim=args.embed_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads + ) + + # Optionally test standard MHA via GQA + # if args.test_standard: + # test_standard_mha_via_gqa( + # device=args.device, + # batch=args.batch, + # seq_len=args.seq_len, + # embed_dim=args.embed_dim, + # num_heads=args.num_heads + # ) From 862ba443c81b910c66bb2dd80b151571a11add8d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 9 Jan 2026 09:55:53 +0000 Subject: [PATCH 061/194] [Fix+Log] Change logging system + Fix meta_code interface --- PyTorchSimFrontend/extension_codecache.py | 17 +- PyTorchSimFrontend/extension_config.py | 42 ++++- .../mlir/mlir_codegen_backend.py | 51 +++--- PyTorchSimFrontend/mlir/mlir_ops.py | 3 + PyTorchSimFrontend/mlir/mlir_scheduling.py | 3 +- PyTorchSimFrontend/mlir/mlir_template.py | 30 ++-- Scheduler/scheduler.py | 21 ++- Simulator/simulator.py | 151 +++++++++--------- 8 files changed, 189 insertions(+), 129 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index ef8c63e6..5066d214 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -10,6 +10,9 @@ from PyTorchSimFrontend import extension_config from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator +# 
Configure logger for extension_codecache module (WARNING level by default) +logger = extension_config.setup_logger() + LOCK_TIMEOUT = 600 def hash_prefix(hash_value): @@ -166,8 +169,8 @@ def load(cls, source_code, subprocess.check_call(translate_cmd) subprocess.check_call(llc_cmd) except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) + logger.error(f"Command failed with exit code {e.returncode}") + logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") assert(0) val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes) @@ -179,8 +182,10 @@ def load(cls, source_code, spad_size = val_llvm_caller.get_spad_size(target) spad_usage = stack_size + spad_size # Spad usage per lane if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage: - print(f"[Warning] Scratchpad size exceeded: required {spad_usage} bytes, " - f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available.") + logger.debug( + f"Scratchpad size exceeded: required {spad_usage} bytes, " + f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available." 
+ ) raise SpadOverflowError() # Launch tile graph generator @@ -197,8 +202,8 @@ def load(cls, source_code, subprocess.check_call(gem5_translate_cmd) subprocess.check_call(gem5_llc_cmd) except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) + logger.error(f"Command failed with exit code {e.returncode}") + logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") assert(0) if not extension_config.pytorchsim_timing_mode: diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 2b1b3102..b0bcac7f 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -2,6 +2,7 @@ import sys import importlib import yaml +import logging CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") @@ -134,4 +135,43 @@ def load_plan_from_module(module_path): CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) -CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) \ No newline at end of file +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) + + +def setup_logger(name=None, level=None): + """ + Setup a logger with consistent formatting across all modules. 
+ + Args: + name: Logger name (default: __name__ of calling module) + level: Logging level (default: DEBUG if CONFIG_DEBUG_MODE else INFO) + + Returns: + Logger instance + """ + if name is None: + import inspect + # Get the calling module's name + frame = inspect.currentframe().f_back + name = frame.f_globals.get('__name__', 'PyTorchSim') + + # Convert logger name to lowercase + name = name.lower() + logger = logging.getLogger(name) + + # Only configure if not already configured (avoid duplicate handlers) + if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter( + fmt='[%(asctime)s.%(msecs)03d] [%(levelname)s] [%(name)s] %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # Set log level + if level is None: + level = logging.DEBUG if CONFIG_DEBUG_MODE else logging.INFO + logger.setLevel(level) + + return logger \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 27fdf757..d0c8f815 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -2,7 +2,6 @@ import sympy import re import os -import math from functools import reduce from operator import mul import torch @@ -29,6 +28,9 @@ from .mlir_ops import ExtensionOverrides from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest +# Configure logger for mlir_codegen_backend module +logger = extension_config.setup_logger() + def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: # Since load promotes all half-precision inputs to float, the initial @@ -95,11 +97,14 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE + from PyTorchSimFrontend.extension_config 
import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE, setup_logger from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels + # Configure logger for generated wrapper code + _logger = setup_logger("PyTorchSimFrontend.mlir.generated_wrapper") + aten = torch.ops.aten inductor_ops = torch.ops.inductor assert_size_stride = torch._C._dynamo.guards.assert_size_stride @@ -108,7 +113,7 @@ def write_header(self): custom_async_compile = CustomAsyncCompile() async_compile = AsyncCompile() os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__ - print(f\'Wrapper Codegen Path = {{__file__}}\') + _logger.info(f'Wrapper Codegen Path = {{__file__}}') """ ) self.header.splice( @@ -909,15 +914,14 @@ def make_choices(self, nodes, kernel_name): # Try initial tile size self.reset(None) - src_code = super().codegen_nodes(nodes, kernel_name) + src_code, meta_code = super().codegen_nodes(nodes, kernel_name) current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) search_space.add(current_tile_sz) - if extension_config.CONFIG_DEBUG_MODE: - print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + logger.debug(f"Auto-tune: Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) + choices.append((bench_runner, src_code, meta_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride)) while prevent_infinite_loop < 10 and candidate_axes: for axis in list(candidate_axes): @@ -939,7 +943,7 @@ def 
make_choices(self, nodes, kernel_name): continue self.reset(None) - src_code = super().codegen_nodes(nodes, kernel_name) + src_code, meta_code = super().codegen_nodes(nodes, kernel_name) current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) # FIXME. How to intergrate this constraint to tile system? @@ -956,11 +960,10 @@ def make_choices(self, nodes, kernel_name): # Add this choice search_space.add(current_tile_sz) - if extension_config.CONFIG_DEBUG_MODE: - print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") + logger.debug(f"Auto-tune: Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}") self._prepare_simulator_headers(src_code) bench_runner = self.run_bench(nodes, kernel_name, src_code) - choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) + choices.append((bench_runner, src_code, meta_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride)) prevent_infinite_loop += 1 self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold return choices @@ -976,18 +979,20 @@ def get_cycle(choice): return float("inf") return float("inf") # Exceeded maximum number of autotuning attempts choices = self.make_choices(*args) - if len(choices) == 0: # Can't autotune - return [None, None] + return [None, None, None] + + # Get cycle time for each choice with ThreadPoolExecutor(max_workers=8) as executor: results = list(executor.map(get_cycle, choices)) - max_idx = results.index(min(results)) + min_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") - if extension_config.CONFIG_DEBUG_MODE: - 
self._log_autotune_result(choices[max_idx], results[max_idx]) - optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1] - return optimal_src_code, loop_size + + self._log_autotune_result(choices[min_idx], results[min_idx]) + + optimal_src_code, meta_code, loop_size = choices[min_idx][1], choices[min_idx][2], choices[min_idx][-1] + return optimal_src_code, meta_code, loop_size def run_bench(self, nodes, kernel_name, src_code): _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() @@ -1015,9 +1020,9 @@ def run_bench(self, nodes, kernel_name, src_code): return bmreq.make_run_fn(dummy_inputs, dummy_outputs) def _log_autotune_result(self, best_choice, best_cycle): - print( - f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, " - f"vlane_stride: {best_choice[3]}, " + logger.debug( + f"Auto-tune: Optimal tile size: {list(best_choice[3])}, " + f"vlane_stride: {best_choice[4]}, " f"cycles: {best_cycle}" ) @@ -1025,9 +1030,9 @@ def codegen_nodes(self, nodes, kernel_name): src_code, meta_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode: - optimal_src_code = self.autotune(nodes, kernel_name)[0] + optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2] if optimal_src_code is not None: - return optimal_src_code + return optimal_src_code, meta_code return src_code, meta_code def _prepare_simulator_headers(self, src_code): diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 2b964c55..dce59ed6 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -1,10 +1,13 @@ import math import torch +import warnings from torch._inductor.codegen import common from torch._inductor.virtualized import V, _ops as ops from . 
import mlir_common +warnings.filterwarnings('ignore', message='undefined OpHandler\\..*, please add missing op schema') + def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape): if reduction_type == "sum": return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index bfcda258..f2bcba7e 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -299,8 +299,7 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes): template_buffer = template_node.node kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group) _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() - src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) - meta_code = kernel.meta_kernel() + src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with V.set_kernel_handler(kernel): kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 8f92554c..304d0090 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -32,6 +32,9 @@ from PyTorchSimFrontend import extension_config from . 
import mlir_common +# Configure logger for mlir_template module +logger = extension_config.setup_logger() + class IndentedBufferGroup: def __init__(self, kernel: 'MLIRTemplateKernel', prefix=""): self.kernel = kernel @@ -386,7 +389,6 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio return tile_candidates def meta_kernel(self): - wrapper = V.graph.wrapper_code kernel_arg_attributes = self.kernel_arg_attributes _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs() if kernel_arg_attributes is not None: @@ -483,38 +485,36 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ buffer.splice(src_code) src_code = buffer.getvalue() self._prepare_simulator_headers(src_code) - return src_code + meta_code = self.meta_kernel() + return src_code, meta_code def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): choices = [] for tile_info in tile_candidates: - if extension_config.CONFIG_DEBUG_MODE: - # Compute Tile M, N, K DMA Tile M, N, K - print(f"[Auto-tune] Trying tile size: {list(tile_info)}") - src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + # Compute Tile M, N, K DMA Tile M, N, K + logger.debug(f"Auto-tune: Trying tile size: {list(tile_info)}") + src_code, meta_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) bench_runner = self.run_bench([template_node], self.kernel_name, src_code) - choices.append((bench_runner, src_code, tile_info, self.loop_size)) + choices.append((bench_runner, src_code, meta_code, tile_info, self.loop_size)) self.reset(reason=None) return choices def _log_autotune_result(self, best_choice, best_cycle): - tile_size = best_choice[2] - print( - f"[Auto-tune] Optimal tile size: {list(tile_size)}, " + tile_size = best_choice[3] + logger.debug( + f"Auto-tune: Optimal tile size: {list(tile_size)}, " f"cycles: {best_cycle}" ) def 
codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes): if "autotune" in extension_config.codegen_mapping_strategy and len(tile_candidates): - src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) + src_code, meta_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) self.loop_size = loop_size else: tile_info = tile_candidates[0] if tile_candidates else None - src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) + src_code, meta_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info) - with V.set_kernel_handler(self): - self.meta_kernel() - return src_code + return src_code, meta_code def _prepare_simulator_headers(self, src_code): spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 215700eb..3f5673a8 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -12,6 +12,9 @@ from torch._dynamo.device_interface import register_interface_for_device +# Configure logger for Scheduler module +logger = extension_config.setup_logger() + def import_module_from_path(module_name, path): module_path = Path(path) # Convert to Path object for safety @@ -380,7 +383,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, elif engine_select == Scheduler.RR_ENGINE: self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) else: - print(f"Not supporetd engine type {engine_select}") + logger.error(f"Not supported engine type {engine_select}") exit(1) def add_request(self, request: Request, request_time=-1): @@ -441,9 +444,11 @@ def finish_request(self, req : Request): self.finish_queue.append(req) self.request_queue[req.request_queue_idx].remove(req) turnaround_time, response_time, tbt_time = 
req.get_latency() - print(f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: " - f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, " - f"response time: {response_time} tbt_time: {tbt_time}") + logger.info( + f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: " + f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, " + f"response time: {response_time} tbt_time: {tbt_time}" + ) def per_schedule(self, request_queue_idx): # Wait partition is idle @@ -454,11 +459,13 @@ def per_schedule(self, request_queue_idx): if not request_list: return False - print(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}", flush=True) + logger.info(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}") for req in request_list: req.set_start(self.current_time()) - print(f"[Request-{req.id} issue] partition: {req.request_queue_idx} " - f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}", flush=True) + logger.info( + f"[Request-{req.id} issue] partition: {req.request_queue_idx} " + f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}" + ) # Submit batched request self.execution_engine.submit(request_list, request_queue_idx) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 672ae6ec..6ed679d6 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -17,7 +17,46 @@ from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs from PyTorchSimFrontend import extension_config -print_lock = threading.Lock() +# Configure logger for Simulator module +logger = extension_config.setup_logger() +from tqdm import tqdm + + +class ProgressBar: + def __init__(self, desc, silent_mode=False, update_interval=0.5): + self.desc = desc + self.silent_mode = silent_mode + self.update_interval = update_interval + self.pbar = None + self.finished = False + 
self.progress_thread = None + + def __enter__(self): + if not self.silent_mode: + self.pbar = tqdm( + desc=self.desc, + bar_format='{desc}: {elapsed}', + leave=False, # Don't leave the bar when done (it will disappear) + ncols=80, + disable=False, + total=100, # Use a total for smooth animation + ) + # Update progress bar in a separate thread + def update_progress(): + while not self.finished: + self.pbar.update(1) + time.sleep(self.update_interval) + + self.progress_thread = threading.Thread(target=update_progress, daemon=True) + self.progress_thread.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.finished = True + if not self.silent_mode and self.pbar is not None: + self.pbar.close() + return False + TORCH_TO_NUMPY = { torch.float32: np.float32, @@ -105,9 +144,9 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' - if not silent_mode and extension_config.CONFIG_DEBUG_MODE: - print("[Spike] cmd> ", run) - print("[Spike] Running Spike simulator") + if not silent_mode: + logger.debug(f"[Spike] cmd> {run}") + logger.info("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None @@ -115,7 +154,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: if not silent_mode: - print("[Spike] Command failed with exit code", e.returncode) + logger.error(f"[Spike] Command failed with exit code {e.returncode}") error_msg = "" if e.returncode == 200: error_msg = "INVALID_SPAD_ACCESS" @@ 
-155,41 +194,23 @@ def __init__(self) -> None: pass def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silent_mode=False): - def show_progress(): - i = 0 - while not finished: - i = (i + 1) % 3 - tail = "." * i + " " * (3-i) - with print_lock: - sys.stdout.write("\r[Gem5] Gem5 is running." + tail) - sys.stdout.flush() - time.sleep(1) - with print_lock: - print("") - dir_path = os.path.join(os.path.dirname(target_binary), "m5out") gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py") gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, gem5_script_path, "-c", target_binary, "--vlane", str(vectorlane_size)] + + is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode + + if not is_dryrun: + logger.debug(f"[Gem5] cmd> {' '.join(gem5_cmd)}") + logger.info("[Gem5] Gem5 simulation started") + try: - # Create progress thread - is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode - if not is_dryrun: - if extension_config.CONFIG_DEBUG_MODE: - print("[Gem5] cmd> ", " ".join(gem5_cmd)) - finished = False - progress_thread = threading.Thread(target=show_progress) - progress_thread.start() - output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) - finished = True - progress_thread.join() - else: - output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) + #with ProgressBar("[Gem5] Running simulation", silent_mode=is_dryrun): + output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: - print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"") - if not is_dryrun: - finished = True - progress_thread.join() - raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"") + output_error = e.output.decode() if isinstance(e.output, bytes) else str(e.output) + logger.error(f"[Gem5] Gem5 simulation failed with 
error: \"{output_error}\"") + raise RuntimeError(f"Gem5 Simulation Failed: \"{output_error}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: raw_list = stat_file.readlines() @@ -216,39 +237,21 @@ def get_togsim_command(self): return cmd def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False): - def show_progress(): - i = 0 - while not finished: - i = (i + 1) % 3 - tail = "." * i + " " * (3-i) - sys.stdout.write("\r[TOGSim] TOGSim is running." + tail) - time.sleep(1) - print("") cmd = f"{self.get_togsim_command()} --models_list {model_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" if attribute_path: cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode and extension_config.CONFIG_DEBUG_MODE: - print("[TOGSim] cmd> ", cmd) - - # Create progress thread if not silent_mode: - finished = False - progress_thread = threading.Thread(target=show_progress) - progress_thread.start() + logger.debug(f"[TOGSim] cmd> {cmd}") + logger.info("[TOGSim] TOGSim simulation started") + try: - result = subprocess.check_output(shlex.split(cmd)) - if not silent_mode: - finished = True - progress_thread.join() + with ProgressBar("[TOGSim] Running simulation", silent_mode=silent_mode): + result = subprocess.check_output(shlex.split(cmd)) except subprocess.CalledProcessError as e: - if not silent_mode: - finished = True - progress_thread.join() - with print_lock: - print("[TOGSim] Command failed with exit code", e.returncode) - print("[TOGSim] Error output:", e.output) + logger.error(f"[TOGSim] Command failed with exit code {e.returncode}") + logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") assert 0 # Separate Autotune logs @@ -271,10 +274,10 @@ def show_progress(): f.flush() os.fsync(f.fileno()) - if not silent_mode or extension_config.CONFIG_DEBUG_MODE: - model_path_log = f' of "{model_path}" ' if 
extension_config.CONFIG_DEBUG_MODE else " " - with print_lock: - print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') + if not silent_mode: + import logging as _logging + model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " " + logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') return result_path def interactive_simulation(self): @@ -282,8 +285,7 @@ def interactive_simulation(self): if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - if extension_config.CONFIG_DEBUG_MODE: - print("[TOGSim] cmd> ", cmd) + logger.debug(f"[TOGSim] cmd> {cmd}") if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), @@ -292,28 +294,27 @@ def interactive_simulation(self): universal_newlines=True ) else: - print("[TOGSim] Simulator is already running.") + logger.warning("[TOGSim] Simulator is already running.") def stop(self): if self.process: self.process.terminate() self.process.wait() self.process = None - print("[TOGSim] Simulator stopped.") + logger.info("[TOGSim] Simulator stopped.") def wait(self): if self.process: - print("[TOGSim] Waiting for simulation to complete...") + logger.info("[TOGSim] Waiting for simulation to complete...") self.quit() self.process.wait() self.process = None - print("[TOGSim] Simulation completed.") + logger.info("[TOGSim] Simulation completed.") def send_command(self, command): if self.process: try: - if extension_config.CONFIG_TORCHSIM_DEBUG_MODE: - print(command, flush=True) + logger.debug(command) self.process.stdin.write(command + '\n') self.process.stdin.flush() ret = self.process.stderr.readline().strip() @@ -321,11 +322,11 @@ def send_command(self, command): except BrokenPipeError: err = self.process.stderr.readlines() for line in err: - print(line) + logger.error(line.strip()) self.process = None exit(1) else: - print("Simulator is not running.") + 
logger.warning("Simulator is not running.") return None def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0): @@ -440,7 +441,7 @@ def get_result_from_file(result_path): break if simulation_finished_idx == -1: - print(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). The file may be improperly formatted.") + logger.warning(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). The file may be improperly formatted.") return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time total_stat_lines = lines[simulation_finished_idx:] From 75207a45ad3940834aa4c20dac043b12a6f9bb95 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 9 Jan 2026 11:11:45 +0000 Subject: [PATCH 062/194] [Test] Wrap softmax module --- tests/test_softmax.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index e6e8cc1e..005c3ed2 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -42,8 +42,17 @@ def test_softmax(device, size=(128, 128), dim=1): #cpu_y = softmax3(x2, cpu_max, cpu_sum) #test_result("Softmax", y, cpu_y) - opt_fn = torch.compile(dynamic=False)(torch.nn.functional.softmax) - y = opt_fn(x1, dim=dim) + class SoftmaxModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + return torch.nn.functional.softmax(x, dim=self.dim) + + softmax_module = SoftmaxModule(dim=dim).to(device) + opt_fn = torch.compile(dynamic=False)(softmax_module) + y = opt_fn(x1) cpu_y = torch.nn.functional.softmax(x2, dim=dim) test_result("Softmax", y, cpu_y) From 8df5fef0291444c0f2feaa929983f4a5ca011c2b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 9 Jan 2026 11:48:48 +0000 Subject: [PATCH 063/194] [Log] Add progress bar for auto-tuning --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 ++++++++-- Simulator/simulator.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index d0c8f815..28605e33 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -31,6 +31,8 @@ # Configure logger for mlir_codegen_backend module logger = extension_config.setup_logger() +from Simulator.simulator import ProgressBar + def reduction_init(reduction_type, dtype): if dtype in cpp.DTYPE_LOWP_FP: # Since load promotes all half-precision inputs to float, the initial @@ -983,8 +985,12 @@ def get_cycle(choice): return [None, None, None] # Get cycle time for each choice - with ThreadPoolExecutor(max_workers=8) as executor: - results = list(executor.map(get_cycle, choices)) + # Show progress bar only when CONFIG_DEBUG_MODE is off + show_progress = not extension_config.CONFIG_DEBUG_MODE + with ProgressBar("[Auto-tune] Running benchmarks", silent_mode=not show_progress) if show_progress else contextlib.nullcontext(): + with ThreadPoolExecutor(max_workers=8) as executor: + results = list(executor.map(get_cycle, choices)) + min_idx = results.index(min(results)) if min(results) == float("inf"): raise RuntimeError("Failed to find optimal tile size...") diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 6ed679d6..7a4f7e0d 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -209,7 +209,7 @@ def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silen output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL) except subprocess.CalledProcessError as e: output_error = e.output.decode() if isinstance(e.output, bytes) else str(e.output) - logger.error(f"[Gem5] Gem5 simulation failed with error: \"{output_error}\"") + logger.debug(f"[Gem5] Gem5 simulation failed with error: \"{output_error}\"") raise RuntimeError(f"Gem5 Simulation Failed: \"{output_error}\"") with open(f"{dir_path}/stats.txt", "r") as stat_file: From 
d7c16b17c0aa082cb7c69b98b157ab66081809a4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 9 Jan 2026 13:41:01 +0000 Subject: [PATCH 064/194] [Test/MoE] Disable compiling sparse dispatcher --- PyTorchSimFrontend/mlir/mlir_ops.py | 6 ++++++ tests/MoE/test_moe.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index dce59ed6..74629b00 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -943,6 +943,12 @@ def square(operand, *args, **kwargs): result = ops.mul(operand, operand) return result, V.kernel.var_info[result] + @staticmethod + def fma(operand1, operand2, operand3, *args, **kwargs): + result = ops.mul(operand1, operand2) + result = ops.add(result, operand3) + return result, V.kernel.var_info[result] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # PyTorchSim specific operations diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index ae16f0b0..1030e59f 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -4,7 +4,6 @@ import copy import matplotlib.pyplot as plt - import torch import torch.nn as nn from torch.distributions.normal import Normal @@ -64,6 +63,7 @@ class SparseDispatcher(object): `Tensor`s for expert i only the batch elements for which `gates[b, i] > 0`. 
""" + @torch.compiler.disable(recursive=True) def __init__(self, num_experts, gates): """Create a SparseDispatcher.""" gates = gates.cpu() From c88cabceff908be57649357d8be20055036c9c0d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 12 Jan 2026 03:06:03 +0000 Subject: [PATCH 065/194] [Fix] Support identity in the dram_stride extraction --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 28605e33..e0a7d949 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1179,7 +1179,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1 for i in range(max_dim): target_dim = f"index{i}" - if target_dim not in str(index): + if sympy.Symbol(target_dim) not in index.free_symbols: dram_dict[target_dim] = [0] sorted_keys = sorted(dram_dict.keys()) dram_stride = sum((dram_dict[key] for key in sorted_keys), []) From 67612bb823be2992eaac36d7c9ddbbc24c017335 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 12 Jan 2026 03:39:17 +0000 Subject: [PATCH 066/194] [Fix] index to float casting --- PyTorchSimFrontend/mlir/mlir_ops.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 74629b00..59a6be78 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -226,11 +226,25 @@ def binary_elementwise_common(operand1, operand2): if op_type1[1] != op_type2[1]: if op_type1[1] == "index" or op_type1 == "index": if op_type1[1] == "index": - operand1 = ops.index_cast(operand1, op_type2[1]) - op_type1 = V.kernel.var_info[operand1] + # index -> target type: 2-step casting if target is float + if 
op_type2[1][0] == "f": + operand1 = ops.index_cast(operand1, "i64") + operand1 = ops.to_dtype(operand1, op_type2[1]) + op_type1 = V.kernel.var_info[operand1] + else: + # index -> integer: direct casting + operand1 = ops.index_cast(operand1, op_type2[1]) + op_type1 = V.kernel.var_info[operand1] if op_type2[1] == "index": - operand2 = ops.index_cast(operand2, op_type1[1]) - op_type2 = V.kernel.var_info[operand2] + # index -> target type: 2-step casting if target is float + if op_type1[1][0] == "f": + operand2 = ops.index_cast(operand2, "i64") + operand2 = ops.to_dtype(operand2, op_type1[1]) + op_type2 = V.kernel.var_info[operand2] + else: + # index -> integer: direct casting + operand2 = ops.index_cast(operand2, op_type1[1]) + op_type2 = V.kernel.var_info[operand2] elif op_type1[1][0] == "i" and op_type2[1][0] == "f": operand1 = ops.to_dtype(operand1, op_type2[1]) op_type1 = V.kernel.var_info[operand1] From 50ceb5848baaa895230de9fb1cbe1f2e8ed44860 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 12 Jan 2026 12:22:10 +0000 Subject: [PATCH 067/194] [Fix] Change vlane_split_axis in case of group-dim --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 9 +++++++-- tests/Diffusion/test_diffusion.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index e0a7d949..e5a1a273 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1196,14 +1196,19 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe dim_idx = int((str(sub.args[0])[5:])) if int(self.kernel_group.tile_desc.get_tile_size()[dim_idx] % sub.args[1]) != 0: # In this case, need to recompile - original_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx] - divisor = sub.args[1] + original_tile = self.kernel_group.tile_desc.get_tile_size() + original_size = original_tile[dim_idx] + 
divisor = sub.args[1] * self.kernel_group.tile_desc.vmap.vlane_stride new_size = ((original_size + divisor - 1) // divisor) * divisor new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) new_tile_sizes[dim_idx] = new_size self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True + # Can't use dim_idx as vlane_split_axis + if dim_idx == self.kernel_group.tile_desc.vmap.vlane_split_axis: + self.kernel_group.tile_desc.vmap.vlane_split_axis = (dim_idx + 1) % len(original_tile) + # Send recompile signal self.reset("recompile") raise mlir_common.RecompileSignal(f"Tile size {self.kernel_group.tile_desc.get_tile_size()[dim_idx]} is not divisible by {sub.args[1]}") diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index c5170209..d6d740fe 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -557,14 +557,14 @@ def test_upsample2d( module = PyTorchSimRunner.setup_device() device = module.custom_device() - #test_upsample2d(device) - #test_groupnorm(device) - #test_groupnorm(device, stride=[1, 1, 320*32, 320]) - #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320) - #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280) - #test_cross_attn_down_block2d(device) - #test_unet_mid_block2d_cross_attn(device) - #test_cross_attn_up_block2d(device) + test_upsample2d(device) + test_groupnorm(device) + test_groupnorm(device, stride=[1, 1, 320*32, 320]) + test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320) + test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280) + test_cross_attn_down_block2d(device) + test_unet_mid_block2d_cross_attn(device) + test_cross_attn_up_block2d(device) test_unet2d_condition_model(device) #test_unet_conditional( # device=device, From 319fd6cd6b98793573000bac138e976bae8cf22d Mon Sep 17 00:00:00 2001 From: 
Wonhyuk Yang Date: Tue, 13 Jan 2026 06:54:16 +0000 Subject: [PATCH 068/194] [Frontend] Fix any operation codegen --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +- PyTorchSimFrontend/mlir/mlir_ops.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index e5a1a273..87c6a628 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -60,7 +60,7 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value): if reduction_type == "min": return ops.minimum(vector_value, init_value) if reduction_type == "any": - return ops.logical_and(vector_value, init_value) + return ops.logical_or(vector_value, init_value) raise AssertionError(reduction_type) class ExtensionWrapperCodegen(wrapper.PythonWrapperCodegen): diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 59a6be78..c3d3952e 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -18,7 +18,7 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, if reduction_type == "min": return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" if reduction_type == "any": - return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" + return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" raise AssertionError(reduction_type) class ExtensionOverrides(common.OpOverrides): @@ -995,10 +995,10 @@ def to_bool(operand, *args, **kwargs): if ret_type == "i1": return operand, [tile_size, ret_type] - const_one = ops.constant(0, ret_type) + const_zero = ops.constant(0, ret_type) if tile_size > 1: - const_one = ops.broadcast(const_one, tile_size) - ret = ops.ne(operand, const_one) + const_zero = 
ops.broadcast(const_zero, tile_size) + ret = ops.ne(operand, const_zero) return ret, [tile_size, "i1"] @staticmethod def step(size, dtype, *args, **kwargs): From c223258d091ed4fe928ea11437a838b1e4de69d9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 13 Jan 2026 06:55:25 +0000 Subject: [PATCH 069/194] [Decompose] Use F.softmax for decomposed SDPA --- PyTorchSimFrontend/mlir/mlir_decomposition.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py index 141fa9e4..284d25d7 100644 --- a/PyTorchSimFrontend/mlir/mlir_decomposition.py +++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py @@ -137,15 +137,13 @@ def decompose_native_multi_head_attention( # Step 4: Apply mask if provided if mask is not None: - scores = scores + mask + if mask.dtype == torch.bool: + attn_bias.masked_fill_(mask.logical_not(), float("-inf")) + else: + attn_bias = mask + attn_bias # Step 5: Softmax along the last dimension (seq_len dimension) - # Stable softmax: subtract max, exp, divide by sum - scores_max = scores.amax(dim=-1, keepdim=True) # [batch, num_heads, seq_len, 1] - scores_shifted = scores - scores_max - scores_exp = scores_shifted.exp() - scores_sum = scores_exp.sum(dim=-1, keepdim=True) # [batch, num_heads, seq_len, 1] - attn_weights = scores_exp / scores_sum # [batch, num_heads, seq_len, seq_len] + attn_weights = F.softmax(scores, dim=-1) # [batch, num_heads, seq_len, seq_len] # Step 6: Attention @ V # [batch, num_heads, seq_len, seq_len] @ [batch, num_heads, seq_len, head_dim] From 07be94b0d47cd61a0170f360110fe440296b43c9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 13 Jan 2026 09:41:00 +0000 Subject: [PATCH 070/194] [Frontend] Add recompiliation for ModularIndexing --- .../mlir/mlir_codegen_backend.py | 61 ++++++++++++++++--- PyTorchSimFrontend/mlir/mlir_common.py | 17 +++++- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git 
a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 87c6a628..3d65c0a4 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -20,7 +20,7 @@ is_welford_reduction, sympy_product ) -from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Identity +from torch.utils._sympy.functions import ModularIndexing, FloorDiv from PyTorchSimFrontend import extension_codecache from PyTorchSimFrontend import extension_config from . import mlir_common @@ -365,13 +365,6 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com if len(expr.args) == 0 and len(indirect_dims) == 0: return expr - # Replace Identity arguments with Identity.args[0] - for arg in expr.args: - if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity): - expr = expr.replace(arg.args[1], arg.args[1].args[0]) - if isinstance(arg, Identity): - expr = expr.replace(arg, arg.args[0] if arg.args else arg) - if len(expr.args) == 0: args = [expr] else: @@ -784,6 +777,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): return accum def index_expr(self, index, dtype): + index = self.rename_indexing(index) base_tile_desc = self.kernel_group.tile_desc if len(self.ranges) != self.reduction_depth: # FIXME. 
This is a temporary solution to get tile stride of the reduction case @@ -1224,6 +1218,57 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe local_tile_desc.apply_divisor(dim_idx+offset, divisor, "split") offset = offset+1 + # Support ModularIndexing pattern + # This pattern can be used to broadcast ex) torch.cat([a,a]) + # ModularIndexing(x, y, z) means (x // y) % z + # tile_size must be: multiple of y (floorDiv divisor) and divisor of z (modular divisor) + if index.has(ModularIndexing): + for sub in sympy.preorder_traversal(index): + if isinstance(sub, ModularIndexing): + if not str(sub.args[0]).startswith("index"): + continue + dim_idx = int((str(sub.args[0])[5:])) + floor_divisor = sub.args[1] # y: floorDiv divisor + mod_divisor = sub.args[2] # z: modular divisor + current_tile_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx] + + # Check if tile_size is multiple of floorDiv divisor + if int(current_tile_size % floor_divisor) != 0: + original_tile = self.kernel_group.tile_desc.get_tile_size() + original_size = original_tile[dim_idx] + divisor = floor_divisor * self.kernel_group.tile_desc.vmap.vlane_stride + new_size = ((original_size + divisor - 1) // divisor) * divisor + new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) + new_tile_sizes[dim_idx] = new_size + self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) + self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True + + self.reset("recompile") + raise mlir_common.RecompileSignal(f"Tile size {current_tile_size} is not a multiple of floorDiv divisor {floor_divisor} in ModularIndexing") + + # Check if tile_size is a divisor of modular divisor + if int((mod_divisor * floor_divisor) % current_tile_size) != 0: + original_tile = self.kernel_group.tile_desc.get_tile_size() + original_size = original_tile[dim_idx] + # Find the largest divisor of mod_divisor that is <= original_size + # and is a multiple of floor_divisor + new_size = 
original_size + while new_size > 0: + if mod_divisor % new_size == 0 and new_size % floor_divisor == 0: + break + new_size -= floor_divisor + + if new_size <= 0: + new_size = mod_divisor * floor_divisor + + new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size()) + new_tile_sizes[dim_idx] = new_size + self.kernel_group.tile_desc.set_tile_size(new_tile_sizes) + self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True + + self.reset("recompile") + raise mlir_common.RecompileSignal(f"Tile size {current_tile_size} is not a divisor of modular divisor {mod_divisor} in ModularIndexing") + # FIXME. It will be nice to modify node instead of this exception handling... if len(self.itervars) == 1 and self.reduction_depth == 0: # In case of reduction loop only case, we will add dummy loop so shift it once diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index d96eb452..e31555ba 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -15,7 +15,7 @@ from torch._inductor.ir import MultiOutputLayout from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep from torch._inductor.codegen.wrapper import KernelDefinitionLine -from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod +from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod, Identity import sympy import contextlib @@ -838,6 +838,21 @@ def rename_indexing(self, index) -> sympy.Expr: # and renames variables in index expressions to kernel arg names if isinstance(index, (list, tuple)): return [self.rename_indexing(x) for x in index] + + # FIXME. This is a temporary solution to remove Identity wrappers from index expression. 
+ # Remove Identity wrappers from index expression + # Check if index itself is Identity + if isinstance(index, Identity): + index = index.args[0] if index.args else index + + # Replace Identity arguments with Identity.args[0] + if hasattr(index, 'args') and len(index.args) > 0: + for arg in index.args: + if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity): + index = index.replace(arg.args[1], arg.args[1].args[0] if arg.args[1].args else arg.args[1]) + if isinstance(arg, Identity): + index = index.replace(arg, arg.args[0] if arg.args else arg) + index = V.graph.sizevars.simplify(index) sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) replacements = { From e999bfc34b8527ea8253339eea9556c684758d65 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 13 Jan 2026 09:57:11 +0000 Subject: [PATCH 071/194] [Test] Fix minor bugs in the test folder --- tests/Llama/test_llama.py | 19 ++++++++++--------- tests/MoE/test_moe.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py index 443f3fc2..889e5fa8 100644 --- a/tests/Llama/test_llama.py +++ b/tests/Llama/test_llama.py @@ -101,7 +101,8 @@ def run_rotary_embedding_test( vocab_size=8192, _attn_implementation = "sdpa" ) - base_rope = LlamaRotaryEmbedding(cfg) + # Pass dim explicitly to avoid config parsing issues + base_rope = LlamaRotaryEmbedding(dim=head_dim, max_position_embeddings=cfg.max_position_embeddings, base=cfg.rope_theta, config=cfg) cpu_rope = copy.deepcopy(base_rope) @@ -375,14 +376,14 @@ def run_llama_model_test( torch.compiler.is_compiling = lambda: True # FIXME. How to fix this? 
#run_rmsnorm_test(device) #run_rotary_embedding_test(device) - #run_decoder_layer_test( - # device=device, - # batch=args.batch, - # seq_len=args.seq_len, - # dtype=args.dtype, - # rtol=args.rtol, - # atol=args.atol, - #) + run_decoder_layer_test( + device=device, + batch=args.batch, + seq_len=args.seq_len, + dtype=args.dtype, + rtol=args.rtol, + atol=args.atol, + ) run_llama_model_test(device) #run_custom_llama_test( # device=device, diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index 1030e59f..9ebfb11e 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -16,6 +16,32 @@ sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) +# FIXME. This is a temporary solution to avoid is_forward conflict during backward +def patch_compile_event_logger(): + """Patch CompileEventLogger.compilation_metric to avoid is_forward conflict during backward.""" + from torch._dynamo.utils import CompileEventLogger + from torch._dynamo.utils import get_metrics_context + + original_compilation_metric = CompileEventLogger.compilation_metric + + @staticmethod + def patched_compilation_metric(is_forward=True, **kwargs): + """Patched version that clears is_forward before setting it if there's a conflict.""" + try: + metrics_context = get_metrics_context() + if metrics_context.in_progress() and hasattr(metrics_context, '_metrics'): + # If is_forward is already set and we're trying to set it to a different value, clear it first + current_is_forward = metrics_context._metrics.get('is_forward') + if current_is_forward is not None and current_is_forward != is_forward: + metrics_context._metrics.pop('is_forward', None) + except: + pass + # Call the original function + return original_compilation_metric(is_forward=is_forward, **kwargs) + + # Patch the method + CompileEventLogger.compilation_metric = patched_compilation_metric + def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): pass_message = f"|{name} Test Passed|" fail_message = f"|{name} 
Test Failed|" @@ -469,6 +495,9 @@ def test_moe(device): print("\n") def train_moe(device): + # Patch CompileEventLogger to avoid metric conflicts + patch_compile_event_logger() + def perceptron(a, b, c): return a * b + c @@ -589,6 +618,9 @@ def weight_update(a, b, lr): plt.savefig('result.png') def train_moe_mnist(device): + # Patch CompileEventLogger to avoid metric conflicts + patch_compile_event_logger() + torch.manual_seed(0) batch_size = 32 input_size = 28*28 @@ -670,6 +702,9 @@ def train(model, device, train_loader, optimizer, epochs): plt.savefig(f'{name}_result.png') def train_moe_single_iteration(device, iter_idx, is_evaluation=0): + # Patch CompileEventLogger to avoid metric conflicts + patch_compile_event_logger() + # Training moe with mnist dataset for sinlge iteration torch.manual_seed(0) batch_size = 128 From d747e7ee7e505f74d7abb0296e098567b951cb47 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 13 Jan 2026 10:42:18 +0000 Subject: [PATCH 072/194] [Log] Add progress bar in spike simulation --- Simulator/simulator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 7a4f7e0d..96a1fc86 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -151,7 +151,8 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= try: stdout_setting = subprocess.DEVNULL if silent_mode else None stderr_setting = subprocess.DEVNULL if silent_mode else None - subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) + with ProgressBar("[Spike] Running simulation", silent_mode=silent_mode): + subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting) except subprocess.CalledProcessError as e: if not silent_mode: logger.error(f"[Spike] Command failed with exit code {e.returncode}") From b49b6795d92088489fc0a5fb685c35307ae968b6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 07:28:24 +0000 Subject: [PATCH 
073/194] [Fix] Use extraction for vlane_offset + Register extract op --- .../mlir/mlir_codegen_backend.py | 13 +- PyTorchSimFrontend/mlir/mlir_ops.py | 280 +++++++++++++----- 2 files changed, 206 insertions(+), 87 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 3d65c0a4..912c618a 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -534,8 +534,8 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs) value = ops.to_dtype(value, mlir_dtype) if compute_vec_size < self.var_info[value][0]: - value = self.cse.generate(self.stores, f"vector.extract_strided_slice %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}") - self.register_var_info(value, [compute_vec_size, mlir_dtype]) + with self.override_buffer_cse(buffer=self.stores): + value = ops.extract_strided_slice(value, compute_vec_size) with self.override_buffer_cse(buffer=self.stores): ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name) @@ -729,9 +729,11 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): outer_dim = ops.remainder(ops.truncdiv(dim, vlane_stride_vec), vlane_outer_vec) dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec)) - vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset") - self.register_var_info(vlane_offset, [vlane_vec_size, "i64"]) - vlane_offset = ops.index_cast(vlane_offset, "index") + with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): + vlane_offset = ops.vlane_offset(vlane_vec, vlane_vec, attributes={"vlane_offset": offset}, comment="vlane offset") + if compute_vec_size < self.var_info[vlane_offset][0]: + 
vlane_offset = ops.extract_strided_slice(vlane_offset, compute_vec_size) + vlane_offset = ops.index_cast(vlane_offset, "index") dim = ops.add(dim, vlane_offset) dim_list.append(dim) @@ -795,7 +797,6 @@ def index_expr(self, index, dtype): tile_desc = base_tile_desc compute_vec_size = tile_desc.get_compute_vec_size() - tile_shape = f"memref<{compute_vec_size*self.vector_lane}xindex, 1>" vshape = f"vector<{compute_vec_size}xindex>" diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index c3d3952e..4cf031d2 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -21,6 +21,35 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, return f"vector.multi_reduction , %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}" raise AssertionError(reduction_type) +def format_mlir_op(op_str, shape, **kwargs): + """ + Format MLIR operation string with optional attributes and comment. + + Args: + op_str: Base operation string (e.g., "arith.addi %0, %1") + shape: Type shape string (e.g., "vector<4xi64>" or "i64") + **kwargs: May contain 'attributes' (dict or str) and 'comment' (str) + + Returns: + Formatted MLIR operation string + """ + result = op_str + attributes = kwargs.get('attributes', None) + comment = kwargs.get('comment', None) + + if attributes: + if isinstance(attributes, dict): + # Format: { key1=value1, key2=value2 } + attrs_str = ", ".join(f"{k}={v}" for k, v in attributes.items()) + result += f" {{ {attrs_str} }}" + elif isinstance(attributes, str): + # Direct string format + result += f" {{ {attributes} }}" + result += f" : {shape}" + if comment: + result += f" // {comment}" + return result + class ExtensionOverrides(common.OpOverrides): @staticmethod def constant(value, src_type, *args, **kwargs): @@ -36,8 +65,8 @@ def constant(value, src_type, *args, **kwargs): elif src_type[0] == "f": value = format(float(value), ".20f") elif src_type[0] == "i": - 
value = int(float(value)) - return f'arith.constant {value} : {src_type}', [1, src_type] + value = int(float(value)) + return format_mlir_op(f'arith.constant {value}', src_type, **kwargs), [1, src_type] @staticmethod def broadcast(operand, target_size, *args, **kwargs): @@ -54,16 +83,18 @@ def broadcast(operand, target_size, *args, **kwargs): outer_dim = target_size // src_size unflat_shape = f"vector<{outer_dim}x{src_size}x{dtype}>" # Flatten back to 1D - op_str = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {dst_shape}" + op_str = f"vector.shape_cast %{unflat_operand}" + shape = f"{unflat_shape} to {dst_shape}" else: raise NotImplementedError( f"Vector broadcast size mismatch: src={src_size} cannot broadcast to target={target_size}" ) elif src_size == 1: - op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}" + op_str = f"vector.broadcast %{operand}" + shape = f"{src_shape} to {dst_shape}" else: raise ValueError(f"Invalid source size: {src_size}") - return op_str, [target_size, dtype] + return format_mlir_op(op_str, shape, **kwargs), [target_size, dtype] @staticmethod def broadcast_unflat(operand, target_size, *args, **kwargs): @@ -73,8 +104,9 @@ def broadcast_unflat(operand, target_size, *args, **kwargs): src_shape = f"vector<{src_size}x{dtype}>" dst_shape = f"vector<{outer_dim}x{src_size}x{dtype}>" - op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}" - return op_str, [target_size, dtype] + op_str = f"vector.broadcast %{operand}" + shape = f"{src_shape} to {dst_shape}" + return format_mlir_op(op_str, shape, **kwargs), [target_size, dtype] def load_seed(self, *args, **kwargs): raise NotImplementedError @@ -110,7 +142,10 @@ def where(condition, operand1, operand2, *args, **kwargs): tile_size, ret_type = V.kernel.var_info[operand1] shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else "" - return f"arith.select %{condition}, %{operand1}, 
%{operand2} : {cond_shape}, {shape}", [tile_size, ret_type] + + op_str = f"arith.select %{condition}, %{operand1}, %{operand2}" + shape = f"{cond_shape}, {shape}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def to_dtype(operand, dst_mlir_dtype, *args, **kwargs): @@ -157,7 +192,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs): op_str = f"arith.extsi %{operand} : {src_shape} to {shape}" elif dst_bits < src_bits: # Use arith.trunci for integer truncation - op_str = f"arith.trunci %{operand} : {src_shape} to {shape}" + op_str = f"arith.trunci %{operand} : {src_shape} to {shape}" else: return operand, [tile_size, dst_mlir_dtype] # Case D: Float -> Float (Extension / Truncation) @@ -166,7 +201,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs): op_str = f"arith.extf %{operand} : {src_shape} to {shape}" elif dst_bits < src_bits: # Corrected 'trunf' to 'truncf' - op_str = f"arith.truncf %{operand} : {src_shape} to {shape}" + op_str = f"arith.truncf %{operand} : {src_shape} to {shape}" else: return operand, [tile_size, dst_mlir_dtype] else: @@ -200,7 +235,9 @@ def to_dtype_bitcast(operand, dtype, *args, **kwargs): src_shape = f"vector<{tile_size}x{current_src_type}>" if tile_size > 1 else current_src_type dst_shape = f"vector<{tile_size}x{dst_mlir_type}>" if tile_size > 1 else dst_mlir_type - return f"arith.bitcast %{operand} : {src_shape} to {dst_shape}", [tile_size, dst_mlir_type] + op_str = f"arith.bitcast %{operand}" + shape = f"{src_shape} to {dst_shape}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dst_mlir_type] # Binary element wise operations @staticmethod @@ -283,7 +320,7 @@ def exp(operand, *args, **kwargs): tile_size = op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.exp %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.exp %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def 
exp2(operand, *args, **kwargs): @@ -315,7 +352,7 @@ def sqrt(operand, *args, **kwargs): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.sqrt %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.sqrt %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def relu(operand, *args, **kwargs): @@ -331,7 +368,8 @@ def minimum(operand1, operand2, *args, **kwargs): opcode = f'arith.minimumf' else: opcode = f'arith.minsi' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def maximum(operand1, operand2, *args, **kwargs): @@ -341,7 +379,8 @@ def maximum(operand1, operand2, *args, **kwargs): opcode = f'arith.maximumf' else: opcode = f'arith.maxsi' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def cos(operand, *args, **kwargs): @@ -362,7 +401,7 @@ def cos(operand, *args, **kwargs): if dtype.startswith("f"): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.cos %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.cos %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def sin(operand, *args, **kwargs): @@ -383,7 +422,7 @@ def sin(operand, *args, **kwargs): if dtype.startswith("f"): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.sin %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.sin %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def tan(operand, *args, **kwargs): @@ -409,7 +448,7 @@ def erf(operand, *args, **kwargs): tile_size = 
op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.erf %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.erf %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def cosh(operand, *args, **kwargs): @@ -438,7 +477,7 @@ def tanh(operand, *args, **kwargs): if dtype.startswith("f"): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.tanh %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.tanh %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def acos(operand, *args, **kwargs): @@ -491,11 +530,11 @@ def hypot(operand1, operand2, *args, **kwargs): @staticmethod def log10(operand, *args, **kwargs): val_ln = ops.log(operand) - + tile_size, dtype = V.kernel.var_info[val_ln] inv_ln10 = 1/math.log(10) const_op = ops.constant(inv_ln10, dtype) - + # Multiply: ln(x) * (1/ln(10)) result = ops.mul(val_ln, const_op) return result, V.kernel.var_info[result] @@ -503,11 +542,10 @@ def log10(operand, *args, **kwargs): @staticmethod def log2(operand, *args, **kwargs): val_ln = ops.log(operand) - tile_size, dtype = V.kernel.var_info[val_ln] inv_ln10 = 1/math.log(2) const_op = ops.constant(inv_ln10, dtype) - + # Multiply: ln(x) * (1/ln(10)) result = ops.mul(val_ln, const_op) return result, V.kernel.var_info[result] @@ -523,7 +561,7 @@ def log(operand, *args, **kwargs): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.log %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.log %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def log1p(operand, *args, **kwargs): @@ -542,7 +580,6 @@ def nextafter(operand1, operand2, *args, **kwargs): def logical_and(operand1, operand2, *args, **kwargs): if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if 
V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.and_(operand1, operand2) @@ -552,7 +589,6 @@ def logical_and(operand1, operand2, *args, **kwargs): def logical_or(operand1, operand2, *args, **kwargs): if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.or_(operand1, operand2) @@ -562,18 +598,16 @@ def logical_or(operand1, operand2, *args, **kwargs): def logical_xor(operand1, operand2, *args, **kwargs): if V.kernel.var_info[operand1][1] != "i1": operand1 = ops.to_bool(operand1) - if V.kernel.var_info[operand2][1] != "i1": operand2 = ops.to_bool(operand2) result = ops.xor(operand1, operand2) return result, V.kernel.var_info[result] - + @staticmethod def logical_not(operand, *args, **kwargs): op_info = V.kernel.var_info[operand] tile_size = op_info[0] dtype = op_info[1] - zero_const = ops.constant(0, dtype) result = ops.eq(operand, zero_const) return result, V.kernel.var_info[result] @@ -583,7 +617,6 @@ def bitwise_and(operand1, operand2, *args, **kwargs): # Float check if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") - result = ops.and_(operand1, operand2) return result, V.kernel.var_info[result] @@ -593,9 +626,8 @@ def bitwise_not(operand, *args, **kwargs): # Float check if V.kernel.var_info[operand][1].startswith("f"): raise ValueError("Bitwise NOT not supported for floats") - neg_one = ops.constant(-1, dtype) - result = ops.xor(operand, neg_one) + result = ops.xor(operand, neg_one) return result, V.kernel.var_info[result] @staticmethod @@ -603,7 +635,7 @@ def bitwise_or(operand1, operand2, *args, **kwargs): # Float check if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") - + result = ops.or_(operand1, 
operand2) return result, V.kernel.var_info[result] @@ -612,7 +644,6 @@ def bitwise_xor(operand1, operand2, *args, **kwargs): # Float check if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"): raise ValueError("Bitwise AND not supported for floats") - result = ops.xor(operand1, operand2) return result, V.kernel.var_info[result] @@ -635,7 +666,7 @@ def rsqrt(operand, *args, **kwargs): operand = ops.to_dtype(operand, "f32") shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(f'math.rsqrt %{operand}', shape, **kwargs), [tile_size, dtype] @staticmethod def sigmoid(operand, *args, **kwargs): @@ -663,7 +694,8 @@ def round(operand, *args, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): - return f"math.roundeven %{operand} : {shape}", [tile_size, dtype] + op_str = f"math.roundeven %{operand}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype] else: return operand, [tile_size, dtype] @@ -673,7 +705,8 @@ def floor(operand, *args, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): - return f"math.floor %{operand} : {shape}", [tile_size, dtype] + op_str = f"math.floor %{operand}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype] else: return operand, [tile_size, dtype] @@ -687,7 +720,8 @@ def trunc(operand, *args, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): - return f"math.trunc %{operand} : {shape}", [tile_size, dtype] + op_str = f"math.trunc %{operand}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype] else: return operand, [tile_size, dtype] @@ -697,7 +731,8 @@ def ceil(operand, *args, **kwargs): shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype if dtype.startswith("f"): - return 
f"math.ceil %{operand} : {shape}", [tile_size, dtype] + op_str = f"math.ceil %{operand}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype] else: return operand, [tile_size, dtype] @@ -711,19 +746,18 @@ def neg(operand, *args, **kwargs): # Type check & auto cast if dtype.startswith("f"): operand = ops.to_dtype(operand, "f32") - + op_str = f"arith.negf %{operand}" shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f'arith.negf %{operand} : {shape}', [tile_size, dtype] + return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype] @staticmethod def reciprocal(operand, *args, **kwargs): op_type = V.kernel.var_info[operand] - tile_size = op_type[0] - dtype = op_type[1] - - # Type check & auto cast - if dtype.startswith("f"): - operand = ops.to_dtype(operand, "f32") + tile_size, dtype = op_type[0], op_type[1] + if dtype.startswith("i"): + openand = ops.to_dtype(operand, "f32") + op_type = V.kernel.var_info[operand] + tile_size, dtype = op_type[0], op_type[1] return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype] @@ -739,8 +773,9 @@ def eq(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"] @staticmethod def ne(operand1, operand2, *args, **kwargs): @@ -754,8 +789,9 @@ def ne(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, 
**kwargs), [tile_size, "i1"] @staticmethod def lt(operand1, operand2, *args, **kwargs): @@ -769,8 +805,9 @@ def lt(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"] @staticmethod def gt(operand1, operand2, *args, **kwargs): @@ -784,8 +821,9 @@ def gt(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"] @staticmethod def le(operand1, operand2, *args, **kwargs): @@ -799,8 +837,9 @@ def le(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"] @staticmethod def ge(operand1, operand2, *args, **kwargs): @@ -814,29 +853,33 @@ def ge(operand1, operand2, *args, **kwargs): else: raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}") + op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}' shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"] + return format_mlir_op(op_str, shape, **kwargs), 
[tile_size, "i1"] @staticmethod def add(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.add{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def sub(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.sub{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def mul(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type opcode = f'arith.mul{ret_type[0]}' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def pow(operand1, operand2, *args, **kwargs): @@ -850,28 +893,32 @@ def pow(operand1, operand2, *args, **kwargs): operand2 = ops.to_dtype(operand2, "f32") shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type] + op_str = f"math.pow{ret_type[0]} %{operand1}, %{operand2}" + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def and_(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, 
operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) - + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.andi %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def or_(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) - + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.ori %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def xor(operand1, operand2, *args, **kwargs): tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) - + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type - return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.xori %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def lshift(operand1, operand2, *args, **kwargs): @@ -888,9 +935,10 @@ def truncdiv(operand1, operand2, *args, **kwargs): if ret_type.startswith("f"): raise ValueError("truncdiv is strictly for integers. 
Use truediv for floats.") - + # arith.divsi: Signed Integer Division (Result is truncated) - return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.divsi %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def floordiv(operand1, operand2, *args, **kwargs): @@ -902,7 +950,8 @@ def floordiv(operand1, operand2, *args, **kwargs): raise ValueError("floordiv implementation expects integers based on definition.") # arith.floordivsi: Floor Division for Signed Integers - return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.floordivsi %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def truediv(operand1, operand2, *args, **kwargs): @@ -912,7 +961,8 @@ def truediv(operand1, operand2, *args, **kwargs): if not ret_type.startswith("f"): raise ValueError(f"truediv expects float inputs, but got {ret_type}. 
Use int_truediv for integers.") - return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'arith.divf %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def int_truediv(operand1, operand2, *args, **kwargs): @@ -938,7 +988,8 @@ def mod(operand1, operand2, *args, **kwargs): raise NotImplementedError("Not support remainder operation for floating point") else: opcode = f'arith.remsi' - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def remainder(operand1, operand2, *args, **kwargs): @@ -950,7 +1001,8 @@ def remainder(operand1, operand2, *args, **kwargs): else: opcode = 'arith.remsi' # Signed Integer Remainder (LHS sign) - return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type] + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def square(operand, *args, **kwargs): @@ -964,7 +1016,7 @@ def fma(operand1, operand2, operand3, *args, **kwargs): return result, V.kernel.var_info[result] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # PyTorchSim specific operations + # PyTorchSim specific operations @staticmethod def alloc(size, src_type, *args, **kwargs): @@ -976,7 +1028,9 @@ def extractelement(operand, idx, *args, **kwargs): tile_size = op_type[0] dtype = op_type[1] shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype - return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype] + op_str = f"vector.extract %{operand}[{idx}]" + shape = f"{dtype} from {shape}" + return format_mlir_op(op_str, shape, **kwargs), [1, dtype] @staticmethod def ext(operand, dtype, *args, **kwargs): @@ -987,7 +1041,9 @@ def ext(operand, dtype, *args, **kwargs): opcode = f'arith.extf' else: 
opcode = f'arith.extui' - return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype] + op_str = f'{opcode} %{operand}' + shape = f"{shape} to {target_type}" + return format_mlir_op(op_str, shape, **kwargs), [op_type[0], dtype] @staticmethod def to_bool(operand, *args, **kwargs): @@ -1003,19 +1059,76 @@ def to_bool(operand, *args, **kwargs): @staticmethod def step(size, dtype, *args, **kwargs): index_shape = f"vector<{size}x{dtype}>" - return f"vector.step : {index_shape}", [size, dtype] + op_str = f"vector.step" + return format_mlir_op(op_str, index_shape, **kwargs), [size, dtype] @staticmethod - def index_cast(operand, target_type, *args, **kwrags): + def index_cast(operand, target_type, *args, **kwargs): op_type = V.kernel.var_info[operand] src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1] des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type - return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type] + op_str = f"arith.index_cast %{operand}" + shape = f"{src_shape} to {des_shape}" + return format_mlir_op(op_str, shape, **kwargs), [op_type[0], target_type] @staticmethod def shape_cast(operand, src_shape, dst_shape, *args, **kwargs): operand_type = V.kernel.var_info[operand] - return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type + op_str = f"vector.shape_cast %{operand}" + shape = f"{src_shape} to {dst_shape}" + return format_mlir_op(op_str, shape, **kwargs), operand_type + + @staticmethod + def extract_strided_slice(operand, target_size, offsets=None, sizes=None, strides=None, *args, **kwargs): + op_type = V.kernel.var_info[operand] + src_size = op_type[0] + dtype = op_type[1] + + if offsets is None: + offsets = [0] + if sizes is None: + sizes = [target_size] + if strides is None: + strides = [1] + + src_shape = f"vector<{src_size}x{dtype}>" + dst_shape = f"vector<{target_size}x{dtype}>" + + offsets_str = ", 
".join(str(o) for o in offsets) + sizes_str = ", ".join(str(s) for s in sizes) + strides_str = ", ".join(str(s) for s in strides) + + # Build attributes dict for offsets, sizes, strides + built_attributes = { + "offsets": f"[{offsets_str}]", + "sizes": f"[{sizes_str}]", + "strides": f"[{strides_str}]" + } + + # Merge with any existing attributes from kwargs + existing_attributes = kwargs.get('attributes', {}) + if isinstance(existing_attributes, dict): + merged_attributes = {**built_attributes, **existing_attributes} + elif isinstance(existing_attributes, str): + built_attrs_str = ", ".join(f"{k}={v}" for k, v in built_attributes.items()) + merged_attributes = f"{built_attrs_str}, {existing_attributes}" + else: + merged_attributes = built_attributes + + op_str = f"vector.extract_strided_slice %{operand}" + shape = f"{src_shape} to {dst_shape}" + + # Pass merged attributes to format_mlir_op + updated_kwargs = {**kwargs, 'attributes': merged_attributes} + return format_mlir_op(op_str, shape, **updated_kwargs), [target_size, dtype] + + @staticmethod + def vlane_offset(operand1, operand2, *args, **kwargs): + tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2) + shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type + opcode = f'arith.add{ret_type[0]}' + op_str = f'{opcode} %{operand1}, %{operand2}' + return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type] @staticmethod def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs): @@ -1034,12 +1147,14 @@ def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, ** if compute_vec_size == 1: vshape = f"{mlir_dtype}" operation = "affine.load" - line = f"{operation} %{buffer}[{indices}] : {buffer_shape}" + line = f"{operation} %{buffer}[{indices}]" + shape = buffer_shape else: vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" operation = "affine.vector_load" - line = 
f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}" - return line, [compute_vec_size, mlir_dtype] + line = f"{operation} %{buffer}[{indices}]" + shape = f"{buffer_shape}, {vshape}" + return format_mlir_op(line, shape, **kwargs), [compute_vec_size, mlir_dtype] @staticmethod def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kwargs): @@ -1048,11 +1163,14 @@ def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kw if compute_vec_size == 1: vshape = f"{mlir_dtype}" operation = "affine.store" - line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}" + line = f"{operation} %{operand}, %{buffer}[{indices}]" + shape = buffer_shape else: vshape = f"vector<{compute_vec_size}x{mlir_dtype}>" operation = "affine.vector_store" - line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}" + line = f"{operation} %{operand}, %{buffer}[{indices}]" + shape = f"{buffer_shape}, {vshape}" + line = format_mlir_op(line, shape, **kwargs) if buffer_name is not None: return common.DeferredLine(buffer_name, line), [None, None] From 729b999d37f563cdd51a1ef112965645b4ec8db9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 07:35:40 +0000 Subject: [PATCH 074/194] [Tests/Diffusion] Add embedding test case --- tests/Diffusion/test_diffusion.py | 122 +++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 18 deletions(-) diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index d6d740fe..082ed865 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -8,6 +8,7 @@ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel from diffusers.models.upsampling import Upsample2D from diffusers.models.resnet import ResnetBlock2D +from diffusers.models.embeddings import Timesteps def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ 
-313,7 +314,7 @@ def test_cross_attn_down_block2d( dual_cross_attention=False ): print(f"Testing CrossAttnDownBlock2D on device: {device}") - + # 1. Initialize the module on CPU cpu_block = CrossAttnDownBlock2D( in_channels=in_channels, @@ -338,7 +339,7 @@ def test_cross_attn_down_block2d( temb=temb_cpu, encoder_hidden_states=encoder_hidden_states_cpu, ) - + # 4. Initialize the module on the custom device device_block = cpu_block.to(device).eval() device_block = torch.compile(device_block, dynamic=False) @@ -347,7 +348,7 @@ def test_cross_attn_down_block2d( hidden_states_dev = hidden_states_cpu.to(device) temb_dev = temb_cpu.to(device) encoder_hidden_states_dev = encoder_hidden_states_cpu.to(device) - + # 6. Get the output from the custom device module with torch.no_grad(): dev_out, _ = device_block( @@ -442,9 +443,9 @@ def test_groupnorm( # 1. Initialize the module on CPU cpu_norm = torch.nn.GroupNorm( - num_groups=num_groups, - num_channels=channels, - eps=eps, + num_groups=num_groups, + num_channels=channels, + eps=eps, affine=True ).to("cpu").eval() @@ -462,13 +463,13 @@ def test_groupnorm( # 4. Initialize the module on the custom device device_norm = torch.nn.GroupNorm( - num_groups=num_groups, - num_channels=channels, - eps=eps, + num_groups=num_groups, + num_channels=channels, + eps=eps, affine=True ).to(device).eval() device_norm = torch.compile(device_norm, dynamic=False) - + # Copy the weights from the CPU module to ensure they are identical device_norm.weight.data.copy_(cpu_norm.weight.data) device_norm.bias.data.copy_(cpu_norm.bias.data) @@ -541,6 +542,89 @@ def test_upsample2d( print("Max diff >", torch.max(torch.abs(y_dev.cpu() - y_cpu)).item()) print("Upsample2D simulation done.") + +def test_flip_sin_to_cos_embedding( + device, + batch=1, + embedding_dim=256, + rtol=1e-4, + atol=1e-4, +): + def create_embeddings(timesteps, embedding_dim, scale=1.0, flip_sin_to_cos=False): + """ + Replicate the embedding creation logic from Timesteps class. 
+ """ + half_dim = embedding_dim // 2 + exponent = -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device) + exponent = exponent / half_dim + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + emb = scale * emb + + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + # flip sine and cosine embeddings + if flip_sin_to_cos: + new_emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + return emb, new_emb + return emb, emb + + g = torch.Generator().manual_seed(0) + timesteps_cpu = torch.randint(low=0, high=1000, size=(batch,), generator=g, dtype=torch.long) + + # Test with flip_sin_to_cos=True + with torch.no_grad(): + emb_flip_cpu = create_embeddings(timesteps_cpu, embedding_dim, flip_sin_to_cos=True) + + # Move to device and test + timesteps_dev = timesteps_cpu.to(device) + @torch.compile(dynamic=False) + def create_embeddings_compiled(timesteps, embedding_dim, scale=1.0, flip_sin_to_cos=False): + return create_embeddings(timesteps, embedding_dim, scale, flip_sin_to_cos) + + with torch.no_grad(): + emb_flip_dev = create_embeddings_compiled(timesteps_dev, embedding_dim, flip_sin_to_cos=True) + + # Verify flip case + test_result("Embedding (flip_sin_to_cos=True)", emb_flip_dev[0], emb_flip_cpu[0], rtol=rtol, atol=atol) + print("Max diff (flip) >", torch.max(torch.abs(emb_flip_dev[0].cpu() - emb_flip_cpu[0])).item()) + test_result("Embedding (flip_sin_to_cos=True)", emb_flip_dev[1], emb_flip_cpu[1], rtol=rtol, atol=atol) + print("Max diff (flip) >", torch.max(torch.abs(emb_flip_dev[1].cpu() - emb_flip_cpu[1])).item()) + + +def test_timesteps( + device, + batch=1, + num_channels=64, + flip_sin_to_cos=True, + downscale_freq_shift=1.0, + rtol=1e-4, + atol=1e-4, +): + print(f"Testing Timesteps on device: {device}") + + cpu_timesteps = Timesteps( + num_channels=num_channels, + flip_sin_to_cos=flip_sin_to_cos, + downscale_freq_shift=downscale_freq_shift, + ).to("cpu").eval() + + g = 
torch.Generator().manual_seed(0) + timesteps_cpu = torch.randint(low=0, high=1000, size=(batch,), generator=g, dtype=torch.long) + + with torch.no_grad(): + cpu_out = cpu_timesteps(timesteps_cpu) + + dev_timesteps = cpu_timesteps.to(device).eval() + dev_timesteps = torch.compile(dev_timesteps, dynamic=False) + + timesteps_dev = timesteps_cpu.to(device) + with torch.no_grad(): + dev_out = dev_timesteps(timesteps_dev) + + test_result("Timesteps", dev_out, cpu_out, rtol=rtol, atol=atol) + print("Max diff >", torch.max(torch.abs(dev_out.cpu() - cpu_out)).item()) + print("Timesteps simulation done.") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run UNet (diffusers) test with comparison") parser.add_argument("--model", type=str, default="runwayml/stable-diffusion-v1-5", @@ -557,14 +641,16 @@ def test_upsample2d( module = PyTorchSimRunner.setup_device() device = module.custom_device() - test_upsample2d(device) - test_groupnorm(device) - test_groupnorm(device, stride=[1, 1, 320*32, 320]) - test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320) - test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280) - test_cross_attn_down_block2d(device) - test_unet_mid_block2d_cross_attn(device) - test_cross_attn_up_block2d(device) + #test_upsample2d(device) + #test_groupnorm(device) + #test_groupnorm(device, stride=[1, 1, 320*32, 320]) + #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=256, resnet_act_fn='silu') + #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280) + #test_cross_attn_down_block2d(device) + #test_unet_mid_block2d_cross_attn(device) + #test_cross_attn_up_block2d(device) + #test_flip_sin_to_cos_embedding(device) + #test_timesteps(device) test_unet2d_condition_model(device) #test_unet_conditional( # device=device, From 7fa8d5425b94a60fcb6b25c1d2f0bebb63cfba56 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 08:10:55 
+0000 Subject: [PATCH 075/194] [Tests/MoE] Add patch to avoid dynamo bug --- tests/MoE/test_moe.py | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index 9ebfb11e..f9c96aff 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -16,31 +16,19 @@ sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) -# FIXME. This is a temporary solution to avoid is_forward conflict during backward -def patch_compile_event_logger(): - """Patch CompileEventLogger.compilation_metric to avoid is_forward conflict during backward.""" - from torch._dynamo.utils import CompileEventLogger +# FIXME. This is a Dynamo bug. Solution to avoid is_forward conflict during backward +def patch_metrics_context_update(): + """Patch MetricsContext.update to set overwrite=True by default.""" from torch._dynamo.utils import get_metrics_context + ctx = get_metrics_context() + original_update = ctx.update - original_compilation_metric = CompileEventLogger.compilation_metric - - @staticmethod - def patched_compilation_metric(is_forward=True, **kwargs): - """Patched version that clears is_forward before setting it if there's a conflict.""" - try: - metrics_context = get_metrics_context() - if metrics_context.in_progress() and hasattr(metrics_context, '_metrics'): - # If is_forward is already set and we're trying to set it to a different value, clear it first - current_is_forward = metrics_context._metrics.get('is_forward') - if current_is_forward is not None and current_is_forward != is_forward: - metrics_context._metrics.pop('is_forward', None) - except: - pass - # Call the original function - return original_compilation_metric(is_forward=is_forward, **kwargs) + def patched_update(values, overwrite=True): + """Patched version that sets overwrite=True by default.""" + return original_update(values, overwrite=True) # Patch the method - CompileEventLogger.compilation_metric 
= patched_compilation_metric + get_metrics_context().update = patched_update def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): pass_message = f"|{name} Test Passed|" @@ -469,6 +457,7 @@ def test_moe(device): total_cpu_loss = cpu_loss + cpu_aux_loss total_loss.to(device) + patch_metrics_context_update() print("Backward Started!") total_loss.backward() total_cpu_loss.backward() @@ -496,7 +485,7 @@ def test_moe(device): def train_moe(device): # Patch CompileEventLogger to avoid metric conflicts - patch_compile_event_logger() + patch_metrics_context_update() def perceptron(a, b, c): return a * b + c @@ -619,7 +608,7 @@ def weight_update(a, b, lr): def train_moe_mnist(device): # Patch CompileEventLogger to avoid metric conflicts - patch_compile_event_logger() + patch_metrics_context_update() torch.manual_seed(0) batch_size = 32 @@ -703,7 +692,7 @@ def train(model, device, train_loader, optimizer, epochs): def train_moe_single_iteration(device, iter_idx, is_evaluation=0): # Patch CompileEventLogger to avoid metric conflicts - patch_compile_event_logger() + patch_metrics_context_update() # Training moe with mnist dataset for sinlge iteration torch.manual_seed(0) From 7919094fe10c40434f0cf7ecd599a09cd12c08d9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 11:20:24 +0000 Subject: [PATCH 076/194] [Fix] Change wrong TORCHSIM_DUMP_PATH usage --- README.md | 6 +++--- experiments/BERT.py | 2 +- experiments/attention.py | 2 +- experiments/conv.py | 2 +- experiments/gemm.py | 2 +- experiments/layernorm.py | 2 +- experiments/resnet18.py | 2 +- experiments/resnet50.py | 2 +- experiments/softmax.py | 2 +- scripts/chiplet_prep.py | 2 +- scripts/chiplet_prep.sh | 2 +- scripts/sparsity_experiment/run.sh | 2 +- tutorial/session2/Hands_on.ipynb | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 4d98baa4..4a3ef145 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ The `tests` directory contains several AI 
workloads examples. ```bash python tests/test_matmul.py ``` -The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats. +The result is stored to `TORCHSIM_LOG_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats. ### Run Your Own Model on PyTorchSim You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device. @@ -197,9 +197,9 @@ Log contains memory & core stats. [2025-12-05 08:05:52.538] [info] Total execution cycles: 2065 [2025-12-05 08:05:52.538] [info] Wall-clock time for simulation: 0.147463 seconds ``` -The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below. +The log is dumped in `TORCHSIM_LOG_PATH` and you can set the path as below. ```bash -export TORCHSIM_DUMP_PATH=/tmp/torchinductor # output file dump path +export TORCHSIM_LOG_PATH=/tmp/torchinductor # output file dump path ``` ## Training diff --git a/experiments/BERT.py b/experiments/BERT.py index 5ccd3084..fd671833 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -48,7 +48,7 @@ def run_BERT(size, input_seq, config): input_seq = args.input_size result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" if 'pytorchsim_functional_mode' in os.environ: diff --git a/experiments/attention.py b/experiments/attention.py index 842f105a..211433f1 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -47,7 +47,7 @@ def attention(query, key, value): size_str = "x".join([str(i) for i in size]) result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"attention_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment 
variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" if 'pytorchsim_functional_mode' in os.environ: diff --git a/experiments/conv.py b/experiments/conv.py index 25952fb0..61f7ad80 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -48,7 +48,7 @@ def custom_conv2d(a, b, bias): size_str = "_".join([str(i) for i in size]) result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" if 'pytorchsim_functional_mode' in os.environ: diff --git a/experiments/gemm.py b/experiments/gemm.py index 3090e331..44be689a 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -42,7 +42,7 @@ def custom_matmul(a, b): size_str = "x".join([str(i) for i in size]) result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" if 'pytorchsim_functional_mode' in os.environ: diff --git a/experiments/layernorm.py b/experiments/layernorm.py index 9c9934a1..a6b16986 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -38,7 +38,7 @@ def run_layernorm(size, config): size_str = "x".join([str(i) for i in size]) result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path 
os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 5451e0f5..c7763d86 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -39,7 +39,7 @@ def run_resnet(batch, config): batch = args.batch result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" diff --git a/experiments/resnet50.py b/experiments/resnet50.py index 83d82db4..4e611541 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -39,7 +39,7 @@ def run_resnet(batch, config): batch = args.batch result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" diff --git a/experiments/softmax.py b/experiments/softmax.py index 580d56ca..d30559f7 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -38,7 +38,7 @@ def run_softmax(size, config, dim=1): size_str = "x".join([str(i) for i in size]) result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"Softmax_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") # setting environment variables - os.environ['TORCHSIM_DUMP_PATH'] = result_path + os.environ['TORCHSIM_LOG_PATH'] = result_path # only timing simulation os.environ['TORCHSIM_VALIDATION_MODE'] = "0" if 'pytorchsim_functional_mode' in os.environ: diff --git 
a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 4f8b7f7c..213eb85b 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -73,7 +73,7 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): folder = int(args.size) print("Taget size: ", folder) - folder_path = os.environ.get("TORCHSIM_DUMP_PATH") + folder_path = os.environ.get("TORCHSIM_LOG_PATH") print(folder_path) os.makedirs(folder_path, exist_ok=True) test_matmul(device, folder, folder, folder) diff --git a/scripts/chiplet_prep.sh b/scripts/chiplet_prep.sh index cddf1a58..f3bd1a1c 100755 --- a/scripts/chiplet_prep.sh +++ b/scripts/chiplet_prep.sh @@ -8,7 +8,7 @@ for size in "${sizes[@]}"; do export TORCHSIM_TILE_M=$((size / 2)) export TORCHSIM_TILE_K=$((size / 2)) export TORCHSIM_TILE_N=$((size / 2)) - export TORCHSIM_DUMP_PATH=$(pwd)/chiplet_result/$size + export TORCHSIM_LOG_PATH=$(pwd)/chiplet_result/$size python3 chiplet_prep.py $size #python3 chiplet_run.py $(pwd)/chiplet_result done \ No newline at end of file diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index 84c818ac..da9b73cc 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -1,4 +1,4 @@ -export TORCHSIM_DUMP_PATH=$(pwd)/result +export TORCHSIM_LOG_PATH=$(pwd)/result export SPIKE_DUMP_SPARSE_TILE=1 export TORCHSIM_FORCE_TIME_K=8 export TORCHSIM_FORCE_TIME_M=8 diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb index 2d5a5cdc..2964f293 100644 --- a/tutorial/session2/Hands_on.ipynb +++ b/tutorial/session2/Hands_on.ipynb @@ -32,7 +32,7 @@ "import torch._dynamo\n", "import torch.utils.cpp_extension\n", "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n", + "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n", "sys.path.append(base_dir)\n", "\n", "from 
Scheduler.scheduler import PyTorchSimRunner\n", From 1ca33488eb464bd766d042764f562d6b3fe616d1 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 11:34:12 +0000 Subject: [PATCH 077/194] [Scheduler] Validate pytorchsim_timing_mode != 0 in Scheduler constructor --- Scheduler/scheduler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 3f5673a8..dfd4aab6 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -376,6 +376,12 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0: + # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0). + logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. ") + logger.error(f"Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).") + exit(0) + os.environ['TOGSIM_CONFIG'] = togsim_config self.tog_simulator.interactive_simulation() if engine_select == Scheduler.FIFO_ENGINE: From 8df3beeab76256b2dbb2472bd6b73b80c43d1aa8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 15 Jan 2026 14:51:44 +0000 Subject: [PATCH 078/194] [Fix] Move rename_indexing before load cacheing --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 4 ---- PyTorchSimFrontend/mlir/mlir_common.py | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 912c618a..01485d2e 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -435,7 +435,6 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) return index def load(self, name: str, index: sympy.Expr): - index = 
self.rename_indexing(index) index, comptute_depedency = self.convert_indirect_indexing(index) padding = self.get_padding_type() @@ -489,7 +488,6 @@ def load(self, name: str, index: sympy.Expr): return out def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs): - index = self.rename_indexing(index) dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] @@ -642,7 +640,6 @@ def store_reduction(self, name, index, value): dram_var = self.kernel_group.args.output(name) dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] - index = self.rename_indexing(index) with self.override_buffer_cse(cse=self.reduction_cse): # Tile is always reuduced in inner loop @@ -779,7 +776,6 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): return accum def index_expr(self, index, dtype): - index = self.rename_indexing(index) base_tile_desc = self.kernel_group.tile_desc if len(self.ranges) != self.reduction_depth: # FIXME. 
This is a temporary solution to get tile stride of the reduction case diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index e31555ba..ad755c6e 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -913,6 +913,7 @@ def indirect_indexing(index_var, size, check=True, wrap_neg=True): @staticmethod def load(name: str, index: sympy.Expr): + index = self.rename_indexing(index) if name in self.cse.invalidated_stores: # A load from an invalidated store requires us to # keep the actual buffer around @@ -937,6 +938,7 @@ def store(name, index, value, mode=None): for other_name in self.current_node.get_output(name).get_mutations(): self.cse.store_cache[other_name] = value if name not in V.graph.removed_buffers: + index = self.rename_indexing(index) return self.store(name, index, value, mode=mode) @staticmethod @@ -948,6 +950,7 @@ def store_reduction(name, index, value): self.cse.store_cache[other_name] = value if name not in V.graph.removed_buffers: + index = self.rename_indexing(index) return self.store_reduction(name, index, value) @staticmethod @@ -960,6 +963,7 @@ def _index_expr(tile_size, buffer, renamed_expression, index): @staticmethod def index_expr(index, dtype): + index = self.rename_indexing(index) return self.index_expr(index, dtype) @staticmethod From ea79ad0cda4ddffa0ba8e1abca78bfd92a285463 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 16 Jan 2026 10:27:17 +0000 Subject: [PATCH 079/194] [Fusion] Fix template codegen + Add custom fusion hook --- PyTorchSimFrontend/mlir/mlir_scheduling.py | 51 +++++++++++++++++++--- PyTorchSimFrontend/mlir/mlir_template.py | 23 ++++++---- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index f2bcba7e..aff2f0b0 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -25,13 +25,48 @@ 
class MLIRScheduling(BaseScheduling): target_kernel = MLIRKernel def __init__(self, scheduler): self.scheduler = scheduler - #self.scheduler.enter_context = self.enter_context_fixed # FIXME. Monkey patch: For fixing the inductor bug + if scheduler is not None: + self.scheduler.can_fuse_origin = self.scheduler.can_fuse + self.scheduler.can_fuse = self.can_fuse_with_exceptions # FIXME. Monkey patch: For prolouge fusion self.kernel_group = mlir_common.MLIRWrapperKenrelGroup() self._ready_to_flush = False self.outer_function = set() config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it! self.max_fusion_size = 5 + def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool: + if not extension_config.CONFIG_FUSION: + return False + + # Extract base template node + base_template_node1 = [node for node in node1.get_nodes() if node.is_template()] + base_template_node2 = [node for node in node2.get_nodes() if node.is_template()] + + # Case 3: Prologue(Pointwise) + Tempalte + if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: + from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate + from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate + + target_node = base_template_node2[0].node + # Currently only BMM, MM support prologue fusion + if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + return False + + if len(node1.read_writes.writes) != 1: + return False + if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME + return False + + # We don't fuse this edge case... 
+ if base_template_node2[0].group[1][0][0] == 1: + return False + + if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: + node1 = self.revert_group(node1) + return True + return self.scheduler.can_fuse_origin(node1, node2) + + def _set_flush_status(self, status: bool): self._ready_to_flush = status @@ -45,6 +80,9 @@ def get_backend_features(self, device): def can_fuse_vertical(self, node1, node2): return self.can_fuse_horizontal(node1, node2) + def can_fuse_multi_outputs_template(self, node1, node2): + return self.can_fuse_horizontal(node1, node2) + def can_fuse_horizontal(self, node1, node2): if not extension_config.CONFIG_FUSION: return False @@ -88,7 +126,7 @@ def can_fuse_horizontal(self, node1, node2): return same_iter and no_dependency # Case 1: Template + Pointwise fusion - if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction(): + if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction(): # Don't fuse maxpool template code from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate @@ -132,9 +170,10 @@ def can_fuse_horizontal(self, node1, node2): return True # Case 2: Tempalte + Reduction fusion - if len(base_template_node1) == 1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: + if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate + target_node = base_template_node1[0].node if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): return False @@ -149,7 +188,7 @@ def 
can_fuse_horizontal(self, node1, node2): # We can't fuse dim=-1 layout_possible = stride != 1 # Directed linked? - dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1 + dependency_check = writes1 & reads2 dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads]) return size_match and layout_possible and dependency_check and dependency_size @@ -177,8 +216,8 @@ def can_fuse_horizontal(self, node1, node2): return True # Check elementwise fusion - if vars1 == vars2 and reduce1 == reduce2: - return True + if vars1 == vars2 and reduce1 == reduce2 and not node1.is_reduction() and not node2.is_reduction(): + return writes1 & reads2 return False def revert_group(self, act_nodes, args=None, var_ranges=None): diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 304d0090..31796a8b 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -573,8 +573,8 @@ def template_store(): with contextlib.ExitStack() as stack: stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line())) if self.reduction_fusion: - compute_body.writelines(self.reduction_body_loop.lines()) compute_body.splice(self.masks) + compute_body.writelines(self.reduction_body_loop.lines()) stack.enter_context(compute_body.indent(attribute="{inner_loop=false}")) compute_body.splice(self.loads) compute_body.splice(self.compute) @@ -848,7 +848,6 @@ def get_spad_size_per_lane(self, tile_m, tile_n): return max(size, 2) # vector load/store def load_epilogue(self, name: str, index: sympy.Expr): - index = self.rename_indexing(index) dram_var = self.kernel_group.args.input(name) dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) dtype = V.graph.get_dtype(name) @@ -898,7 +897,6 @@ def load_epilogue(self, name: str, 
index: sympy.Expr): return out def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) dram_var = self.kernel_group.args.output(name) dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) dtype = V.graph.get_dtype(name) @@ -1000,7 +998,6 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): return sram_var def store_reduction_epilogue(self, name, index, value): - index = self.rename_indexing(index) dram_var = self.kernel_group.args.output(name) dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) dtype = V.graph.get_dtype(name) @@ -1119,11 +1116,19 @@ def set_tile_size(self, template_fusion_info, prologue=False): return tile_desc def rename_indexing(self, index) -> sympy.Expr: - for dim_name, dim_aliased_name in self.dim_aliasing.items(): - index = index.subs(sympy.Symbol(dim_name), sympy.Symbol("tmp_"+dim_aliased_name)) - # To avoid this case ({"index0":"index1", "index1":"index0"}) - for dim_aliased_name in self.dim_aliasing.values(): - index = index.subs(sympy.Symbol("tmp_"+dim_aliased_name), sympy.Symbol(dim_aliased_name)) + # First step: replace dim_name with tmp_+dim_aliased_name to avoid circular dependencies + # (e.g., {"index0":"index1", "index1":"index0"}) + tmp_subs = { + sympy.Symbol(dim_name): sympy.Symbol("tmp_"+dim_aliased_name) + for dim_name, dim_aliased_name in self.dim_aliasing.items() + } + index = index.subs(tmp_subs) + # Second step: replace tmp_+dim_aliased_name with dim_aliased_name + final_subs = { + sympy.Symbol("tmp_"+dim_aliased_name): sympy.Symbol(dim_aliased_name) + for dim_aliased_name in self.dim_aliasing.values() + } + index = index.subs(final_subs) return index class MLIRTemplateCaller(CUDATemplateCaller): From 0c6175fdc0e354284cbc3f12cf64405dfb319113 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 19 Jan 2026 15:23:17 +0000 Subject: [PATCH 080/194] [Template] Fix template fusion codegen --- 
.../mlir/mlir_codegen_backend.py | 87 +++++++++++-------- PyTorchSimFrontend/mlir/mlir_common.py | 18 ++-- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 4 +- PyTorchSimFrontend/mlir/mlir_ops.py | 39 ++++++++- PyTorchSimFrontend/mlir/mlir_scheduling.py | 62 +++++++------ PyTorchSimFrontend/mlir/mlir_template.py | 34 +++++--- 6 files changed, 153 insertions(+), 91 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 01485d2e..671d0e09 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -327,7 +327,7 @@ def get_padding_type(self): # return 1 return 0 - def convert_index(self, expr, buffer): + def convert_index(self, expr): if len(expr.free_symbols) != 1: raise NotImplementedError("Not supporting this view operation...!") @@ -346,17 +346,37 @@ def convert_index(self, expr, buffer): first_arg = expr.args[0] if len(first_arg.free_symbols) != 1: raise NotImplementedError("What is this case?") + + # Create affine.apply operation indices = [list(first_arg.free_symbols)[0]] - args = ", ".join(map(str, indices)) - map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>") - args = ", ".join([f"%{i}" for i in indices]) - index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})") + with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse): + map_var = ops.affine_map(indices, expr_str) + index = ops.affine_apply(map_var, indices) return index - def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> common.CSEVariable: - if buffer is None: - buffer = self.applys + def _convert_sympy_to_mlir_expr(self, expr, sorted_args): + """ + Convert sympy expression to MLIR affine map expression by replacing index variables. 
+ """ + indices = [] + + for arg in sorted_args: + if arg.is_Mul and arg.args[0].is_number: + target_arg = arg.args[1] + elif not arg.is_number: + target_arg = arg + else: + continue + new_arg = sympy.Symbol(str(self.convert_index(target_arg))) + expr = expr.replace(target_arg, new_arg) + indices.append(str(new_arg)) + + expr_str = str(expr) + if "//" in expr_str: + expr_str = expr_str.replace("//", " floordiv ") + return expr_str, indices + def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> common.CSEVariable: # Constant case if expr.is_number and len(indirect_dims) == 0: return self.get_const_cse(int(expr)) @@ -372,33 +392,25 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com # Sort index variable.. ex) (%index1, %index0) args_dict = {term: list(term.free_symbols)[0] for term in args if term.free_symbols} sorted_args = sorted(args_dict.keys(), key=lambda term: str(args_dict[term])) - indices = [] - for arg in sorted_args: - if arg.is_Mul and arg.args[0].is_number: - new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer))) - expr = expr.replace(arg.args[1], new_arg) - indices.append(str(new_arg)) - elif not arg.is_number: - new_arg = sympy.Symbol(str(self.convert_index(arg, buffer))) - expr = expr.replace(arg, new_arg) - indices.append(str(new_arg)) + + # Convert sympy expression to affine map expression + expr_str, indices = self._convert_sympy_to_mlir_expr(expr, sorted_args) # Extract index var - indirect_args = [f"%{i}" for i in indirect_dims] - if len(indirect_args): + if len(indirect_dims): comments = "{indirect_access} " + comments # Add indirect access attribute - expr_str = str(expr) - if "//" in expr_str: - expr_str = expr_str.replace("//", " floordiv ") - args = ", ".join(map(str, indices)) - map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[{','.join(indirect_dims)}] -> ({expr_str})>") - args = ", ".join([f"%{i}" for i in indices]) - index = 
self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[{','.join(indirect_args)}] {comments}") + indirect_args = [f"%{i}" for i in indirect_dims] + # Create affine.apply operation + with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse): + map_var = ops.affine_map(indices, expr_str, symbol_names=indirect_dims) + + if hasattr(self, "dim_aliasing"): + indices = [self.dim_aliasing.get(index, index) for index in indices] + index = ops.affine_apply(map_var, indices, indirect_dims=indirect_args, comment=comments) return index - def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) -> common.CSEVariable: - if buffer is None: - buffer = self.applys + def parse_index_list(self, expr_list:list, offset=sympy.Number(0)) -> common.CSEVariable: + """ Need to override buffer and cse to use this function. """ expr_list = [arg for arg in expr_list] dim_list = [f"d{i}" for i in range(len(expr_list))] @@ -413,11 +425,11 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) new_expr_list = [0] * len(expr_list) for idx, arg in enumerate(expr_list): if arg.is_Mul and arg.args[0].is_number: - new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer))) + new_arg = sympy.Symbol(str(self.convert_index(arg.args[1]))) new_expr_list[idx] = arg.subs(arg.args[1], dim_list[idx]) indices.append(str(new_arg)) elif not arg.is_number: - new_arg = sympy.Symbol(str(self.convert_index(arg, buffer))) + new_arg = sympy.Symbol(str(self.convert_index(arg))) new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx]) indices.append(str(new_arg)) else: @@ -427,11 +439,11 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) indices.append(str(new_arg)) # Extract index var + # Create affine.apply operation expr_str = str(sum(new_expr_list) + offset) - args = ", ".join(map(str, dim_list)) - map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>") - args = ", 
".join([f"%{i}" for i in indices]) - index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[]") + with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse): + map_var = ops.affine_map(dim_list, expr_str) + index = ops.affine_apply(map_var, indices) return index def load(self, name: str, index: sympy.Expr): @@ -1080,7 +1092,8 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)): local_dims = total_dims # Brodatcast tile shape - index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}") + with self.override_buffer_cse(buffer=buffer, cse=self.apply_cse): + index_var = self.parse_indices(index, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}") if kg_tile_desc.vmap.vlane_split_axis in local_dims: local_vlane_split_axis = local_dims.index(kg_tile_desc.vmap.vlane_split_axis) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index ad755c6e..0717333a 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -612,6 +612,7 @@ def __init__(self, kernel_group, reason=None): instance_id = id(self) self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute) self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse) + self._nested_context_depth = 0 def set_ranges(self, lengths, reduction_lengths): if self.call_ranges: @@ -992,13 +993,20 @@ def bucketize( values, offsets_name, offsets_size, indexing_dtype, right ) - super().__enter__() - assert self.overrides - parent_handler = self.overrides() - self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) - 
self.exit_stack.enter_context(V.set_kernel_handler(self)) + if self._nested_context_depth == 0: + self.exit_stack.__enter__() + assert self.overrides + parent_handler = self.overrides() + + self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) + self.exit_stack.enter_context(V.set_kernel_handler(self)) + self._nested_context_depth += 1 return self + def __exit__(self, exc_type, exc_val, exc_tb): + self._nested_context_depth -= 1 + if self._nested_context_depth == 0: + super().__exit__(exc_type, exc_val, exc_tb) @dataclasses.dataclass class LoopLevel: diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index bbc63b45..0158caa6 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -154,7 +154,7 @@ def render(self, W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride) W_tile_desc.set_name("W_buffer") W_tile_desc.offset = W.get_layout().offset - W_stride = W.get_layout().stride + W_stride = W.get_layout().stride if N>1 else [Y.get_layout().stride[0], 0] W_idx = [sympy.Symbol("index2") * W_stride[0], sympy.Symbol("index1") * W_stride[1]] vlane_split_axis = vlane_split_axis if nr_rdim==0 else 0 @@ -163,7 +163,7 @@ def render(self, Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride) Y_tile_desc.set_name("Y_buffer") - Y_stride = Y.get_layout().stride + Y_stride = Y.get_layout().stride if N>1 else [Y.get_layout().stride[0], 0] if nr_rdim == 0: Y_idx = [sympy.Symbol("index0") * Y_stride[0], sympy.Symbol("index1") * Y_stride[1]] else: diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 4cf031d2..fd0114e1 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -1175,4 +1175,41 @@ def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, 
**kw if buffer_name is not None: return common.DeferredLine(buffer_name, line), [None, None] else: - return line, [None, None] \ No newline at end of file + return line, [None, None] + + @staticmethod + def affine_apply(map_var, indices, indirect_dims=None, comment=None, *args, **kwargs): + # Format indices arguments + indices_str = ", ".join([f"%{i}" for i in indices]) + op_str = f"affine.apply #{map_var}({indices_str})" + + # Add indirect dimensions if provided + if indirect_dims: + indirect_str = ", ".join(indirect_dims) + op_str += f"[{indirect_str}]" + if comment: + op_str += f" // {comment}" + return op_str, [1, "index"] + + @staticmethod + def affine_map(dim_names, expr_str, symbol_names=None, comment=None, *args, **kwargs): + # Handle dim_names as list or string + if isinstance(dim_names, list): + dims_str = ", ".join([str(dim) for dim in dim_names]) + else: + dims_str = dim_names + + # Build the map string + if symbol_names: + if isinstance(symbol_names, list): + symbols_str = ", ".join(symbol_names) + else: + symbols_str = symbol_names + map_str = f"affine_map<({dims_str})[{symbols_str}] -> ({expr_str})>" + else: + map_str = f"affine_map<({dims_str}) -> ({expr_str})>" + + if comment: + map_str += f" // {comment}" + + return map_str, [1, "map"] diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index aff2f0b0..6c103829 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -35,15 +35,15 @@ def __init__(self, scheduler): self.max_fusion_size = 5 def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool: - if not extension_config.CONFIG_FUSION: - return False + if not extension_config.CONFIG_FUSION_PROLOGUE: + return self.scheduler.can_fuse_origin(node1, node2) # Extract base template node base_template_node1 = [node for node in node1.get_nodes() if node.is_template()] base_template_node2 = [node for node in 
node2.get_nodes() if node.is_template()] # Case 3: Prologue(Pointwise) + Tempalte - if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: + if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate @@ -126,7 +126,7 @@ def can_fuse_horizontal(self, node1, node2): return same_iter and no_dependency # Case 1: Template + Pointwise fusion - if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction(): + if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction(): # Don't fuse maxpool template code from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate @@ -170,7 +170,7 @@ def can_fuse_horizontal(self, node1, node2): return True # Case 2: Tempalte + Reduction fusion - if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: + if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate target_node = base_template_node1[0].node @@ -185,39 +185,35 @@ def can_fuse_horizontal(self, node1, node2): except: return False - # We can't 
fuse dim=-1 - layout_possible = stride != 1 + # We can't fuse dim=-1 & N == 1 + layout_possible = stride != 1 and (1 not in node1.node.get_size()) # Directed linked? dependency_check = writes1 & reads2 dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads]) return size_match and layout_possible and dependency_check and dependency_size # Case 3: Prologue(Pointwise) + Tempalte - if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate - - target_node = base_template_node2[0].node - # Currently only BMM, MM support prologue fusion - if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): - return False - - if len(node1.read_writes.writes) != 1: - return False - if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME - return False - - # We don't fuse this edge case... 
- if base_template_node2[0].group[1][0][0] == 1: - return False - - if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: - node1 = self.revert_group(node1) - return True - - # Check elementwise fusion - if vars1 == vars2 and reduce1 == reduce2 and not node1.is_reduction() and not node2.is_reduction(): - return writes1 & reads2 + # if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: + # from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate + # from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate + + # target_node = base_template_node2[0].node + # # Currently only BMM, MM support prologue fusion + # if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + # return False + + # if len(node1.read_writes.writes) != 1: + # return False + # if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME + # return False + + # # We don't fuse this edge case... 
+ # if base_template_node2[0].group[1][0][0] == 1: + # return False + + # if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: + # node1 = self.revert_group(node1) + # return True return False def revert_group(self, act_nodes, args=None, var_ranges=None): @@ -340,7 +336,7 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes): _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs() src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) - with V.set_kernel_handler(kernel): + with kernel: kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, kernel.loop_size, origins={str(i) for i in template_node.node.origins}) self.define_function(kernel) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 31796a8b..6ec043fb 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -473,7 +473,6 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ for node in epilogue_nodes: node.codegen((vars, reduction_vars)) - with V.set_kernel_handler(kernel): src_code = ( partial_code if isinstance(partial_code, str) @@ -785,8 +784,8 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com subtile_size:list=[], async_type=None, indent_size=0): # Prepare code block local_code = IndentedBuffer() - with V.set_kernel_handler(self): - index_var = self.parse_index_list(index_list, local_code, offset=tile_desc.offset) + with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse): + index_var = self.parse_index_list(index_list, offset=tile_desc.offset) node_layout = self.named_nodes[dram_var].get_layout() if dram_var in self.exception_nodes: numel = self.exception_nodes[dram_var]["numel"] @@ -826,7 +825,7 @@ def def_dma_op(self, dma_type, 
dram_var:str, index_list:list, tile_desc:mlir_com def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0): # Prepare code block - with V.set_kernel_handler(self): + with self: dtype = self.named_nodes[dram_name].get_layout().dtype tile_shape = tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[dtype]) buffer_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, id, forced_name=dram_name) @@ -854,8 +853,9 @@ def load_epilogue(self, name: str, index: sympy.Expr): mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] # Want to use tile_desc from epilogue_info - index_var = self.parse_indices(index) - dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] + with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse): + index_var = self.parse_indices(index) + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()] vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) @@ -888,7 +888,11 @@ def load_epilogue(self, name: str, index: sympy.Expr): vsize = compute_vec_size//reduce_size if compute_vec_size > 1: - offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") + with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse): + map_var = ops.affine_map(["d0", "d1"], f"d0 + d1*{(self.r_tile_size)}") + with self.override_buffer_cse(buffer=self.loads): + offset = ops.affine_apply(map_var, [self.compute_idx, self.reduction_loop_idx]) + #offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) with 
self.override_buffer_cse(buffer=self.loads): @@ -902,8 +906,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] - index_var = self.parse_indices(index) - dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] + with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse): + index_var = self.parse_indices(index) + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()] vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) @@ -981,15 +986,17 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value): compute_index_var = ", ".join(zero_var_list) with self.override_buffer_cse(buffer=self.loads): out = ops._load(vec_size, type_name, sram_var, compute_index_var, tile_shape) - # Reduction body codegen with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): init = ops.constant(reduction_init(reduction_type, dtype), type_name) init_vec = ops.broadcast(init, compute_vec_size) + init_vec2 = ops.broadcast(init, local_tile_desc.get_numel_per_lane()) + ops._store(init_vec2, sram_var, ", ".join([f"%{self.get_const_cse(0)}"] * local_tile_desc.get_nr_dim()), tile_shape) mask_shape, mask_var = self.get_mask() if mask_var is not None: value = ops.where(mask_var, value, init_vec) + result = reduction_partial_combine_vec(reduction_type, value, out) # Store partial result @@ -1003,8 +1010,9 @@ def store_reduction_epilogue(self, name, index, value): dtype = V.graph.get_dtype(name) mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype] - index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction") - dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only 
one reduction axis + with self.override_buffer_cse(buffer=self.reductions_suffix, cse=self.apply_cse): + index_var = self.parse_indices(index, comments="// Store reduction") + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()][:-1] # Assume that there is only one reduction axis vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride @@ -1100,7 +1108,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.r_tile_size = tile_desc.get_tile_size()[-1] self.r_dim_size = template_fusion_info['r_dim_size'] self.reduction_nr_outer_loop = nr_outer_loop - self.reduction_loop_idx = "reduce_loop_idx" + self.reduction_loop_idx = self.register_var_cse("reduce_loop_idx", 1, "index") self.compute_body_loop.size = r_tile_size self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) From a90f11483be095ca39928a91c11765f98d9285b0 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 20 Jan 2026 11:17:38 +0000 Subject: [PATCH 081/194] [Fix] Fusion axis mechanism change --- .../mlir/mlir_codegen_backend.py | 8 ++---- PyTorchSimFrontend/mlir/mlir_common.py | 27 ++++++++++++------ PyTorchSimFrontend/mlir/mlir_ops.py | 2 +- PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 +- PyTorchSimFrontend/mlir/mlir_template.py | 28 ++++--------------- 5 files changed, 28 insertions(+), 39 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 671d0e09..34ba1031 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -313,7 +313,9 @@ def __init__(self, kernel_group, reason=None): self.base_vector_initialized = False def reset(self, reason): + save = self.exit_stack, self._nested_context_depth self.__init__(self.kernel_group, 
reason=reason) + self.exit_stack, self._nested_context_depth = save # padding type 0: zero-padding 1: negative-padding(-inf) ... def get_padding_type(self): @@ -395,17 +397,11 @@ def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> co # Convert sympy expression to affine map expression expr_str, indices = self._convert_sympy_to_mlir_expr(expr, sorted_args) - - # Extract index var - if len(indirect_dims): - comments = "{indirect_access} " + comments # Add indirect access attribute indirect_args = [f"%{i}" for i in indirect_dims] # Create affine.apply operation with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse): map_var = ops.affine_map(indices, expr_str, symbol_names=indirect_dims) - if hasattr(self, "dim_aliasing"): - indices = [self.dim_aliasing.get(index, index) for index in indices] index = ops.affine_apply(map_var, indices, indirect_dims=indirect_args, comment=comments) return index diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 0717333a..be491925 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -614,7 +614,7 @@ def __init__(self, kernel_group, reason=None): self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse) self._nested_context_depth = 0 - def set_ranges(self, lengths, reduction_lengths): + def set_ranges(self, lengths, reduction_lengths, index_names=None): if self.call_ranges: assert self.call_ranges == tuple(lengths) + tuple( reduction_lengths @@ -623,7 +623,12 @@ def set_ranges(self, lengths, reduction_lengths): else: self.call_ranges = tuple(lengths) + tuple(reduction_lengths) self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))] + if index_names is None: + self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))] + else: + assert 
len(index_names) == len(self.ranges), f"Index names length mismatch: {len(index_names)} != {len(self.ranges)}" + self.itervars = [sympy.Symbol(str(n)) for n in index_names] + self.itervar_cses = {str(index) : self.register_var_cse(str(index), 1, "index") for index in self.itervars} self.reduction_depth = len(lengths) return ( @@ -867,18 +872,22 @@ def rename_indexing(self, index) -> sympy.Expr: def override_buffer_cse(self, *, buffer=None, cse=None): buffer_override = self.target_buffer_override cse_override = self.target_cse_override - target_buffer = target_cse = None + buffer_token = cse_token = None try: + # Store tokens for proper restoration in nested contexts + # contextvars.set() returns the previous value (token) which can be used for reset() if buffer is not None: - target_buffer = buffer_override.set(buffer) + buffer_token = buffer_override.set(buffer) if cse is not None: - target_cse = cse_override.set(cse) + cse_token = cse_override.set(cse) yield self finally: - if target_cse is not None: - cse_override.reset(target_cse) - if target_buffer is not None: - buffer_override.reset(target_buffer) + # Restore using tokens - contextvars automatically handles nested contexts + # Each level restores to its own previous value + if cse_token is not None: + cse_override.reset(cse_token) + if buffer_token is not None: + buffer_override.reset(buffer_token) def __enter__(self): class CSEProxy: diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index fd0114e1..9edd2e44 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -1186,7 +1186,7 @@ def affine_apply(map_var, indices, indirect_dims=None, comment=None, *args, **kw # Add indirect dimensions if provided if indirect_dims: indirect_str = ", ".join(indirect_dims) - op_str += f"[{indirect_str}]" + op_str += f"[{indirect_str}] {{indirect_access}}" if comment: op_str += f" // {comment}" return op_str, [1, "index"] diff --git 
a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 6c103829..faf5e69c 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -178,7 +178,7 @@ def can_fuse_horizontal(self, node1, node2): return False size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1) - target_symbol = symbols("r0") + target_symbol = symbols("r0_0") try: stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1] stride = int(sympify(stride).coeff(target_symbol)) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 6ec043fb..b864e5f2 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -429,7 +429,7 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ ).group prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True) kernel.kernel_group.set_tile_info(prologue_tile_desc) - vars, reduction_vars = kernel.set_ranges(group, reduction_group) + vars, reduction_vars = kernel.set_ranges(group, reduction_group, list(self.dim_aliasing.values())) for node in prologue_nodes: # Reuse created spad read_list = sorted([i.name for i in node.read_writes.reads]) @@ -469,10 +469,11 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ _, (group, reduction_group) = max( epilogue_nodes, key=lambda x: int(x.is_reduction()) ).group - vars, reduction_vars = kernel.set_ranges(group, reduction_group) + vars, reduction_vars = kernel.set_ranges(group, reduction_group, list(self.dim_aliasing.values())) for node in epilogue_nodes: node.codegen((vars, reduction_vars)) + with self as kernel: src_code = ( partial_code if isinstance(partial_code, str) @@ -855,7 +856,7 
@@ def load_epilogue(self, name: str, index: sympy.Expr): # Want to use tile_desc from epilogue_info with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse): index_var = self.parse_indices(index) - dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()] + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) @@ -892,7 +893,6 @@ def load_epilogue(self, name: str, index: sympy.Expr): map_var = ops.affine_map(["d0", "d1"], f"d0 + d1*{(self.r_tile_size)}") with self.override_buffer_cse(buffer=self.loads): offset = ops.affine_apply(map_var, [self.compute_idx, self.reduction_loop_idx]) - #offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})") compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"]) with self.override_buffer_cse(buffer=self.loads): @@ -908,7 +908,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse): index_var = self.parse_indices(index) - dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()] + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()] vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) @@ -1012,7 +1012,7 @@ def store_reduction_epilogue(self, name, index, value): with self.override_buffer_cse(buffer=self.reductions_suffix, cse=self.apply_cse): index_var = self.parse_indices(index, comments="// Store reduction") - 
dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()][:-1] # Assume that there is only one reduction axis + dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride @@ -1123,22 +1123,6 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.compute_body_loop.step = tile_desc.get_compute_vec_size() return tile_desc - def rename_indexing(self, index) -> sympy.Expr: - # First step: replace dim_name with tmp_+dim_aliased_name to avoid circular dependencies - # (e.g., {"index0":"index1", "index1":"index0"}) - tmp_subs = { - sympy.Symbol(dim_name): sympy.Symbol("tmp_"+dim_aliased_name) - for dim_name, dim_aliased_name in self.dim_aliasing.items() - } - index = index.subs(tmp_subs) - # Second step: replace tmp_+dim_aliased_name with dim_aliased_name - final_subs = { - sympy.Symbol("tmp_"+dim_aliased_name): sympy.Symbol(dim_aliased_name) - for dim_aliased_name in self.dim_aliasing.values() - } - index = index.subs(final_subs) - return index - class MLIRTemplateCaller(CUDATemplateCaller): def __str__(self): return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})" From 78613ad5e21441b1a6c9221410a8c5b83ff3cc46 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 22 Jan 2026 04:30:31 +0000 Subject: [PATCH 082/194] [Test] Fix syntax error in experiment scripts --- experiments/gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/gemm.py b/experiments/gemm.py index 44be689a..6b6ece4d 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -31,7 +31,7 @@ def custom_matmul(a, b): import os import sys base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', 
default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml) + config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path sys.path.append(base_dir) args = argparse.ArgumentParser() From 21d08f219b2cb25ce5cb4da0b173c6340bb94f02 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 22 Jan 2026 06:13:11 +0000 Subject: [PATCH 083/194] [CI] Change base image for OpenReg build --- .github/workflows/docker-base-image-2-8.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml index 3a1d97a1..74e81e07 100644 --- a/.github/workflows/docker-base-image-2-8.yml +++ b/.github/workflows/docker-base-image-2-8.yml @@ -63,7 +63,7 @@ jobs: file: ./Dockerfile.base push: true build-args: | - PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime + PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} From 24e67eded3496b011f73472c5fcac06de35f8e1a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 22 Jan 2026 06:15:25 +0000 Subject: [PATCH 084/194] [OpenReg] Use OpenReg style Custom device --- .gitignore | 1 - Dockerfile | 5 +- PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp | 8 - PyTorchSimDevice/ExtensionDeviceGuardImpl.h | 127 ---- PyTorchSimDevice/extension_device.cpp | 711 ------------------ PyTorchSimDevice/extension_hooks.cpp | 48 -- PyTorchSimDevice/extension_hooks.h | 30 - PyTorchSimDevice2/CMakeLists.txt | 44 ++ PyTorchSimDevice2/README.md | 175 +++++ .../cmake/TorchPythonTargets.cmake | 22 + PyTorchSimDevice2/csrc/CMakeLists.txt | 16 + PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp | 195 +++++ .../csrc/aten/OpenRegMinimal.cpp | 148 ++++ PyTorchSimDevice2/csrc/aten/native/Common.h | 
97 +++ PyTorchSimDevice2/csrc/aten/native/Extra.cpp | 210 ++++++ PyTorchSimDevice2/csrc/aten/native/Extra.h | 69 ++ .../csrc/aten/native/Minimal.cpp | 185 +++++ PyTorchSimDevice2/csrc/aten/native/Minimal.h | 61 ++ .../csrc/runtime/OpenRegDeviceAllocator.cpp | 8 + .../csrc/runtime/OpenRegDeviceAllocator.h | 43 ++ PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h | 146 ++++ .../csrc/runtime/OpenRegException.cpp | 9 + .../csrc/runtime/OpenRegException.h | 20 + .../csrc/runtime/OpenRegFunctions.cpp | 74 ++ .../csrc/runtime/OpenRegFunctions.h | 18 + .../csrc/runtime/OpenRegGenerator.cpp | 28 + .../csrc/runtime/OpenRegGenerator.h | 21 + .../csrc/runtime/OpenRegGuard.cpp | 7 + PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h | 197 +++++ .../csrc/runtime/OpenRegHooks.cpp | 11 + PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h | 41 + .../csrc/runtime/OpenRegHostAllocator.cpp | 8 + .../csrc/runtime/OpenRegHostAllocator.h | 48 ++ .../csrc/runtime/OpenRegSerialization.cpp | 48 ++ .../csrc/runtime/OpenRegSerialization.h | 10 + .../csrc/runtime/OpenRegStream.cpp | 253 +++++++ .../csrc/runtime/OpenRegStream.h | 162 ++++ PyTorchSimDevice2/include/Macros.h | 7 + PyTorchSimDevice2/pyproject.toml | 35 + PyTorchSimDevice2/setup.py | 148 ++++ .../third_party/openreg/CMakeLists.txt | 21 + .../third_party/openreg/README.md | 151 ++++ .../openreg/cmake/GTestTargets.cmake | 12 + .../third_party/openreg/csrc/device.cpp | 37 + .../third_party/openreg/csrc/memory.cpp | 259 +++++++ .../third_party/openreg/csrc/memory.h | 96 +++ .../third_party/openreg/csrc/stream.cpp | 313 ++++++++ .../third_party/openreg/example/example.cpp | 112 +++ .../third_party/openreg/include/openreg.h | 109 +++ .../third_party/openreg/include/openreg.inl | 42 ++ .../_C.cpython-311-x86_64-linux-gnu.so | Bin 0 -> 15312 bytes PyTorchSimDevice2/torch_openreg/__init__.py | 24 + PyTorchSimDevice2/torch_openreg/_utils.py | 42 ++ .../torch_openreg/csrc/CMakeLists.txt | 24 + .../torch_openreg/csrc/Module.cpp | 99 +++ 
PyTorchSimDevice2/torch_openreg/csrc/stub.c | 20 + .../torch_openreg/lib/libopenreg.so | Bin 0 -> 59728 bytes .../torch_openreg/lib/libtorch_bindings.so | Bin 0 -> 166144 bytes .../torch_openreg/lib/libtorch_openreg.so | Bin 0 -> 569736 bytes .../torch_openreg/openreg/__init__.py | 86 +++ .../openreg}/extension_device_interface.py | 0 .../openreg}/extension_device_op_overrides.py | 0 .../torch_openreg/openreg/meta.py | 13 + .../torch_openreg/openreg/random.py | 61 ++ .../mlir/mlir_codegen_backend.py | 2 +- Scheduler/scheduler.py | 51 +- 66 files changed, 4100 insertions(+), 968 deletions(-) delete mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp delete mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.h delete mode 100644 PyTorchSimDevice/extension_device.cpp delete mode 100644 PyTorchSimDevice/extension_hooks.cpp delete mode 100644 PyTorchSimDevice/extension_hooks.h create mode 100644 PyTorchSimDevice2/CMakeLists.txt create mode 100644 PyTorchSimDevice2/README.md create mode 100644 PyTorchSimDevice2/cmake/TorchPythonTargets.cmake create mode 100644 PyTorchSimDevice2/csrc/CMakeLists.txt create mode 100644 PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp create mode 100644 PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp create mode 100644 PyTorchSimDevice2/csrc/aten/native/Common.h create mode 100644 PyTorchSimDevice2/csrc/aten/native/Extra.cpp create mode 100644 PyTorchSimDevice2/csrc/aten/native/Extra.h create mode 100644 PyTorchSimDevice2/csrc/aten/native/Minimal.cpp create mode 100644 PyTorchSimDevice2/csrc/aten/native/Minimal.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegException.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp 
create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegStream.h create mode 100644 PyTorchSimDevice2/include/Macros.h create mode 100644 PyTorchSimDevice2/pyproject.toml create mode 100644 PyTorchSimDevice2/setup.py create mode 100644 PyTorchSimDevice2/third_party/openreg/CMakeLists.txt create mode 100644 PyTorchSimDevice2/third_party/openreg/README.md create mode 100644 PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/device.cpp create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/memory.h create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp create mode 100644 PyTorchSimDevice2/third_party/openreg/example/example.cpp create mode 100644 PyTorchSimDevice2/third_party/openreg/include/openreg.h create mode 100644 PyTorchSimDevice2/third_party/openreg/include/openreg.inl create mode 100755 PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so create mode 100644 PyTorchSimDevice2/torch_openreg/__init__.py create mode 100644 
PyTorchSimDevice2/torch_openreg/_utils.py create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/Module.cpp create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/stub.c create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libopenreg.so create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/__init__.py rename {PyTorchSimDevice => PyTorchSimDevice2/torch_openreg/openreg}/extension_device_interface.py (100%) rename {PyTorchSimDevice => PyTorchSimDevice2/torch_openreg/openreg}/extension_device_op_overrides.py (100%) create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/meta.py create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/random.py diff --git a/.gitignore b/.gitignore index b42d5f6b..3ca1e54b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ __pycache__/ TOGSim/build/ .vscode -*.txt *.ipynb_checkpoints output togsim_results/* diff --git a/Dockerfile b/Dockerfile index 088daa43..1b4d08f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,4 +10,7 @@ RUN cd PyTorchSim/TOGSim && \ cd build && \ conan install .. --build=missing && \ cmake .. && \ - make -j$(nproc) \ No newline at end of file + make -j$(nproc) + +RUN cd PyTorchSim/PyTorchSimDevice2 && \ + python -m pip install --no-build-isolation -e . 
\ No newline at end of file diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp deleted file mode 100644 index a0b1395d..00000000 --- a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#include "ExtensionDeviceGuardImpl.h" -#include - -namespace c10::extension_device::impl { - -C10_REGISTER_GUARD_IMPL(extension_device, ExtensionDeviceGuardImpl); - -} // namespace c10::extension_device::impl diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h deleted file mode 100644 index 6d35677b..00000000 --- a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace c10::extension_device::impl { - -struct ExtensionDeviceGuardImpl final : public c10::impl::DeviceGuardImplInterface { - static constexpr DeviceType static_type = DeviceType::PrivateUse1; // ✅ your backend type - - ExtensionDeviceGuardImpl() = default; - - explicit ExtensionDeviceGuardImpl(DeviceType t) { - TORCH_CHECK( - t == static_type, - "ExtensionDeviceGuardImpl initialized with non-extension_device DeviceType: ", - t); - } - - // -------------------------------------------------------------------------- - // 기본적인 device guard (CPU처럼 동작) - // -------------------------------------------------------------------------- - DeviceType type() const override { - return static_type; - } - - Device exchangeDevice(Device d) const override { - TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d); - return d; // nothing to exchange, CPU-like - } - - Device getDevice() const override { - return Device(static_type, 0); - } - - void setDevice(Device d) const override { - TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d); - } - - void uncheckedSetDevice(Device d) const noexcept override {} - - DeviceIndex deviceCount() const noexcept 
override { - return 1; // pretend single device - } - - // -------------------------------------------------------------------------- - // Stream handling (동기식이므로 기본 stream만 사용) - // -------------------------------------------------------------------------- - Stream getStream(Device d) const override { - return Stream(Stream::DEFAULT, d); - } - - Stream getNewStream(Device d, int priority = 0) const override { - return Stream(Stream::DEFAULT, d); - } - - Stream getStreamFromGlobalPool(Device d, bool = false) const override { - return Stream(Stream::DEFAULT, d); - } - - Stream exchangeStream(Stream s) const override { - return s; - } - - bool queryStream(const Stream& stream) const override { - (void)stream; - return true; - } - - void synchronizeStream(const Stream& stream) const override { - (void)stream; - } - - void synchronizeDevice(DeviceIndex device_index) const override { - (void)device_index; - } - - // -------------------------------------------------------------------------- - // Event handling (전부 no-op) - // -------------------------------------------------------------------------- - void destroyEvent(void* event, const DeviceIndex device_index) const noexcept override { - (void)event; - (void)device_index; - } - - void record(void** event, const Stream& stream, const DeviceIndex device_index, const EventFlag flag) const override { - (void)event; - (void)stream; - (void)device_index; - (void)flag; - } - - void block(void* event, const Stream& stream) const override { - (void)event; - (void)stream; - } - - bool queryEvent(void* event) const override { - (void)event; - return true; - } - - void synchronizeEvent(void* event) const override { - (void)event; - } - - double elapsedTime(void* start_event, void* end_event, const DeviceIndex device_index) const override { - (void)start_event; - (void)end_event; - (void)device_index; - return 0.0; - } - - // -------------------------------------------------------------------------- - // Misc (allocator 
integration) - // -------------------------------------------------------------------------- - void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override { - (void)data_ptr; - (void)stream; - } -}; - -} // namespace c10::extension_device::impl diff --git a/PyTorchSimDevice/extension_device.cpp b/PyTorchSimDevice/extension_device.cpp deleted file mode 100644 index a1dcfcf4..00000000 --- a/PyTorchSimDevice/extension_device.cpp +++ /dev/null @@ -1,711 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace py = pybind11; - -namespace { - bool g_amp_enabled = false; - at::ScalarType g_amp_dtype = at::kFloat; -} - -static at::ScalarType to_scalar_type(const py::object& dtype_obj) { - py::module torch_mod = py::module::import("torch"); - if (dtype_obj.is(torch_mod.attr("bfloat16"))) return at::kBFloat16; - if (dtype_obj.is(torch_mod.attr("float16"))) return at::kHalf; - if (dtype_obj.is(torch_mod.attr("float32"))) return at::kFloat; - if (dtype_obj.is(torch_mod.attr("float64"))) return at::kDouble; - throw std::runtime_error("Unsupported dtype for extension_device AMP"); -} - -static py::object to_torch_dtype(at::ScalarType st) { - py::module torch_mod = py::module::import("torch"); - switch (st) { - case at::kBFloat16: return torch_mod.attr("bfloat16"); - case at::kHalf: return torch_mod.attr("float16"); - case at::kFloat: return torch_mod.attr("float32"); - case at::kDouble: return torch_mod.attr("float64"); - default: - throw std::runtime_error("Unsupported scalar type in get_autocast_dtype"); - } -} - -static inline at::MemoryFormat fix_memory_format(c10::optional mf_opt) { - if (!mf_opt.has_value()) return at::MemoryFormat::Contiguous; - - auto mf = mf_opt.value(); - if (mf == at::MemoryFormat::Preserve) { - return at::MemoryFormat::Contiguous; - } - return mf; -} - -#include 
"ExtensionDeviceGuardImpl.h" - -static uint64_t op_counter = 0; -static uint64_t last_saved_value = 0; - -C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::extension_device::impl::ExtensionDeviceGuardImpl); - -// basic dummy add function -at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) { - op_counter += 1; - // Since this custom device is just for testing, not bothering to implement kernels. - return at::empty(self.sizes(), self.options()); -} - -// basic dummy mul function -at::Tensor custom_mul_Tensor(const at::Tensor & self, const at::Tensor & other) { - op_counter += 1; - // Since this custom device is just for testing, not bothering to implement kernels. - return at::empty(self.sizes(), self.options()); -} - -at::Tensor _reinterpret_tensor( - const at::Tensor& self, - c10::IntArrayRef size, - c10::IntArrayRef stride, - int64_t offset_increment) { - at::Tensor self_ = at::detail::make_tensor( - c10::Storage(self.storage()), self.key_set(), self.dtype()); - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(self.storage_offset() + offset_increment); - self_tmp_->set_sizes_and_strides(size, stride); - return self_; -} - -at::Tensor& zero_inplace_batching_rule(at::Tensor &self) { - op_counter += 1; - // Since this custom device is just for testing, not bothering to implement kernels. 
- return self; -} - -const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size, - std::optional optional_memory_format) { - at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl(); - tensor_impl->set_sizes_contiguous(size); - const auto itemsize = tensor_impl->dtype().itemsize(); - const auto offset = tensor_impl->storage_offset(); - const auto storage_size = at::detail::computeStorageNbytesContiguous(size, itemsize, offset); - // Dummy device is using cpu allocator, so here just call cpu - // function maybe_resize_storage_cpu in aten/src/ATen/native/Resize.h - // to get a sufficient memory space. - at::native::maybe_resize_storage_cpu(tensor_impl, storage_size); - if (optional_memory_format.has_value()) { - auto memory_format = - optional_memory_format.value(); - TORCH_CHECK( - memory_format != at::MemoryFormat::Preserve, - "Unsupported memory format", - memory_format); - tensor_impl->empty_tensor_restride(memory_format); - } - return self; -} - -// basic dummy eq function: Only support CPU -at::Tensor custom_to_device( - const at::Tensor & self, - at::Device device, - at::ScalarType dtype, - bool non_blocking, - bool copy, - c10::optional memory_format) { - TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); - TORCH_CHECK(device.is_cpu() || device.type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device."); - // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous. 
- TORCH_CHECK(self.scalar_type() == dtype); - TORCH_CHECK(self.is_contiguous()); - - op_counter += 1; - if (device.type() == at::DeviceType::CPU) { - auto out = at::empty(self.sizes(), dtype, self.options().layout(), - device, false, memory_format); - std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes()); - return out; - } else { - auto opts = self.options().device(device).dtype(dtype); - auto out = at::empty(self.sizes(), opts); - std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes()); - return out; - } - - auto out = at::empty(self.sizes(), dtype, self.options().layout(), device, false, memory_format); - memcpy(out.mutable_data_ptr(), self.mutable_data_ptr(), self.nbytes()); - // Since this custom device is just for testing, not bothering to implement kernels. - return out; -} - - -// A dummy allocator for our custom device, that secretly uses the CPU -struct DummyCustomAllocator final : at::Allocator { - DummyCustomAllocator() = default; - at::DataPtr allocate(size_t nbytes) override { - void* data = c10::alloc_cpu(nbytes); - return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)}; - } - - static void ReportAndDelete(void* ptr) { - if (!ptr) { - return; - } - c10::free_cpu(ptr); - } - - at::DeleterFnPtr raw_deleter() const override { - return &ReportAndDelete; - } - - void copy_data(void* dest, const void* src, std::size_t count) const override { - std::memcpy(dest, src, count); - } -}; - -// Register our dummy allocator -static DummyCustomAllocator global_custom_alloc; -REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc); - -at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) { - TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1, - "Dummy test only allows dummy device."); - TORCH_CHECK(self.is_contiguous()); - - op_counter += 1; - - switch (self.scalar_type()) { - case c10::ScalarType::Float: { - auto* data = self.mutable_data_ptr(); - for (int64_t i 
= 0; i < self.numel(); i++) { - data[i] = value.toFloat(); - } - break; - } - case c10::ScalarType::Double: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = value.toDouble(); - } - break; - } - case c10::ScalarType::Half: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = at::Half(value.toHalf()); - } - break; - } - case c10::ScalarType::BFloat16: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = at::BFloat16(value.toBFloat16()); - } - break; - } - case c10::ScalarType::Int: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = value.toInt(); - } - break; - } - case c10::ScalarType::Long: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = value.toLong(); - } - break; - } - case c10::ScalarType::Short: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = static_cast(value.toShort()); - } - break; - } - case c10::ScalarType::Char: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = static_cast(value.toChar()); - } - break; - } - case c10::ScalarType::Byte: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = static_cast(value.toByte()); - } - break; - } - case c10::ScalarType::Bool: { - auto* data = self.mutable_data_ptr(); - for (int64_t i = 0; i < self.numel(); i++) { - data[i] = value.toBool(); - } - break; - } - default: - TORCH_CHECK(false, "Unsupported scalar type: ", self.scalar_type()); - } - return self; -} - -at::Tensor unsafe_create_cpu_tensor_from_dummy_tensor(const at::Tensor& src) { - // TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, - // "Only support dummy device."); - const auto& sizes_ = src.sizes(); - const auto& strides_ = src.strides(); - 
auto storage_offset_ = src.storage_offset(); - at::detail::check_size_nonnegative(sizes_); - - size_t size_bytes = at::detail::computeStorageNbytes(sizes_, strides_, - src.element_size(), - storage_offset_); - - at::DataPtr data_ptr = - c10::InefficientStdFunctionContext::makeDataPtr(src.storage().mutable_data_ptr().get(), - [](void*){}, at::kCPU); - - c10::Storage storage{c10::Storage::use_byte_size_t{}, size_bytes, std::move(data_ptr), - /*allocator=*/&global_custom_alloc, /*resizeable=*/false}; - - constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); - at::Tensor tensor = at::detail::make_tensor( - std::move(storage), cpu_ks, src.dtype()); - - c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl(); - tensor_impl->set_sizes_and_strides(sizes_, strides_); - tensor_impl->set_storage_offset(storage_offset_); - return tensor; -} - -// basic dummy copy_() function, so we can copy from the custom device to/from CPU -at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) { - TORCH_CHECK( - self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, - "Dummy test only allows copy from cpu -> dummy device."); - TORCH_CHECK( - dst.is_cpu() || dst.device().type() == c10::DeviceType::PrivateUse1, - "Dummy test only allows copy from cpu -> dummy device."); - - // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous. 
- if (self.numel() != dst.numel()) { - custom_resize_(dst, self.sizes(), c10::nullopt); - } - TORCH_CHECK(self.sizes() == dst.sizes()); - - const bool same_dtype = (self.scalar_type() == dst.scalar_type()); - const bool both_contig = self.is_contiguous() && dst.is_contiguous(); - - // 1) fast path - if (same_dtype && both_contig) { - std::memcpy(dst.mutable_data_ptr(), - self.data_ptr(), - dst.storage().nbytes()); - return dst; - } - - // 2) slow path - at::Tensor cpu_self = unsafe_create_cpu_tensor_from_dummy_tensor(self); - at::Tensor cpu_dst = unsafe_create_cpu_tensor_from_dummy_tensor(dst); - if (!same_dtype) { - cpu_self = cpu_self.to(cpu_dst.scalar_type(), /*non_blocking=*/false, /*copy=*/true); - } - cpu_dst.copy_(cpu_self); - return dst; -} - -at::Tensor custom__copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { - return custom__copy_from(self, dst, false); -} - -at::Tensor& custom_abs_out(const at::Tensor& self, at::Tensor& out) { - return at::native::abs_out(self, out); -} - -at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { - op_counter += 1; - constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); - auto dtype = c10::dtype_or_default(dtype_opt); - return at::detail::empty_strided_generic(size, stride, &global_custom_alloc, private_use_ks, dtype); -} - -at::Tensor custom_empty(c10::IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional optional_memory_format) { - op_counter += 1; - - constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1); - auto dtype = c10::dtype_or_default(dtype_opt); - return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, fix_memory_format(optional_memory_format)); -} - -at::Tensor& custom_arange_start_out_impl( - const 
c10::Scalar& start, - const c10::Scalar& end, - const c10::Scalar& step, - at::Tensor& out) { - double s = start.toDouble(); - double e = end.toDouble(); - double st = step.toDouble(); - TORCH_CHECK(st != 0.0, "step must be nonzero"); - - int64_t length = 0; - if (st > 0) { - if (e > s) length = static_cast(std::ceil((e - s) / st)); - } else { - if (e < s) length = static_cast(std::ceil((e - s) / st)); - } - - // Resize out tensor - custom_resize_(out, {length}, c10::nullopt); - - if (out.scalar_type() == at::kFloat || out.scalar_type() == at::kDouble) { - double* data = out.mutable_data_ptr(); - for (int64_t i = 0; i < length; i++) { - data[i] = s + i * st; - } - } else if (out.scalar_type() == at::kLong) { - int64_t* data = out.mutable_data_ptr(); - for (int64_t i = 0; i < length; i++) { - data[i] = static_cast(s + i * st); - } - } else { - TORCH_CHECK(false, "Unsupported dtype for arange on dummy device"); - } - - return out; -} - -static at::Tensor custom_to_dtype_impl(const at::Tensor& self, - c10::ScalarType dtype, - bool non_blocking, bool copy, - c10::optional memory_format) { - return at::native::to(self, dtype, non_blocking, copy, memory_format); -} - -at::Tensor custom_zeros_like( - const at::Tensor& input, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) -{ - // dtype / layout / device fallback - auto dtype = dtype_opt.value_or(input.scalar_type()); - auto layout = layout_opt.value_or(input.layout()); - auto device = device_opt.value_or(input.device()); - auto memfmt = memory_format_opt.value_or(c10::MemoryFormat::Contiguous); - - TORCH_CHECK( - device.type() == c10::DeviceType::PrivateUse1, - "custom_zeros_like: device must be PrivateUse1"); - - at::Tensor out = custom_empty( - input.sizes(), - dtype, - layout, - device, - pin_memory_opt, - memfmt - ); - size_t nbytes = out.numel() * out.element_size(); - void* ptr = out.mutable_data_ptr(); - - 
TORCH_CHECK(ptr != nullptr, - "custom_zeros_like: out.mutable_data_ptr() returned NULL"); - std::memset(ptr, 0, nbytes); - return out; -} - -at::Tensor& custom_zero_impl(at::Tensor& self) -{ - TORCH_CHECK( - self.device().type() == c10::DeviceType::PrivateUse1, - "custom_zero_: expected a PrivateUse1 device tensor"); - - if (self.numel() == 0) { - return self; - } - - void* data = self.mutable_data_ptr(); - TORCH_CHECK(data != nullptr, - "custom_zero_: self.mutable_data_ptr() returned NULL " - "(storage was not allocated)"); - - size_t nbytes = self.numel() * self.element_size(); - std::memset(data, 0, nbytes); - - return self; -} - -// With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend. -// For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key. -// Later in this file, we map a custom device to the PrivateUse1 device type, -// which allows user code that puts a tensor on your custom_device to eventually get plumbed -// into the kernels registered here. -// -// This macro registers your kernels to the PyTorch Dispatcher. -// More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/. 
-TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl("to.Device", &custom_to_device); - m.impl("to.dtype", &custom_to_dtype_impl); - m.impl("fill_.Scalar", &custom_fill__scalar); - m.impl("_copy_from", &custom__copy_from); - m.impl("_copy_from_and_resize", &custom__copy_from_and_resize); - m.impl("empty_strided", &custom_empty_strided); - m.impl("empty.memory_format", &custom_empty); - m.impl("as_strided", at::native::as_strided_tensorimpl); - m.impl("view", at::native::view); - m.impl("arange.start_out", &custom_arange_start_out_impl); - m.impl("zeros_like", &custom_zeros_like); - m.impl("zero_", &custom_zero_impl); -} - -TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) { - m.impl("to.dtype", &custom_to_dtype_impl); -} - -TORCH_LIBRARY_FRAGMENT(aten, m) { - m.def( - "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor", - torch::dispatch(c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor), - {at::Tag::pt2_compliant_tag} - ); -} - -void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - at::native::cpu_fallback(op, stack); -} - -TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl("abs", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("abs_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("absolute", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("absolute.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("absolute_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add_.Scalar", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("add_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("cat", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("cat.names", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("cat.names_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("div.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("div_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("eq.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("eq.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("eq.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("equal", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("erf", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("erf.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("erf_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("erfc", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("erfc.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("erfc_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("exp", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ge.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ge.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ge.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("le.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("lt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ne.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ne.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ne.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("ne.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - 
- m.impl("logical_and", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_and_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_not", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_not_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_or", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_or_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_xor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_xor.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("logical_xor_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("neg", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("neg_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mul.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mul_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("pow.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Scalar_out", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow.Tensor_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("pow_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("sub.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sub_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("sum", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.DimnameList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.dim_DimnameList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sum.dim_IntList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("resize_as_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - // Foreach ops - m.impl("_foreach_add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - 
m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - // Indexed - m.impl("index_add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_add_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_copy.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_copy_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_fill.int_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_fill.int_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_fill.int_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_fill.int_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_fill_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("tril_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("nll_loss2d_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss2d_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("scatter.src_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("scatter.value_out", 
torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("index_put.Default", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("mm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("sigmoid.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("gather.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("silu.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - - m.impl("all.all_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_local_scalar_dense", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_log_softmax", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_log_softmax_backward_data", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("mse_loss.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("_native_multi_head_attention", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("where.self", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("min", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("max", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("index_select", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); - m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>()); -} - -// This basic implementation doesn't bother dealing with different device indices -// (e.g. custom_device:0 vs. custom_device:1). -// We could do that by letting the user pass in a device index in our exposed device function. -// Note that if you do that, you'll also need to register a device guard to core. 
-// See `c10/core/impl/DeviceGuardImplInterface.h:C10_REGISTER_GUARD_IMPL`. -c10::Device get_custom_device() { - return c10::Device(c10::DeviceType::PrivateUse1, 0); -} - -bool custom_op_called() { - bool called = false; - if (op_counter > last_saved_value) { - called = true; - last_saved_value = op_counter; - } - return called; -} - -class PrivateGeneratorImpl : public at::CPUGeneratorImpl { -public: - PrivateGeneratorImpl(c10::DeviceIndex device_index) { - device_ = c10::Device(c10::DeviceType::PrivateUse1, device_index); - key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1); - } - ~PrivateGeneratorImpl() override = default; -}; - -// this is used to register generator -at::Generator make_generator_privateuse1(c10::DeviceIndex device_index) { - return at::make_generator(device_index); -} - -void register_generator() { - REGISTER_GENERATOR_PRIVATEUSE1(make_generator_privateuse1) -} - -// Here, we're exposing a custom device object that corresponds to our custom backend. -// We do this using pybind: exposing an "extension_name.custom_device()" function in python, -// that's implemented in C++. -// The implementation in this file maps directly to the `PrivateUse1` device type. 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("custom_device", &get_custom_device, "get custom device object"); - m.def("custom_op_called", &custom_op_called, "check if our custom function was called"); - m.def("register_generator", ®ister_generator, "register generator for custom device"); - m.def("is_autocast_enabled", []() -> bool { return g_amp_enabled;}); - m.def("set_autocast_enabled", [](bool flag) -> void {g_amp_enabled = flag;}); - m.def("get_autocast_dtype", []() -> py::object { return to_torch_dtype(g_amp_dtype); }); - m.def("set_autocast_dtype", [](py::object dtype_obj) -> void { - auto st = to_scalar_type(dtype_obj); - g_amp_dtype = st; - }); - m.def("get_amp_supported_dtype", []() -> py::list { - py::module torch_mod = py::module::import("torch"); - py::list lst; - lst.append(torch_mod.attr("float16")); - lst.append(torch_mod.attr("float32")); - return lst; - }); -} \ No newline at end of file diff --git a/PyTorchSimDevice/extension_hooks.cpp b/PyTorchSimDevice/extension_hooks.cpp deleted file mode 100644 index aadd6d2a..00000000 --- a/PyTorchSimDevice/extension_hooks.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include "extension_hooks.h" - -bool ExtensionPU1Hooks::isBuilt() const { return true; } -bool ExtensionPU1Hooks::isAvailable() const { return true; } - -const at::Generator& ExtensionPU1Hooks::getDefaultGenerator(c10::DeviceIndex idx) const { - if (idx < 0) idx = 0; - static std::vector gens; - static std::mutex m; - std::lock_guard g(m); - if (gens.size() <= (size_t)idx) gens.resize((size_t)idx + 1); - if (!gens[idx].defined()) gens[idx] = at::GetGeneratorForPrivateuse1(idx); - return gens[idx]; // 영속 객체 참조 반환 -} - -at::Generator ExtensionPU1Hooks::getNewGenerator(c10::DeviceIndex idx) const { - if (idx < 0) idx = 0; - return at::GetGeneratorForPrivateuse1(idx); -} - -at::Device ExtensionPU1Hooks::getDeviceFromPtr(void* data) const { - return at::Device(at::kPrivateUse1, 0); // MVP: 단일 디바이스 가정 -} - -bool ExtensionPU1Hooks::isPinnedPtr(const 
void* data) const { - return false; -} - -at::Allocator* ExtensionPU1Hooks::getPinnedMemoryAllocator() const { - return at::getHostAllocator(at::kPrivateUse1); -} - -bool ExtensionPU1Hooks::hasPrimaryContext(c10::DeviceIndex device_index) const { return true; } - -void ExtensionPU1Hooks::resizePrivateUse1Bytes(const c10::Storage&, size_t) const { - TORCH_CHECK(false, "resizePrivateUse1Bytes not implemented"); -} - -// REGISTER_EXTENSION_HOOKS(ExtensionPU1Hooks); - -namespace { -struct AutoRegistrar { - AutoRegistrar() { - at::RegisterPrivateUse1HooksInterface(new ExtensionPU1Hooks()); - } -}; -static AutoRegistrar _auto_registrar; -} diff --git a/PyTorchSimDevice/extension_hooks.h b/PyTorchSimDevice/extension_hooks.h deleted file mode 100644 index fdf3505a..00000000 --- a/PyTorchSimDevice/extension_hooks.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -struct ExtensionPU1Hooks final : public at::PrivateUse1HooksInterface { - ExtensionPU1Hooks() {} - bool isBuilt() const; - bool isAvailable() const; - - const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) const override; - - at::Generator getNewGenerator(c10::DeviceIndex device_index = -1) const override; - - at::Device getDeviceFromPtr(void* data) const override; - - bool isPinnedPtr(const void* data) const override; - - at::Allocator* getPinnedMemoryAllocator() const override; - - bool hasPrimaryContext(c10::DeviceIndex device_index) const override; - - void resizePrivateUse1Bytes(const c10::Storage& /*storage*/, size_t /*newsize*/) const override; -}; \ No newline at end of file diff --git a/PyTorchSimDevice2/CMakeLists.txt b/PyTorchSimDevice2/CMakeLists.txt new file mode 100644 index 00000000..2c207ca6 --- /dev/null +++ b/PyTorchSimDevice2/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +project(TORCH_OPENREG CXX C) + +include(GNUInstallDirs) +include(CheckCXXCompilerFlag) + 
+set(CMAKE_CXX_STANDARD 17) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_SKIP_BUILD_RPATH FALSE) +set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) + +set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + +if(APPLE) + set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path") +elseif(UNIX) + set(CMAKE_INSTALL_RPATH "$ORIGIN/lib:$ORIGIN") +elseif(WIN32) + set(CMAKE_INSTALL_RPATH "") +endif() +set(CMAKE_INSTALL_LIBDIR lib) +set(CMAKE_INSTALL_MESSAGE NEVER) + +set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch) +find_package(Torch REQUIRED) + +if(DEFINED PYTHON_INCLUDE_DIR) + include_directories(${PYTHON_INCLUDE_DIR}) +else() + message(FATAL_ERROR "Cannot find Python directory") +endif() + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake) + +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg) +add_subdirectory(${PROJECT_SOURCE_DIR}/csrc) +add_subdirectory(${PROJECT_SOURCE_DIR}/torch_openreg/csrc) diff --git a/PyTorchSimDevice2/README.md b/PyTorchSimDevice2/README.md new file mode 100644 index 00000000..83ec85b1 --- /dev/null +++ b/PyTorchSimDevice2/README.md @@ -0,0 +1,175 @@ +# PyTorch OpenReg + +## Background + +The third-party device integration mechanism based on PrivateUse1 has become the official mainstream method for new backends to integrate with PyTorch. Ensuring the availability of this mechanism is crucial for enriching PyTorch's hardware ecosystem. + +**Note:** + +The goal of `torch_openreg` is **not to implement a fully functional, high-performance PyTorch backend**, but to serve as a **minimalist reference implementation for mechanism verification**. + +### Purpose + +- **Test Backend**: To serve as an in-tree test backend for PrivateUse1, ensuring quality stability through CI/CD. 
+- **Integration Example**: To serve as a reference example for new backend integration. +- **Integration Documentation**: To provide module-level integration documentation that corresponds with the code. + +### Design Principles + +- **Minimality Principle**: The fundamental goal is to enable/verify all integration paths/mechanisms for a new backend to integrate to PyTorch. All functions follow a "just right" strategy to ensure the correctness of relevant integration capabilities. +- **Authenticity Principle**: To complete the OpenReg integration in the same way a real accelerator backend would integrate with PyTorch. + +## Directory Structure + +```shell +torch_openreg/ +├── CMakeLists.txt +├── csrc +│ ├── aten +│ │ ├── native +│ │ │ ├── Extra.cpp +│ │ │ ├── Minimal.cpp +│ │ │ └── ... +│ │ ├── OpenRegExtra.cpp +│ │ └── OpenRegMinimal.cpp +│ ├── CMakeLists.txt +│ └── runtime +│ ├── OpenRegDeviceAllocator.cpp +│ ├── OpenRegDeviceAllocator.h +│ ├── OpenRegFunctions.cpp +│ ├── OpenRegFunctions.h +│ ├── OpenRegGenerator.cpp +│ ├── OpenRegGenerator.h +│ ├── OpenRegGuard.cpp +│ ├── OpenRegGuard.h +│ ├── OpenRegHooks.cpp +│ ├── OpenRegHooks.h +│ ├── OpenRegHostAllocator.cpp +│ ├── OpenRegHostAllocator.h +│ └── ... +├── pyproject.toml +├── README.md +├── setup.py +├── third_party +│ └── openreg +└── torch_openreg + ├── csrc + │ ├── CMakeLists.txt + │ ├── Module.cpp + │ └── stub.c + ├── __init__.py + └── openreg + ├── __init__.py + ├── meta.py + └── random.py +``` + +**Dependencies**: + +```mermaid +graph LR + A[Python] + B[_C.so] + C[libtorch_bindings.so] + D[libtorch_openreg.so] + E[libopenreg.so] + + A --> B --> C --> D --> E +``` + +There are 4 DSOs in torch_openreg, and the dependencies between them are as follows: + +- `_C.so`: + - **sources**: torch_openreg/csrc/stub.c + - **description**: Python C module entry point. +- `libtorch_bindings.so`: The bridging code between Python and C++ should go here. 
+ - **sources**: torch_openreg/csrc + - **description**: A thin glue layer between Python and C++. +- `libtorch_openreg.so`: All core implementations should go here. + - **sources**: csrc + - **description**: All core functionality, such as device runtime, operators, etc. +- `libopenreg.so`: A DSO that uses the CPU to emulate a CUDA-like device, you can ignore it. + - **sources**: third_party/openreg + - **description**: Provides low-level device functionality similar to libcudart.so. + +**Key Directories**: + +- `csrc/`: Core device implementation, including operator registration, runtime, etc. + - `csrc/aten/`: Operator registration + - `csrc/aten/native/`: Specific operator implementations for the OpenReg device. + - `csrc/aten/OpenRegMinimal.cpp`: The most minimal set of operator implementations (allowing for the creation of Tensors and related operations upon completion). + - `csrc/aten/OpenRegExtra.cpp`: Implementations for other types of operators. + - `csrc/runtime/`: Implementations for Host memory, device memory, Guard, Hooks, etc. +- `third_party/`: A C++ library that simulates a CUDA-like device using the CPU. +- `torch_openreg/`: Python interface implementation (Python code and C++ Bindings). + - `torch_openreg/csrc/`: Python C++ binding code. + - `torch_openreg/openreg/`: Python API. 
+ +## Currently Implemented Features + +### Operator Registration + +- Operator Implementation + + - Register for builtin PyTorch Operators + - `TORCH_LIBRARY_IMPL` form: See `empty.memory_format` + - `STUB` form: See `abs_stub` + - Register for custom operators + - Schema Registration: See `custom_abs` + - Kernel Registration: See `custom_abs` + - Fallback Registration for `AutogradPrivateUse1`: See `custom_abs` + - Meta Registration: See `custom_abs` + - `torch.autograd.Function`: See `custom_autograd_fn_aliasing` + - Register for fallback + - Per-operator Fallback: See `sub.Tensor` + - Global Fallback: See `wrapper_cpu_fallback` + +## Installation and Usage + +### Installation + +```python +pip3 install --no-build-isolation -e . # for develop +pip3 install --no-build-isolation . # for install +``` + +### Usage Example + +After installation, you can use the `openreg` device in Python just like any other regular device. + +```python +import torch +import torch_openreg + +if not torch.openreg.is_available(): + print("OpenReg backend is not available in this build.") + exit() + +print("OpenReg backend is available!") + +device = torch.device("openreg") + +x = torch.tensor([[1., 2.], [3., 4.]], device=device) +y = x + 2 +print("Result y:\n", y) +print(f"Device of y: {y.device}") + +z = y.cpu() +print("Result z:\n", z) +print(f"Device of z: {z.device}") +``` + +## Future Plans + +- **Enhance Features**: + - Autoload + - AMP + - Device-agnostic APIs + - Memory Management + - Generator + - Distributed + - Custom Tensor&Storage + - ... +- **Improve Tests**: Add more test cases related to the integration mechanism. +- **Improve Documentation**: Add a new chapter on third-party device integration in the `Developer Notes` section of the PyTorch documentation. +- **Real-time Synchronization**: Keep the code and documentation updated iteratively and in sync. 
diff --git a/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake b/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake new file mode 100644 index 00000000..b7a807d2 --- /dev/null +++ b/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake @@ -0,0 +1,22 @@ +if(WIN32) + set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/torch_python.lib") +elseif(APPLE) + set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.dylib") +else() + set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so") +endif() + +add_library(torch_python SHARED IMPORTED) + +set_target_properties(torch_python PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${PYTORCH_INSTALL_DIR}/include" + INTERFACE_LINK_LIBRARIES "c10;torch_cpu" + IMPORTED_LOCATION "${TORCH_PYTHON_IMPORTED_LOCATION}" +) + +add_library(torch_python_library INTERFACE IMPORTED) + +set_target_properties(torch_python_library PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "\$" + INTERFACE_LINK_LIBRARIES "\$;\$" +) diff --git a/PyTorchSimDevice2/csrc/CMakeLists.txt b/PyTorchSimDevice2/csrc/CMakeLists.txt new file mode 100644 index 00000000..e2ae2b3f --- /dev/null +++ b/PyTorchSimDevice2/csrc/CMakeLists.txt @@ -0,0 +1,16 @@ +set(LIBRARY_NAME torch_openreg) + +file(GLOB_RECURSE SOURCE_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" +) + +add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES}) + +target_link_libraries(${LIBRARY_NAME} PRIVATE torch_cpu_library openreg) +target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +install(TARGETS ${LIBRARY_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp new file mode 100644 index 00000000..04ba6d48 --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp @@ -0,0 +1,195 @@ +#include "native/Extra.h" + +#include +#include + +#include 
+#include + +namespace at::openreg { + +namespace { +at::Tensor wrapper_quantize_per_tensor( + const at::Tensor& self, + double scale, + int64_t zero_point, + at::ScalarType dtype) { + return at::native::openreg::quantize_per_tensor( + self, scale, zero_point, dtype); +} + +int64_t wrapper__fused_sdp_choice( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_mask, + double dropout_p, + bool is_causal, + std::optional scale, + bool enable_gqa) { + return at::native::openreg::_fused_sdp_choice( + query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa); +} + +void wrapper_quantize_tensor_per_tensor_affine_stub( + const at::Tensor& rtensor, + at::Tensor& qtensor, + double scale, + int64_t zero_point) { + at::native::openreg::quantize_tensor_per_tensor_affine_stub( + rtensor, qtensor, scale, zero_point); +} + +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + c10::SymInt, + c10::SymInt, + at::Tensor, + at::Tensor, + at::Tensor> +wrapper__scaled_dot_product_fused_attention_overrideable( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_bias, + double dropout_p, + bool is_causal, + bool return_debug_mask, + std::optional scale) { + return at::native::openreg::_scaled_dot_product_fused_attention_overrideable( + query, + key, + value, + attn_bias, + dropout_p, + is_causal, + return_debug_mask, + scale); +} + +std::tuple +wrapper_scaled_dot_product_fused_attention_overrideable_backward( + const at::Tensor& grad_out, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& attn_bias, + std::array grad_input_mask, + const at::Tensor& out, + const at::Tensor& logsumexp, + const at::Tensor& cum_seq_q, + const at::Tensor& cum_seq_k, + int64_t max_q, + int64_t max_k, + double dropout_p, + bool is_causal, + const at::Tensor& philox_seed, + const at::Tensor& philox_offset, + std::optional scale) 
{ + return at::native::openreg:: + _scaled_dot_product_fused_attention_overrideable_backward( + grad_out, + query, + key, + value, + attn_bias, + grad_input_mask, + out, + logsumexp, + cum_seq_q, + cum_seq_k, + max_q, + max_k, + dropout_p, + is_causal, + philox_seed, + philox_offset, + scale); +} + +at::Tensor wrapper_custom_autograd_fn_returns_self(at::Tensor x) { + return at::native::openreg::custom_autograd_fn_returns_self(x); +} + +at::Tensor wrapper_custom_autograd_fn_aliasing(at::Tensor x) { + return at::native::openreg::custom_autograd_fn_aliasing(x); +} + +at::Tensor& wrapper_abs_out(const at::Tensor& self, at::Tensor& out) { + return at::native::openreg::abs_out(self, out); +} + +void wrapper_abs_stub(at::TensorIteratorBase& iter) { + at::native::openreg::abs_kernel(iter); +} + +at::Tensor wrapper_custom_abs(at::Tensor x) { + return at::native::openreg::custom_abs(x); +} +} // namespace + +using namespace at::native; +// Registration via STUB +// LITERALINCLUDE START: STUB DEFAULT +REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &wrapper_abs_stub); +REGISTER_PRIVATEUSE1_DISPATCH( + quantize_tensor_per_tensor_affine_stub, + &wrapper_quantize_tensor_per_tensor_affine_stub); +REGISTER_PRIVATEUSE1_DISPATCH( + _fused_sdp_choice_stub, + &wrapper__fused_sdp_choice); +// LITERALINCLUDE END: STUB DEFAULT + +// Registration of custom operators +// LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA +TORCH_LIBRARY(openreg, m) { + m.def("custom_abs(Tensor input)-> Tensor"); +} +// LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA + +// LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT +TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) { + m.impl("custom_abs", &wrapper_custom_abs); +} +// LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT + +// LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK +TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) { + m.fallback(torch::autograd::autogradNotImplementedFallback()); +} +// LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK + +// The rest is for testing purposes 
+TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { + /* + abs_stub only works if abs.out is also registered with PrivateUse1, because + abs.default is designed to redirect directly to abs.out, which calls + abs_stub. + */ + m.impl("abs.out", &wrapper_abs_out); + m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor); + m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice); + m.impl( + "_scaled_dot_product_fused_attention_overrideable", + &wrapper__scaled_dot_product_fused_attention_overrideable); + m.impl( + "_scaled_dot_product_fused_attention_overrideable_backward", + &wrapper_scaled_dot_product_fused_attention_overrideable_backward); +} + +TORCH_LIBRARY_FRAGMENT(openreg, m) { + m.def("custom_autograd_fn_returns_self(Tensor input)-> Tensor"); + m.def("custom_autograd_fn_aliasing(Tensor(a) input)-> Tensor(a)"); +} + +TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) { + m.impl( + "custom_autograd_fn_returns_self", + &wrapper_custom_autograd_fn_returns_self); + m.impl("custom_autograd_fn_aliasing", &wrapper_custom_autograd_fn_aliasing); +} + +} // namespace at::openreg diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp new file mode 100644 index 00000000..d54ae552 --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp @@ -0,0 +1,148 @@ +#include "native/Minimal.h" + +#include +#include + +#include + +namespace at::openreg { + +namespace { + +// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER +at::Tensor wrapper_empty_memory_format( + c10::IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { + return at::native::openreg::empty_memory_format( + size, + dtype_opt, + layout_opt, + device_opt, + pin_memory_opt, + memory_format_opt); +} +// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER + +at::Tensor wrapper_empty_strided( + c10::IntArrayRef size, + c10::IntArrayRef stride, + 
std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { + return at::native::openreg::empty_strided( + size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); +} + +at::Tensor wrapper_as_strided( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride, + std::optional storage_offset) { + return at::native::openreg::as_strided(self, size, stride, storage_offset); +} + +const at::Tensor& wrapper_resize_( + const at::Tensor& self, + c10::SymIntArrayRef size, + ::std::optional memory_format) { + return at::native::openreg::resize_(self, size, memory_format); +} + +at::Tensor wrapper__reshape_alias( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride) { + return at::native::openreg::_reshape_alias(self, size, stride); +} + +at::Tensor wrapper__copy_from( + const at::Tensor& self, + const at::Tensor& dst, + bool non_blocking) { + return at::native::openreg::_copy_from(self, dst, non_blocking); +} + +at::Tensor wrapper__copy_from_and_resize( + const at::Tensor& self, + const at::Tensor& dst) { + return at::native::openreg::_copy_from_and_resize(self, dst); +} + +at::Scalar wrapper__local_scalar_densor(const at::Tensor& self) { + return at::native::openreg::_local_scalar_dense(self); +} + +at::Tensor& wrapper_set_source_Tensor_( + at::Tensor& self, + const at::Tensor& source) { + return at::native::openreg::set_source_Tensor_(self, source); +} + +at::Tensor& wrapper_set_source_Storage_(at::Tensor& self, at::Storage source) { + return at::native::openreg::set_source_Storage_(self, source); +} + +at::Tensor& wrapper_set_source_Storage_storage_offsetset_( + at::Tensor& result, + at::Storage storage, + int64_t storage_offset, + c10::IntArrayRef size, + c10::IntArrayRef stride) { + return at::native::openreg::set_source_Storage_storage_offset_( + result, storage, storage_offset, size, stride); +} + +at::Tensor wrapper_view(const at::Tensor& self, 
c10::SymIntArrayRef size) { + return at::native::openreg::view(self, size); +} + +// LITERALINCLUDE START: FALLBACK WRAPPER +void wrapper_cpu_fallback( + const c10::OperatorHandle& op, + torch::jit::Stack* stack) { + at::native::openreg::cpu_fallback(op, stack); +} +// LITERALINCLUDE END: FALLBACK WRAPPER + +} // namespace + +// LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT +TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { + m.impl("empty.memory_format", wrapper_empty_memory_format); + m.impl("empty_strided", wrapper_empty_strided); + m.impl("as_strided", wrapper_as_strided); + m.impl("resize_", wrapper_resize_); + m.impl("_reshape_alias", wrapper__reshape_alias); + m.impl("_copy_from", wrapper__copy_from); + m.impl("_copy_from_and_resize", wrapper__copy_from_and_resize); + m.impl("_local_scalar_dense", wrapper__local_scalar_densor); + m.impl("set_.source_Tensor", wrapper_set_source_Tensor_); + m.impl("set_.source_Storage", wrapper_set_source_Storage_); + m.impl( + "set_.source_Storage_storage_offset", + wrapper_set_source_Storage_storage_offsetset_); + m.impl("view", wrapper_view); +} +// LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT + +// LITERALINCLUDE START: FALLBACK GLOBAL +TORCH_LIBRARY_IMPL(_, PrivateUse1, m) { + m.fallback( + torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>()); +} +// LITERALINCLUDE END: FALLBACK GLOBAL + +// LITERALINCLUDE START: FALLBACK SINGLE +TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { + m.impl( + "sub.Tensor", + torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>()); +} +// LITERALINCLUDE END: FALLBACK SINGLE + +} // namespace at::openreg diff --git a/PyTorchSimDevice2/csrc/aten/native/Common.h b/PyTorchSimDevice2/csrc/aten/native/Common.h new file mode 100644 index 00000000..c17196d0 --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/native/Common.h @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +namespace at::native::openreg { + +class MemoryGuard { + public: + template + explicit MemoryGuard(const Args&... args) { + (find_and_unprotect_tensors(args), ...); + } + + ~MemoryGuard() noexcept { + for (void* ptr : unprotected_pointers_) { + orMemoryProtect(ptr); + } + } + + MemoryGuard(const MemoryGuard&) = delete; + MemoryGuard& operator=(const MemoryGuard&) = delete; + MemoryGuard(MemoryGuard&&) = delete; + MemoryGuard& operator=(MemoryGuard&&) = delete; + + private: + template + void find_and_unprotect_tensors(const T& item) { + if constexpr (std::is_base_of_v) { + unprotect_if_needed(item); + } else if constexpr (std::is_same_v) { + if (item.isTensor()) { + unprotect_if_needed(item.toTensor()); + } else if (item.isTensorList()) { + for (const at::Tensor& tensor : item.toTensorListRef()) { + unprotect_if_needed(tensor); + } + } else if (item.isList()) { + for (const c10::IValue& element : item.toListRef()) { + find_and_unprotect_tensors(element); + } + } else if (item.isGenericDict()) { + for (const auto& [key, value] : item.toGenericDict()) { + find_and_unprotect_tensors(key); + find_and_unprotect_tensors(value); + } + } + } + } + + void unprotect_if_needed(const at::TensorBase& tensor) { + if (!tensor.defined() || !tensor.has_storage()) { + return; + } + + void* ptr = tensor.data_ptr(); + orPointerAttributes attr; + + if (orPointerGetAttributes(&attr, ptr) != orSuccess || + attr.type != orMemoryTypeDevice) { + return; + } + + auto [it, inserted] = unprotected_pointers_.insert(attr.pointer); + if (inserted) { + orMemoryUnprotect(attr.pointer); + } + } + + std::unordered_set unprotected_pointers_; +}; + +} // namespace at::native::openreg diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp new file mode 100644 index 00000000..129ad621 --- /dev/null +++ 
b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp @@ -0,0 +1,210 @@ +#include "Extra.h" + +namespace at::native::openreg { + +at::Tensor quantize_per_tensor( + const at::Tensor& self, + double scale, + int64_t zero_point, + at::ScalarType dtype) { + return at::native::quantize_per_tensor(self, scale, zero_point, dtype); +} + +int64_t _fused_sdp_choice( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_mask, + double dropout_p, + bool is_causal, + std::optional scale, + bool enable_gqa) { + auto backend = sdp::SDPBackend::overrideable; + return static_cast(backend); +} + +void quantize_tensor_per_tensor_affine_stub( + const at::Tensor& rtensor, + at::Tensor& qtensor, + double scale, + int64_t zero_point) {} + +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + c10::SymInt, + c10::SymInt, + at::Tensor, + at::Tensor, + at::Tensor> +_scaled_dot_product_fused_attention_overrideable( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_bias, + double dropout_p, + bool is_causal, + bool return_debug_mask, + std::optional scale) { + const int64_t batch_size = query.size(0); + const int64_t num_heads = query.size(1); + const int64_t head_dim_v = value.size(3); + const int64_t max_seqlen_q = query.size(2); + const int64_t max_seqlen_kv = key.size(2); + + auto opts = query.options(); + auto output = + at::empty({batch_size, num_heads, max_seqlen_q, head_dim_v}, opts); + auto logsumexp = + at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); + auto debug_attn_mask = at::empty( + {batch_size, num_heads, max_seqlen_q, max_seqlen_kv}, + opts.dtype(at::kFloat)); + auto philox_seed = at::empty({}, at::dtype(at::kLong)); + auto philox_offset = at::empty({}, at::dtype(at::kLong)); + + return std::make_tuple( + output, + logsumexp, + at::Tensor(), + at::Tensor(), + max_seqlen_q, + max_seqlen_kv, + philox_seed, + philox_offset, + 
debug_attn_mask); +} + +std::tuple +_scaled_dot_product_fused_attention_overrideable_backward( + const at::Tensor& grad_out, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& attn_bias, + std::array grad_input_mask, + const at::Tensor& out, + const at::Tensor& logsumexp, + const at::Tensor& cum_seq_q, + const at::Tensor& cum_seq_k, + int64_t max_q, + int64_t max_k, + double dropout_p, + bool is_causal, + const at::Tensor& philox_seed, + const at::Tensor& philox_offset, + std::optional scale) { + return std::tuple( + at::empty_like(query), + at::empty_like(key), + at::empty_like(value), + at::empty_like(attn_bias)); +} + +namespace { +struct CustomAutogradFnReturnsSelf + : public torch::autograd::Function { + static at::Tensor forward( + torch::autograd::AutogradContext* ctx, + at::Tensor self) { + return self; + } + + static torch::autograd::variable_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::variable_list grad_output) { + return {grad_output[0] * 0.5}; + } +}; + +struct CustomAutogradFnAliasing + : public torch::autograd::Function { + static at::Tensor forward( + torch::autograd::AutogradContext* ctx, + at::Tensor self) { + return self.view_symint(self.sym_sizes()); + } + + static torch::autograd::variable_list backward( + torch::autograd::AutogradContext* ctx, + torch::autograd::variable_list grad_output) { + return {grad_output[0] * 0.5}; + } +}; +} // namespace + +at::Tensor custom_autograd_fn_returns_self(at::Tensor x) { + return CustomAutogradFnReturnsSelf::apply(x); +} + +at::Tensor custom_autograd_fn_aliasing(at::Tensor x) { + return CustomAutogradFnAliasing::apply(x); +} + +/* + This implementation is only used to test stub registration, so not all + capabilities are fully supported. 
+ + Current Limitations: + - dtype: Float only + - input tensor: must be contiguous layout +*/ +// LITERALINCLUDE START: STUB ABS +void abs_kernel(at::TensorIteratorBase& iter) { + TORCH_CHECK(iter.ntensors() == 2, "Abs kernel expects 2 tensors"); + TORCH_CHECK( + iter.common_dtype() == at::ScalarType::Float, + "Abs kernel only supports float type"); + + auto& output_tensor = iter.tensor(0); + auto& input_tensor = iter.tensor(1); + + TORCH_CHECK( + input_tensor.sizes() == output_tensor.sizes(), + "Input and output tensor sizes must match."); + + auto abs_loop = [](float* out_ptr, const float* in_ptr, int64_t n) { + for (int64_t i = 0; i < n; ++i) { + out_ptr[i] = std::abs(in_ptr[i]); + } + }; + + MemoryGuard guard(input_tensor, output_tensor); + + if (iter.is_contiguous()) { + abs_loop( + static_cast(iter.data_ptr(0)), + static_cast(iter.data_ptr(1)), + iter.numel()); + } else { + TORCH_CHECK( + input_tensor.is_contiguous(), "Input tensor must be contiguous.") + + auto output = at::empty( + input_tensor.sizes(), + input_tensor.options().memory_format( + input_tensor.suggest_memory_format())); + + MemoryGuard guard(output); + + abs_loop( + static_cast(output.data_ptr()), + static_cast(iter.data_ptr(1)), + iter.numel()); + + output_tensor.copy_(output); + } +} +// LITERALINCLUDE END: STUB ABS + +at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out) { + return at::native::abs_out(self, out); +} + +at::Tensor custom_abs(at::Tensor x) { + return at::abs(x); +} + +} // namespace at::native::openreg diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.h b/PyTorchSimDevice2/csrc/aten/native/Extra.h new file mode 100644 index 00000000..f002949a --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/native/Extra.h @@ -0,0 +1,69 @@ +#include "Common.h" + +namespace at::native::openreg { + +at::Tensor quantize_per_tensor( + const at::Tensor& self, + double scale, + int64_t zero_point, + at::ScalarType dtype); +int64_t _fused_sdp_choice( + const at::Tensor& query, + const 
at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_mask, + double dropout_p, + bool is_causal, + std::optional scale, + bool enable_gqa); +void quantize_tensor_per_tensor_affine_stub( + const at::Tensor& rtensor, + at::Tensor& qtensor, + double scale, + int64_t zero_point); +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + c10::SymInt, + c10::SymInt, + at::Tensor, + at::Tensor, + at::Tensor> +_scaled_dot_product_fused_attention_overrideable( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const std::optional& attn_bias, + double dropout_p, + bool is_causal, + bool return_debug_mask, + std::optional scale); +std::tuple +_scaled_dot_product_fused_attention_overrideable_backward( + const at::Tensor& grad_out, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& attn_bias, + std::array grad_input_mask, + const at::Tensor& out, + const at::Tensor& logsumexp, + const at::Tensor& cum_seq_q, + const at::Tensor& cum_seq_k, + int64_t max_q, + int64_t max_k, + double dropout_p, + bool is_causal, + const at::Tensor& philox_seed, + const at::Tensor& philox_offset, + std::optional scale); + +at::Tensor custom_autograd_fn_returns_self(at::Tensor x); +at::Tensor custom_autograd_fn_aliasing(at::Tensor x); +at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out); +void abs_kernel(at::TensorIteratorBase& iter); +at::Tensor custom_abs(at::Tensor x); + +} // namespace at::native::openreg diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp b/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp new file mode 100644 index 00000000..8a3263bb --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp @@ -0,0 +1,185 @@ +#include "Minimal.h" + +#include + +namespace at::native::openreg { + +// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL +at::Tensor empty_memory_format( + c10::IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + 
std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt) { + const auto device = c10::device_or_default(device_opt); + const auto dtype = c10::dtype_or_default(dtype_opt); + TORCH_CHECK(device.is_privateuseone()); + TORCH_CHECK( + c10::layout_or_default(layout_opt) == c10::Layout::Strided, + "Non strided layout not supported"); + TORCH_CHECK( + !c10::pinned_memory_or_default(pin_memory_opt), + "Pin memory can only be on CPU"); + const c10::DeviceGuard device_guard(device); + constexpr c10::DispatchKeySet pu1_dks(c10::DispatchKey::PrivateUse1); + auto allocator = at::GetAllocator(at::kPrivateUse1); + return at::detail::empty_generic( + size, allocator, pu1_dks, dtype, memory_format_opt); +} +// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL + +at::Tensor empty_strided( + c10::IntArrayRef size, + c10::IntArrayRef stride, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt) { + const auto device = c10::device_or_default(device_opt); + const auto dtype = c10::dtype_or_default(dtype_opt); + TORCH_CHECK(device.is_privateuseone()); + TORCH_CHECK( + c10::layout_or_default(layout_opt) == c10::Layout::Strided, + "Non strided layout not supported"); + TORCH_CHECK( + !c10::pinned_memory_or_default(pin_memory_opt), + "Pin memory can only be on CPU"); + const c10::DeviceGuard device_guard(device); + constexpr c10::DispatchKeySet pu1_dks(c10::DispatchKey::PrivateUse1); + auto allocator = at::GetAllocator(at::kPrivateUse1); + return at::detail::empty_strided_generic( + size, stride, allocator, pu1_dks, dtype); +} + +at::Tensor as_strided( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride, + std::optional storage_offset) { + MemoryGuard guard(self); + + return at::cpu::as_strided_symint(self, size, stride, storage_offset); +} + +const at::Tensor& resize_( + const at::Tensor& self, + c10::SymIntArrayRef size, + ::std::optional memory_format) { + return 
at::native::resize_( + self, C10_AS_INTARRAYREF_SLOW(size), memory_format); +} + +at::Tensor _reshape_alias( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride) { + return at::native::_reshape_alias( + self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride)); +} + +at::Tensor _copy_from( + const at::Tensor& self, + const at::Tensor& dst, + bool non_blocking) { + TORCH_CHECK(self.defined(), "Source tensor (self) is not defined."); + TORCH_CHECK(dst.defined(), "Destination tensor (dst) is not defined."); + + MemoryGuard guard(self, dst); + + if (self.device() == dst.device()) { + at::Tensor dst_as_cpu = at::from_blob( + dst.data_ptr(), + dst.sizes(), + dst.strides(), + dst.options().device(at::kCPU)); + const at::Tensor self_as_cpu = at::from_blob( + self.data_ptr(), + self.sizes(), + self.strides(), + self.options().device(at::kCPU)); + + at::native::copy_( + const_cast(dst_as_cpu), self_as_cpu, non_blocking); + + } else { + if (self.is_cpu()) { + at::Tensor dst_as_cpu = at::from_blob( + dst.data_ptr(), + dst.sizes(), + dst.strides(), + dst.options().device(at::kCPU)); + + at::native::copy_( + const_cast(dst_as_cpu), self, non_blocking); + + } else { + at::Tensor self_as_cpu = at::from_blob( + self.data_ptr(), + self.sizes(), + self.strides(), + self.options().device(at::kCPU)); + + at::native::copy_( + const_cast(dst), self_as_cpu, non_blocking); + } + } + + return dst; +} + +at::Tensor _copy_from_and_resize( + const at::Tensor& self, + const at::Tensor& dst) { + at::native::resize_(dst, self.sizes(), std::nullopt); + return at::native::copy_(const_cast(dst), self, false); +} + +at::Scalar _local_scalar_dense(const at::Tensor& self) { + MemoryGuard guard(self); + return at::native::_local_scalar_dense_cpu(self); +} + +at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source) { + return at::native::set_tensor_(self, source); +} + +at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source) { + 
return at::native::set_(self, source); +} + +at::Tensor& set_source_Storage_storage_offset_( + at::Tensor& result, + at::Storage storage, + int64_t storage_offset, + c10::IntArrayRef size, + c10::IntArrayRef stride) { + return at::cpu::set_(result, storage, storage_offset, size, stride); +} + +at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) { + MemoryGuard guard(self); + return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size)); +} + +// LITERALINCLUDE START: FALLBACK IMPL +void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + static const std::unordered_set cpu_fallback_blocklist = { + c10::OperatorName("aten::abs", ""), + c10::OperatorName("aten::abs", "out"), + }; + + const auto& op_name = op.schema().operator_name(); + if (cpu_fallback_blocklist.count(op_name)) { + TORCH_CHECK( + false, + "Operator '", + op_name, + "' is not implemented for device openreg."); + } else { + at::native::cpu_fallback(op, stack); + } +} +// LITERALINCLUDE END: FALLBACK IMPL + +} // namespace at::native::openreg diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.h b/PyTorchSimDevice2/csrc/aten/native/Minimal.h new file mode 100644 index 00000000..a2e5cf02 --- /dev/null +++ b/PyTorchSimDevice2/csrc/aten/native/Minimal.h @@ -0,0 +1,61 @@ +#include "Common.h" + +namespace at::native::openreg { + +at::Tensor empty_memory_format( + c10::IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); + +at::Tensor empty_strided( + c10::IntArrayRef size, + c10::IntArrayRef stride, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); + +at::Tensor as_strided( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride, + std::optional storage_offset); + +const at::Tensor& resize_( + const at::Tensor& self, + c10::SymIntArrayRef size, + ::std::optional 
memory_format); + +at::Tensor _reshape_alias( + const at::Tensor& self, + c10::SymIntArrayRef size, + c10::SymIntArrayRef stride); + +at::Tensor _copy_from( + const at::Tensor& self, + const at::Tensor& dst, + bool non_blocking); + +at::Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst); + +at::Scalar _local_scalar_dense(const at::Tensor& self); + +at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source); + +at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source); + +at::Tensor& set_source_Storage_storage_offset_( + at::Tensor& result, + at::Storage storage, + int64_t storage_offset, + c10::IntArrayRef size, + c10::IntArrayRef stride); + +at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size); + +void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +} // namespace at::native::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp new file mode 100644 index 00000000..3d35b677 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp @@ -0,0 +1,8 @@ +#include "OpenRegDeviceAllocator.h" + +namespace c10::openreg { + +static OpenRegDeviceAllocator global_openreg_alloc; +REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_openreg_alloc); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h new file mode 100644 index 00000000..c9aea4a9 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h @@ -0,0 +1,43 @@ +#include + +#include +#include + +#include + +namespace c10::openreg { +struct OpenRegDeviceAllocator final : at::Allocator { + OpenRegDeviceAllocator() = default; + + static void ReportAndDelete(void* ptr) { + if (!ptr) { + return; + } + orFreeHost(ptr); + } + + at::DataPtr allocate(size_t nbytes) override { + int current_device_index = -1; + 
orGetDevice(¤t_device_index); + + auto curr_device = + c10::Device(c10::DeviceType::PrivateUse1, current_device_index); + void* data = nullptr; + if (nbytes > 0) { + orMalloc(&data, nbytes); + TORCH_CHECK( + data, "Failed to allocator ", nbytes, " bytes on openreg device."); + } + return {data, data, &ReportAndDelete, curr_device}; + } + + at::DeleterFnPtr raw_deleter() const override { + return &ReportAndDelete; + } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + orMemcpy(dest, src, count, orMemcpyDeviceToDevice); + } +}; + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h b/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h new file mode 100644 index 00000000..e869cf0d --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h @@ -0,0 +1,146 @@ +#pragma once + +#include + +#include "OpenRegException.h" +#include "OpenRegStream.h" + +namespace c10::openreg { + +struct OpenRegEvent { + OpenRegEvent(bool enable_timing) noexcept : enable_timing_{enable_timing} {} + + ~OpenRegEvent() { + if (is_created_) { + OPENREG_CHECK(orEventDestroy(event_)); + } + } + + OpenRegEvent(const OpenRegEvent&) = delete; + OpenRegEvent& operator=(const OpenRegEvent&) = delete; + + OpenRegEvent(OpenRegEvent&& other) noexcept { + moveHelper(std::move(other)); + } + OpenRegEvent& operator=(OpenRegEvent&& other) noexcept { + if (this != &other) { + moveHelper(std::move(other)); + } + return *this; + } + + operator orEvent_t() const { + return event(); + } + + std::optional device() const { + if (is_created_) { + return at::Device(at::kPrivateUse1, device_index_); + } else { + return std::nullopt; + } + } + + bool isCreated() const { + return is_created_; + } + + DeviceIndex device_index() const { + return device_index_; + } + + orEvent_t event() const { + return event_; + } + + bool query() const { + if (!is_created_) { + return true; + } + + orError_t err = orEventQuery(event_); + if (err == orSuccess) { + return 
true; + } + + return false; + } + + void record() { + record(getCurrentOpenRegStream()); + } + + void recordOnce(const OpenRegStream& stream) { + if (!was_recorded_) + record(stream); + } + + void record(const OpenRegStream& stream) { + if (!is_created_) { + createEvent(stream.device_index()); + } + + TORCH_CHECK( + device_index_ == stream.device_index(), + "Event device ", + device_index_, + " does not match recording stream's device ", + stream.device_index(), + "."); + + OPENREG_CHECK(orEventRecord(event_, stream)); + was_recorded_ = true; + } + + void block(const OpenRegStream& stream) { + if (is_created_) { + OPENREG_CHECK(orStreamWaitEvent(stream, event_, 0)); + } + } + + float elapsed_time(const OpenRegEvent& other) const { + TORCH_CHECK_VALUE( + !(enable_timing_ & orEventDisableTiming) && + !(other.enable_timing_ & orEventDisableTiming), + "Both events must be created with argument 'enable_timing=True'."); + TORCH_CHECK_VALUE( + is_created_ && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + TORCH_CHECK( + query() && other.query(), + "Both events must be completed before calculating elapsed time."); + + float time_ms = 0; + OPENREG_CHECK(orEventElapsedTime(&time_ms, event_, other.event_)); + return time_ms; + } + + void synchronize() const { + if (is_created_) { + OPENREG_CHECK(orEventSynchronize(event_)); + } + } + + private: + unsigned int enable_timing_{orEventDisableTiming}; + bool is_created_{false}; + bool was_recorded_{false}; + DeviceIndex device_index_{-1}; + orEvent_t event_{}; + + void createEvent(DeviceIndex device_index) { + device_index_ = device_index; + OPENREG_CHECK(orEventCreateWithFlags(&event_, enable_timing_)); + is_created_ = true; + } + + void moveHelper(OpenRegEvent&& other) { + std::swap(enable_timing_, other.enable_timing_); + std::swap(is_created_, other.is_created_); + std::swap(was_recorded_, other.was_recorded_); + std::swap(device_index_, other.device_index_); + std::swap(event_, 
other.event_); + } +}; + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp new file mode 100644 index 00000000..09eb09b6 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp @@ -0,0 +1,9 @@ +#include "OpenRegException.h" + +void orCheckFail( + const char* func, + const char* file, + uint32_t line, + const char* msg) { + throw ::c10::Error({func, file, line}, msg); +} diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.h b/PyTorchSimDevice2/csrc/runtime/OpenRegException.h new file mode 100644 index 00000000..16c1ee1c --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegException.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include + +void orCheckFail( + const char* func, + const char* file, + uint32_t line, + const char* msg = ""); + +#define OPENREG_CHECK(EXPR, ...) \ + do { \ + const orError_t __err = EXPR; \ + if (__err != orSuccess) { \ + orCheckFail( \ + __func__, __FILE__, static_cast(__LINE__), ##__VA_ARGS__); \ + } \ + } while (0) diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp new file mode 100644 index 00000000..566bacd0 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp @@ -0,0 +1,74 @@ +#include + +#include "OpenRegException.h" +#include "OpenRegFunctions.h" + +namespace c10::openreg { + +orError_t GetDeviceCount(int* dev_count) { + return orGetDeviceCount(dev_count); +} + +orError_t GetDevice(c10::DeviceIndex* device) { + int tmp_device = -1; + auto err = orGetDevice(&tmp_device); + *device = static_cast(tmp_device); + return err; +} + +orError_t SetDevice(c10::DeviceIndex device) { + int cur_device = -1; + orGetDevice(&cur_device); + if (device == cur_device) { + return orSuccess; + } + return orSetDevice(device); +} + +int device_count_impl() { + int count = 0; + GetDeviceCount(&count); + return count; +} + +OPENREG_EXPORT 
c10::DeviceIndex device_count() noexcept { + // initialize number of devices only once + static int count = []() { + try { + auto result = device_count_impl(); + TORCH_INTERNAL_ASSERT( + result <= std::numeric_limits::max(), + "Too many devices, DeviceIndex overflowed"); + return result; + } catch (const c10::Error& ex) { + // We don't want to fail, but still log the warning + // msg() returns the message without the stack trace + TORCH_WARN("Device initialization: ", ex.msg()); + return 0; + } + }(); + return static_cast(count); +} + +OPENREG_EXPORT c10::DeviceIndex current_device() { + c10::DeviceIndex cur_device = -1; + GetDevice(&cur_device); + return cur_device; +} + +OPENREG_EXPORT void set_device(c10::DeviceIndex device) { + SetDevice(device); +} + +OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device) { + int current_device = -1; + orGetDevice(&current_device); + + if (device != current_device) { + orSetDevice(device); + } + + return current_device; +} + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h new file mode 100644 index 00000000..c2eb1e80 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include + +#include + +#include + +namespace c10::openreg { + +OPENREG_EXPORT c10::DeviceIndex device_count() noexcept; +OPENREG_EXPORT c10::DeviceIndex current_device(); +OPENREG_EXPORT void set_device(c10::DeviceIndex device); + +OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp new file mode 100644 index 00000000..c2e03f66 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp @@ -0,0 +1,28 @@ +#include "OpenRegGenerator.h" + +// Default, global generators, one per device. 
+static std::vector default_generators; + +namespace c10::openreg { + +const at::Generator& getDefaultOpenRegGenerator(c10::DeviceIndex device_index) { + static bool flag [[maybe_unused]] = []() { + auto deivce_nums = device_count(); + default_generators.resize(deivce_nums); + for (auto i = 0; i < deivce_nums; i++) { + default_generators[i] = at::make_generator(i); + default_generators[i].seed(); + } + return true; + }(); + + c10::DeviceIndex idx = device_index; + if (idx == -1) { + idx = current_device(); + } else { + TORCH_CHECK(idx >= 0 && idx < device_count()); + } + return default_generators[idx]; +} + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h new file mode 100644 index 00000000..877a9707 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h @@ -0,0 +1,21 @@ +#include +#include + +#include + +#include "OpenRegFunctions.h" + +namespace c10::openreg { +class OpenRegGeneratorImpl : public at::CPUGeneratorImpl { + public: + OpenRegGeneratorImpl(c10::DeviceIndex device_index) { + device_ = c10::Device(c10::DeviceType::PrivateUse1, device_index); + key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1); + } + ~OpenRegGeneratorImpl() override = default; +}; + +const at::Generator& getDefaultOpenRegGenerator( + c10::DeviceIndex device_index = -1); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp new file mode 100644 index 00000000..d50e56e4 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp @@ -0,0 +1,7 @@ +#include "OpenRegGuard.h" + +namespace c10::openreg { + +C10_REGISTER_GUARD_IMPL(PrivateUse1, OpenRegGuardImpl); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h new file mode 100644 index 00000000..f0150fe6 --- /dev/null +++ 
b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h @@ -0,0 +1,197 @@ +#include +#include + +#include + +#include "OpenRegFunctions.h" + +namespace c10::openreg { + +// Device guard registration +struct OpenRegGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1; + + OpenRegGuardImpl() = default; + explicit OpenRegGuardImpl(c10::DeviceType t) { + TORCH_INTERNAL_ASSERT(t == static_type); + } + + /** + * Return the type of device managed by this guard implementation. + */ + c10::DeviceType type() const override { + return static_type; + } + + /** + * Set the current device to Device, and return the previous c10::Device. + */ + c10::Device exchangeDevice(c10::Device d) const override { + TORCH_CHECK(d.is_privateuseone()); + + auto old_device_index = ExchangeDevice(d.index()); + return c10::Device(static_type, old_device_index); + } + + /** + * Get the current device. + */ + c10::Device getDevice() const override { + int device_index = current_device(); + return c10::Device(static_type, device_index); + } + + /** + * Set the current device to c10::Device. + */ + void setDevice(c10::Device d) const override { + TORCH_CHECK(d.is_privateuseone()); + + set_device(d.index()); + } + + /** + * Set the current device to c10::Device, without checking for errors + * (so, e.g., this can be called from a destructor). + */ + void uncheckedSetDevice(c10::Device d) const noexcept override { + TORCH_CHECK(d.is_privateuseone()); + + set_device(d.index()); + } + + /** + * Get the current stream for a given device. + */ + c10::Stream getStream(c10::Device d) const noexcept override { + return c10::Stream(c10::Stream::DEFAULT, d); + } + + /** + * Get the default stream for a given device. + */ + c10::Stream getDefaultStream(c10::Device d) const override { + return c10::Stream(c10::Stream::DEFAULT, d); + } + + /** + * Get a stream from the global pool for a given device. 
+ */ + c10::Stream getStreamFromGlobalPool( + c10::Device d, + bool isHighPriority = false) const override { + return c10::Stream(c10::Stream::DEFAULT, d); + } + + /** + * Return a new stream for a given device and priority. The stream will be + * copied and shared around, device backend should be able to correctly handle + * the lifetime of the stream. + */ + c10::Stream getNewStream(c10::Device d, int priority = 0) const override { + return c10::Stream(c10::Stream::DEFAULT, d); + } + + /** + * Set a stream to be the thread local current stream for its device. + * Return the previous stream for that device. You are NOT required + * to set the current device to match the device of this stream. + */ + c10::Stream exchangeStream(c10::Stream s) const noexcept override { + return s; + } + + /** + * Destroys the given event. + */ + void destroyEvent(void* event, const c10::DeviceIndex device_index) + const noexcept override {} + + /** + * Increments the event's version and enqueues a job with this version + * in the stream's work queue. When the stream process that job + * it notifies all streams waiting on / blocked by that version of the + * event to continue and marks that version as recorded. + * */ + void record( + void** event, + const c10::Stream& stream, + const c10::DeviceIndex device_index, + const c10::EventFlag flag) const override { + static int event_id = 1; + + if (!*event) + *event = reinterpret_cast(event_id++); + } + + /** + * Does nothing if the event has not been scheduled to be recorded. + * If the event was previously enqueued to be recorded, a command + * to wait for the version of the event that exists at the time of this call + * is inserted in the stream's work queue. + * When the stream reaches this command it will stop processing + * additional commands until that version of the event is marked as recorded. 
+ */ + void block(void* event, const c10::Stream& stream) const override {} + + /** + * Returns true if (and only if) + * (1) the event has never been scheduled to be recorded + * (2) the current version is marked as recorded. + * Returns false otherwise. + */ + bool queryEvent(void* event) const override { + return true; + } + + /** + * Get the number of devices. WARNING: This is REQUIRED to not raise + * an exception. If there is some sort of problem, e.g., driver error, + * you should report that there are zero available devices. + */ + c10::DeviceIndex deviceCount() const noexcept override { + int device_index = -1; + orGetDeviceCount(&device_index); + return device_index; + } + /** + * Return true if all the work previously enqueued on the stream for + * asynchronous execution has completed running on the device. + */ + bool queryStream(const c10::Stream& stream) const override { + return true; + } + + /** + * Wait (by blocking the calling thread) until all the work previously + * enqueued on the stream has completed running on the device. + */ + void synchronizeStream(const c10::Stream& stream) const override {} + + /** + * Wait (by blocking the calling thread) until all the work previously + * recorded on the event has completed running on the device. + */ + void synchronizeEvent(void* event) const override {} + + /** + * Ensure the caching allocator (if any) is aware that the given DataPtr is + * being used on the given stream, and that it should thus avoid recycling the + * DataPtr until all work on that stream is done. + */ + void recordDataPtrOnStream( + const c10::DataPtr& data_ptr, + const c10::Stream& stream) const override {} + + /** + * Fetch the elapsed time between two recorded events. 
+ */ + double elapsedTime( + void* event1, + void* event2, + const c10::DeviceIndex device_index) const override { + return 1; + } +}; + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp new file mode 100644 index 00000000..57bc2d9f --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp @@ -0,0 +1,11 @@ +#include "OpenRegHooks.h" + +namespace c10::openreg { + +static bool register_hook_flag [[maybe_unused]] = []() { + at::RegisterPrivateUse1HooksInterface(new OpenRegHooksInterface()); + + return true; +}(); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h new file mode 100644 index 00000000..656fba8e --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h @@ -0,0 +1,41 @@ +#include +#include + +#include +#include + +#include + +#include "OpenRegGenerator.h" + +namespace c10::openreg { +struct OpenRegHooksInterface : public at::PrivateUse1HooksInterface { + OpenRegHooksInterface() {}; + ~OpenRegHooksInterface() override = default; + + bool hasPrimaryContext(c10::DeviceIndex device_index) const override { + return true; + } + + at::Allocator* getPinnedMemoryAllocator() const override { + return at::getHostAllocator(at::kPrivateUse1); + } + + bool isPinnedPtr(const void* data) const override { + orPointerAttributes attr{}; + orPointerGetAttributes(&attr, data); + + return attr.type == orMemoryTypeHost; + } + + const at::Generator& getDefaultGenerator( + c10::DeviceIndex device_index) const override { + return getDefaultOpenRegGenerator(device_index); + } + + at::Generator getNewGenerator(c10::DeviceIndex device_index) const override { + return at::make_generator(device_index); + } +}; + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp new file mode 100644 index 
00000000..55263803 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp @@ -0,0 +1,8 @@ +#include "OpenRegHostAllocator.h" + +namespace c10::openreg { + +OpenRegHostAllocator caching_host_allocator; +REGISTER_HOST_ALLOCATOR(at::kPrivateUse1, &caching_host_allocator); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h new file mode 100644 index 00000000..edef545a --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h @@ -0,0 +1,48 @@ +#include + +#include +#include + +#include + +namespace c10::openreg { +struct OpenRegHostAllocator final : at::HostAllocator { + OpenRegHostAllocator() = default; + + static void ReportAndDelete(void* ptr) { + if (!ptr) { + return; + } + orFreeHost(ptr); + } + + at::DataPtr allocate(size_t nbytes) override { + void* data = nullptr; + if (nbytes > 0) { + orMallocHost(&data, nbytes); + TORCH_CHECK(data, "Failed to allocator ", nbytes, " bytes on host."); + } + return {data, data, &ReportAndDelete, at::Device(at::kCPU)}; + } + + at::DeleterFnPtr raw_deleter() const override { + return &ReportAndDelete; + } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + orMemcpy(dest, src, count, orMemcpyHostToHost); + } + + // ignore + bool record_event(void* ptr, void* ctx, c10::Stream stream) override { + return true; + } + void empty_cache() override {} + at::HostStats get_stats() override { + return at::HostStats(); + } + void reset_accumulated_stats() override {} + void reset_peak_stats() override {} +}; + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp new file mode 100644 index 00000000..43809d60 --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp @@ -0,0 +1,48 @@ +#include "OpenRegSerialization.h" + +namespace c10::openreg { +struct 
OpenRegBackendMeta : public c10::BackendMeta { + OpenRegBackendMeta(int version_number, int format_number) + : version_number_(version_number), format_number_(format_number) {} + + int version_number_{-1}; + int format_number_{-1}; +}; + +void for_serialization( + const at::Tensor& t, + std::unordered_map& m) { + auto meta_ptr = t.unsafeGetTensorImpl()->get_backend_meta(); + + if (meta_ptr != nullptr) { + auto o_meta_ptr = dynamic_cast(meta_ptr); + if (o_meta_ptr->version_number_ == 1) { + m["version_number"] = true; + } + if (o_meta_ptr->format_number_ == 29) { + m["format_number"] = true; + } + } +} + +void for_deserialization( + const at::Tensor& t, + std::unordered_map& m) { + int version_number{-1}; + int format_number{-1}; + + if (m.find("version_number") != m.end()) { + version_number = 1; + } + if (m.find("format_number") != m.end()) { + format_number = 29; + } + + c10::intrusive_ptr meta{std::unique_ptr( + new OpenRegBackendMeta(version_number, format_number))}; + t.unsafeGetTensorImpl()->set_backend_meta(meta); +} + +REGISTER_PRIVATEUSE1_SERIALIZATION(&for_serialization, &for_deserialization) + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h new file mode 100644 index 00000000..559e92ea --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h @@ -0,0 +1,10 @@ +#include + +#define REGISTER_PRIVATEUSE1_SERIALIZATION( \ + FOR_SERIALIZATION, FOR_DESERIALIZATION) \ + static int register_serialization() { \ + torch::jit::TensorBackendMetaRegistry( \ + c10::DeviceType::PrivateUse1, FOR_SERIALIZATION, FOR_DESERIALIZATION); \ + return 0; \ + } \ + static const int _temp = register_serialization(); diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp new file mode 100644 index 00000000..aa6c325d --- /dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp @@ -0,0 +1,253 @@ 
+#include "OpenRegStream.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace c10::openreg { + +namespace { + +// Global stream state and constants +static c10::once_flag init_flag; + +static DeviceIndex num_devices = -1; +static constexpr int kStreamsPerPoolBits = 5; +static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; +static constexpr int kStreamTypeBits = 2; + +/* + * The stream pools are lazily initialized when the first queue is requested + * for a device. The device flags track the initialization of each device. When + * a queue is requested, the next queue in the pool to be returned in a + * round-robin fashion, see Note [Stream Management]. + */ +static std::deque device_flags; +static std::vector, + c10::openreg::max_compile_time_stream_priorities>> + streams; +static std::deque< + std::array, max_compile_time_stream_priorities>> + priority_counters; + +static thread_local std::unique_ptr current_streams = nullptr; + +/* + * Note [StreamId assignment] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~ + * How do we assign stream IDs? + * + * -- 56 bits -- -- 5 bits -- -- 2 bits -- -- 1 bit -- + * zeros StreamIdIndex StreamIdType Ext/native stream + * ignored for ext ignored for ext + * + * Where StreamIdType: + * 00 = default stream + * 01 = normal stream + * 11 = external stream + * + * For external stream, StreamID is a orStream_t pointer. This means that last + * bit will always be 0. So when constructing StreamId for a native stream we + * set last bit to 1 to distinguish between native and external streams. + * + * StreamId is 64-bit, so we can just rely on regular promotion rules. 
+ * We rely on StreamIdIndex and StreamIdType being non-negative; + */ +using StreamIdIndex = uint8_t; +enum class StreamIdType : uint8_t { + DEFAULT = 0x0, + NORMAL = 0x1, + EXT = 0x3, +}; + +inline std::ostream& operator<<(std::ostream& stream, StreamIdType s) { + switch (s) { + case StreamIdType::DEFAULT: + return stream << "DEFAULT"; + case StreamIdType::NORMAL: + return stream << "NORMAL"; + case StreamIdType::EXT: + return stream << "EXT"; + default: + break; + } + + return stream << static_cast(s); +} + +static inline StreamIdType streamIdType(StreamId s) { + // Externally allocated streams have their id being the orStream_ptr + // so the last bit will be 0 + if (!(s & 1)) { + return StreamIdType(StreamIdType::EXT); + } + + int mask_for_type = (1 << kStreamTypeBits) - 1; + auto st = static_cast((s >> 1) & mask_for_type); + TORCH_CHECK( + st == StreamIdType::DEFAULT || st == StreamIdType::NORMAL, + "invalid StreamId: ", + s); + return st; +} + +static inline size_t streamIdIndex(StreamId s) { + return static_cast( + (s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1)); +} + +StreamId makeStreamId(StreamIdType st, size_t si) { + if (st == StreamIdType::EXT) { + return static_cast(0); + } + + return (static_cast(si) << (kStreamTypeBits + 1)) | + (static_cast(st) << 1) | 1; +} + +static void initGlobalStreamState() { + num_devices = device_count(); + device_flags.resize(num_devices); + streams.resize(num_devices); + priority_counters.resize(num_devices); +} + +static void initSingleDeviceStream( + int priority, + DeviceIndex device_index, + int i) { + auto& stream = streams[device_index][priority][i]; + + OPENREG_CHECK(orStreamCreateWithPriority(&stream, 0, priority)); + priority_counters[device_index][priority] = 0; +} + +// Creates stream pools for the specified device. It should be call only once. 
+static void initDeviceStreamState(DeviceIndex device_index) { + for (const auto i : c10::irange(kStreamsPerPool)) { + for (const auto p : c10::irange(max_compile_time_stream_priorities)) { + initSingleDeviceStream(p, device_index, i); + } + } +} + +static void initOpenRegStreamsOnce() { + c10::call_once(init_flag, initGlobalStreamState); + + if (current_streams) { + return; + } + + // Inits current streams (thread local) to the last queue in the "normal + // priority" queue pool. Note: the queue pool have not been initialized yet. + // It will be initialized in initDeviceStreamState for the specified device. + current_streams = std::make_unique(num_devices); + for (const auto i : c10::irange(num_devices)) { + current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0); + } +} + +static uint32_t get_idx(std::atomic& counter) { + auto raw_idx = counter++; + return raw_idx % kStreamsPerPool; +} + +OpenRegStream OpenRegStreamForId(DeviceIndex device_index, StreamId stream_id) { + return OpenRegStream( + OpenRegStream::UNCHECKED, + Stream( + Stream::UNSAFE, + c10::Device(DeviceType::PrivateUse1, device_index), + stream_id)); +} + +} // anonymous namespace + +// See Note [StreamId assignment] +orStream_t OpenRegStream::stream() const { + c10::DeviceIndex device_index = stream_.device_index(); + StreamId stream_id = stream_.id(); + StreamIdType st = streamIdType(stream_id); + size_t si = streamIdIndex(stream_id); + switch (st) { + // The index 0 stream is default as well. + case StreamIdType::DEFAULT: + case StreamIdType::NORMAL: + return streams[device_index][static_cast(st)][si]; + case StreamIdType::EXT: + return reinterpret_cast(stream_id); + default: + TORCH_CHECK( + false, + "Unrecognized stream ", + stream_, + " (I didn't recognize the stream type, ", + st, + ").", + " Did you manufacture the StreamId yourself? 
Don't do that;"); + } +} + +// Returns a stream from the requested pool +// Note: when called the first time on a device, this will create the +// stream pools for that device. +OpenRegStream getStreamFromPool(const int priority, DeviceIndex device_index) { + initOpenRegStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + } + c10::call_once( + device_flags[device_index], initDeviceStreamState, device_index); + auto pri_idx = + std::clamp(priority, 0, max_compile_time_stream_priorities - 1); + const auto idx = get_idx(priority_counters[device_index][pri_idx]); + auto id_type = static_cast(pri_idx); + return OpenRegStreamForId(device_index, makeStreamId(id_type, idx)); +} + +OpenRegStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) { + initOpenRegStreamsOnce(); + int priority = 0; + return getStreamFromPool(priority, device); +} + +OpenRegStream getStreamFromExternal( + orStream_t ext_stream, + DeviceIndex device_index) { + return OpenRegStreamForId( + device_index, reinterpret_cast(ext_stream)); +} + +OpenRegStream getDefaultOpenRegStream(DeviceIndex device_index) { + initOpenRegStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + } + return OpenRegStreamForId( + device_index, makeStreamId(StreamIdType::DEFAULT, 0)); +} + +OpenRegStream getCurrentOpenRegStream(DeviceIndex device_index) { + initOpenRegStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + } + return OpenRegStreamForId(device_index, current_streams[device_index]); +} + +void setCurrentOpenRegStream(OpenRegStream stream) { + initOpenRegStreamsOnce(); + current_streams[stream.device_index()] = stream.id(); +} + +std::ostream& operator<<(std::ostream& stream, const OpenRegStream& s) { + return stream << s.unwrap(); +} + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h new file mode 100644 index 00000000..e1fd0c71 --- 
/dev/null +++ b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h @@ -0,0 +1,162 @@ +#pragma once + +#include + +#include "OpenRegException.h" +#include "OpenRegFunctions.h" + +#include +#include +#include + +namespace c10::openreg { + +static constexpr int max_compile_time_stream_priorities = 1; + +class OpenRegStream { + public: + enum Unchecked { UNCHECKED }; + + explicit OpenRegStream(Stream stream) : stream_(stream) { + TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1); + } + + explicit OpenRegStream(Unchecked, Stream stream) : stream_(stream) {} + + bool operator==(const OpenRegStream& other) const noexcept { + return unwrap() == other.unwrap(); + } + + bool operator!=(const OpenRegStream& other) const noexcept { + return unwrap() != other.unwrap(); + } + + operator orStream_t() const { + return stream(); + } + + operator Stream() const { + return unwrap(); + } + + DeviceType device_type() const { + return DeviceType::PrivateUse1; + } + + DeviceIndex device_index() const { + return stream_.device_index(); + } + + Device device() const { + return Device(DeviceType::PrivateUse1, device_index()); + } + + StreamId id() const { + return stream_.id(); + } + + bool query() const { + DeviceGuard guard{stream_.device()}; + + if (orStreamQuery(stream()) == orSuccess) { + return true; + } + + return false; + } + + void synchronize() const { + DeviceGuard guard{stream_.device()}; + OPENREG_CHECK(orStreamSynchronize(stream())); + } + + int priority() const { + DeviceGuard guard{stream_.device()}; + int priority = 0; + OPENREG_CHECK(orStreamGetPriority(stream(), &priority)); + return priority; + } + + orStream_t stream() const; + + Stream unwrap() const { + return stream_; + } + + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + static OpenRegStream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return OpenRegStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + private: + Stream 
stream_; +}; + +/* + * Get a stream from the pool in a round-robin fashion. + * + * You can request a stream from the highest priority pool by setting + * isHighPriority to true for a specific device. + */ +OPENREG_EXPORT OpenRegStream +getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); + +/* + * Get a stream from the pool in a round-robin fashion. + * + * You can request a stream by setting a priority value for a specific device. + * The lower the priority number, the higher the priority. + */ +OPENREG_EXPORT OpenRegStream +getStreamFromPool(const int priority, DeviceIndex device = -1); + +/* + * Get an OpenRegStream from an externally allocated one. + * + * This is mainly for interoperability with different libraries where we + * want to operate on a non-torch allocated stream for data exchange or similar + * purposes. + */ +OPENREG_EXPORT OpenRegStream +getStreamFromExternal(orStream_t ext_stream, DeviceIndex device_index); + +/* + * Get the default OpenReg stream, for the passed OpenReg device, or for the + * current device if no device index is passed. + */ +OPENREG_EXPORT OpenRegStream +getDefaultOpenRegStream(DeviceIndex device_index = -1); + +/* + * Get the current OpenReg stream, for the passed OpenReg device, or for the + * current device if no device index is passed. + */ +OPENREG_EXPORT OpenRegStream +getCurrentOpenRegStream(DeviceIndex device_index = -1); + +/* + * Set the current stream on the device of the passed in stream to be the passed + * in stream. 
+ */ +OPENREG_EXPORT void setCurrentOpenRegStream(OpenRegStream stream); + +OPENREG_EXPORT std::ostream& operator<<( + std::ostream& stream, + const OpenRegStream& s); + +} // namespace c10::openreg + +namespace std { +template <> +struct hash { + size_t operator()(c10::openreg::OpenRegStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std diff --git a/PyTorchSimDevice2/include/Macros.h b/PyTorchSimDevice2/include/Macros.h new file mode 100644 index 00000000..c75523c2 --- /dev/null +++ b/PyTorchSimDevice2/include/Macros.h @@ -0,0 +1,7 @@ +#pragma once + +#ifdef _WIN32 +#define OPENREG_EXPORT __declspec(dllexport) +#else +#define OPENREG_EXPORT __attribute__((visibility("default"))) +#endif diff --git a/PyTorchSimDevice2/pyproject.toml b/PyTorchSimDevice2/pyproject.toml new file mode 100644 index 00000000..774fe5cd --- /dev/null +++ b/PyTorchSimDevice2/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = [ + "setuptools", + "wheel", + "torch", # Needed by setup.py for getting include of PyTorch +] + +build-backend = "setuptools.build_meta" + +[project] +name = "torch_openreg" +version = "0.0.1" +description = "A minimal reference implementation of an out-of-tree backend" +readme = "README.md" +requires-python = ">=3.9" +license = { text = "BSD-3-Clause" } +authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] +dependencies = [ + "torch", +] +# Add classifiers info for making lint happy +classifiers = [ + "Development Status :: 4 - Beta", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", +] + +[project.urls] +Homepage = "https://pytorch.org" +Repository = "https://github.com/pytorch/pytorch" +Documentation = "https://pytorch.org/docs" +Forum = "https://discuss.pytorch.org" diff --git a/PyTorchSimDevice2/setup.py 
b/PyTorchSimDevice2/setup.py new file mode 100644 index 00000000..01e2f065 --- /dev/null +++ b/PyTorchSimDevice2/setup.py @@ -0,0 +1,148 @@ +import multiprocessing +import os +import platform +import shutil +import subprocess +import sys +import sysconfig +from distutils.command.clean import clean + +from setuptools import Extension, find_packages, setup + + +# Env Variables +IS_DARWIN = platform.system() == "Darwin" +IS_WINDOWS = platform.system() == "Windows" + +BASE_DIR = os.path.dirname(os.path.realpath(__file__)) +RUN_BUILD_DEPS = any(arg in {"clean", "dist_info"} for arg in sys.argv) + + +def make_relative_rpath_args(path): + if IS_DARWIN: + return ["-Wl,-rpath,@loader_path/" + path] + elif IS_WINDOWS: + return [] + else: + return ["-Wl,-rpath,$ORIGIN/" + path] + + +def get_pytorch_dir(): + os.environ["TORCH_DEVICE_BACKEND_AUTOLOAD"] = "0" + import torch + + return os.path.dirname(os.path.realpath(torch.__file__)) + + +def build_deps(): + build_dir = os.path.join(BASE_DIR, "build") + os.makedirs(build_dir, exist_ok=True) + + cmake_args = [ + "-DCMAKE_INSTALL_PREFIX=" + + os.path.realpath(os.path.join(BASE_DIR, "torch_openreg")), + "-DPYTHON_INCLUDE_DIR=" + sysconfig.get_paths().get("include"), + "-DPYTORCH_INSTALL_DIR=" + get_pytorch_dir(), + ] + + subprocess.check_call( + ["cmake", BASE_DIR] + cmake_args, cwd=build_dir, env=os.environ + ) + + build_args = [ + "--build", + ".", + "--target", + "install", + "--config", # For multi-config generators + "Release", + "--", + ] + + if IS_WINDOWS: + build_args += ["/m:" + str(multiprocessing.cpu_count())] + else: + build_args += ["-j", str(multiprocessing.cpu_count())] + + command = ["cmake"] + build_args + subprocess.check_call(command, cwd=build_dir, env=os.environ) + + +class BuildClean(clean): + def run(self): + for i in ["build", "install", "torch_openreg/lib"]: + dirs = os.path.join(BASE_DIR, i) + if os.path.exists(dirs) and os.path.isdir(dirs): + shutil.rmtree(dirs) + + for dirpath, _, filenames in 
os.walk(os.path.join(BASE_DIR, "torch_openreg")): + for filename in filenames: + if filename.endswith(".so"): + os.remove(os.path.join(dirpath, filename)) + + +def main(): + if not RUN_BUILD_DEPS: + build_deps() + + if IS_WINDOWS: + # /NODEFAULTLIB makes sure we only link to DLL runtime + # and matches the flags set for protobuf and ONNX + extra_link_args: list[str] = ["/NODEFAULTLIB:LIBCMT.LIB"] + [ + *make_relative_rpath_args("lib") + ] + # /MD links against DLL runtime + # and matches the flags set for protobuf and ONNX + # /EHsc is about standard C++ exception handling + extra_compile_args: list[str] = ["/MD", "/FS", "/EHsc"] + else: + extra_link_args = [*make_relative_rpath_args("lib")] + extra_compile_args = [ + "-Wall", + "-Wextra", + "-Wno-strict-overflow", + "-Wno-unused-parameter", + "-Wno-missing-field-initializers", + "-Wno-unknown-pragmas", + "-fno-strict-aliasing", + ] + + ext_modules = [ + Extension( + name="torch_openreg._C", + sources=["torch_openreg/csrc/stub.c"], + language="c", + extra_compile_args=extra_compile_args, + libraries=["torch_bindings"], + library_dirs=[os.path.join(BASE_DIR, "torch_openreg/lib")], + extra_link_args=extra_link_args, + ) + ] + + package_data = { + "torch_openreg": [ + "lib/*.so*", + "lib/*.dylib*", + "lib/*.dll", + "lib/*.lib", + ] + } + + setup( + packages=find_packages(), + package_data=package_data, + ext_modules=ext_modules, + cmdclass={ + "clean": BuildClean, # type: ignore[misc] + }, + include_package_data=False, + entry_points={ + "torch.backends": [ + "torch_openreg = torch_openreg:_autoload", + ], + }, + ) + + +if __name__ == "__main__": + main() diff --git a/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt b/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt new file mode 100644 index 00000000..1bde7e00 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +project(TORCH_OPENREG CXX C) + + +set(LIBRARY_NAME openreg) 
+set(LIBRARY_TEST ortests) + +file(GLOB_RECURSE SOURCE_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp" +) + +add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES}) + +target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +install(TARGETS ${LIBRARY_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/PyTorchSimDevice2/third_party/openreg/README.md b/PyTorchSimDevice2/third_party/openreg/README.md new file mode 100644 index 00000000..0cee2c87 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/README.md @@ -0,0 +1,151 @@ +# OpenReg: An Accelerator Backend that Simulates CUDA Behavior on a CPU + +## Introduction + +OpenReg is a C++ backend library that simulates the behavior of a CUDA-like device on a CPU. Its core objective is **not to accelerate computation or improve performance**, but rather to **simulate modern CUDA programming, enabling developers to prototype and test in an environment without actual GPU hardware**. The current design principles are as follows: + +* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's `PrivateUse1` backend) to switch and test seamlessly. +* **Functional Consistency**: Provide behavior consistent with the CUDA Runtime, such as memory isolation, device context management, etc. +* **Completeness**: Aim to support `PrivateUse1` device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime. + +## Directory Structure + +The project's code is organized with a clear structure and separation of responsibilities: + +```text +openreg/ +├── README.md # Comprehensive introduction of OpenReg. 
+├── CMakeLists.txt # Top-level CMake build script, used to compile and generate libopenreg.so +├── cmake/ +│ └── GTestTargets.cmake # Utils of fetching GoogleTest. +├── include/ +│ ├── openreg.h # Public API header file, external users only need to include this file +│ └── openreg.inl # Public API header file, as an extension of openreg.h, cannot be included separately. +├── example/ +│ └── example.cpp # Example for OpenReg. +├── tests/ +│ ├── event_tests.cpp # Testcases about OpenReg Event. +│ ├── stream_tests.cpp # Testcases about OpenReg Stream. +│ ├── device_tests.cpp # Testcases about OpenReg Device. +│ └── memory_tests.cpp # Testcases about OpenReg Memory. +└── csrc/ + ├── device.cpp # Implementation of device management APIs + ├── memory.cpp # Implementation of memory management APIs + └── stream.cpp # Implementation of stream and event APIs. +``` + +* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library. +* `include`: Defines all externally exposed APIs, data structures, and enums. + * `openreg.h`: Defines all externally exposed C-style APIs. + * `openreg.inl`: Defines all externally exposed C++ APIs. +* `csrc/`: Contains the C++ implementation source code for all core functionalities. + * `device.cpp`: Implements the core functions of device management: device discovery and context management. + * `memory.cpp`: Implements the core functions of memory management: allocation, free, copy and memory protection. + * `stream.cpp`: Implements the core functions of stream and event: creation, destroy, record, synchronization and so on. + +## Implemented APIs + +OpenReg currently provides a set of APIs covering basic memory and device management. 
+ +### Device Management APIs + +| OpenReg | CUDA | Feature Description | +| :------------------------------- | :--------------------------------- | :--------------------------------- | +| `orGetDeviceCount` | `cudaGetDeviceCount` | Get the number of available GPUs | +| `orSetDevice` | `cudaSetDevice` | Set the active GPU | +| `orGetDevice` | `cudaGetDevice` | Get the current GPU | +| `orDeviceSynchronize` | `cudaDeviceSynchronize` | Wait for all GPU tasks to finish | +| `orDeviceGetStreamPriorityRange` | `cudaDeviceGetStreamPriorityRange` | Get the range of stream priorities | + +### Memory Management APIs + +| OpenReg | CUDA | Feature Description | +| :----------------------- | :------------------------- | :---------------------------------------- | +| `orMalloc` | `cudaMalloc` | Allocate device memory | +| `orFree` | `cudaFree` | Free device memory | +| `orMallocHost` | `cudaMallocHost` | Allocate page-locked (Pinned) host memory | +| `orFreeHost` | `cudaFreeHost` | Free page-locked host memory | +| `orMemcpy` | `cudaMemcpy` | Synchronous memory copy | +| `orMemcpyAsync` | `cudaMemcpyAsync` | Asynchronous memory copy | +| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes | + +### Stream APIs + +| OpenReg | CUDA | Feature Description | +| :--------------------------- | :----------------------------- | :------------------------------------- | +| `orStreamCreate` | `cudaStreamCreate` | Create a default-priority stream | +| `orStreamCreateWithPriority` | `cudaStreamCreateWithPriority` | Create a stream with a given priority | +| `orStreamDestroy` | `cudaStreamDestroy` | Destroy a stream | +| `orStreamQuery` | `cudaStreamQuery` | Check if a stream has completed | +| `orStreamSynchronize` | `cudaStreamSynchronize` | Wait for a stream to complete | +| `orStreamWaitEvent` | `cudaStreamWaitEvent` | Make a stream wait for an event | +| `orStreamGetPriority` | `cudaStreamGetPriority` | Get a stream’s priority | + +### Event APIs + +| OpenReg |
CUDA | Feature Description | +| :----------------------- | :------------------------- | :---------------------------------- | +| `orEventCreate` | `cudaEventCreate` | Create an event with default flag | +| `orEventCreateWithFlags` | `cudaEventCreateWithFlags` | Create an event with specific flag | +| `orEventDestroy` | `cudaEventDestroy` | Destroy an event | +| `orEventRecord` | `cudaEventRecord` | Record an event in a stream | +| `orEventSynchronize` | `cudaEventSynchronize` | Wait for an event to complete | +| `orEventQuery` | `cudaEventQuery` | Check if an event has completed | +| `orEventElapsedTime` | `cudaEventElapsedTime` | Get time elapsed between two events | + +## Implementation Principles + +### Device Management Principles + +Simulating multiple devices and thread-safe device context switching: + +1. **Device Count**: The total number of simulated devices is defined by the compile-time constant `constexpr int DEVICE_COUNT`. +2. **Device Switching**: Device switching in multi-threaded scenarios is simulated using a **TLS (Thread-Local Storage) global variable**. + +### Memory Management Principles + +Simulating device memory, host memory, and memory copies: + +1. **Allocation**: A page-aligned memory block is allocated using `mmap` + `mprotect` with the permission flag `PROT_NONE`. Read, write, and execute operations on this memory region are all prohibited. +2. **Deallocation**: Memory is freed using `munmap`. +3. **Authorization**: When a legitimate memory access is required, an RAII guard restores the memory permissions to `PROT_READ | PROT_WRITE`. The permissions are automatically reverted to `PROT_NONE` when the scope is exited. + +### Stream&Event Principles + +Simulating creation, release and synchronization for events and streams: + +1. **Event**: Each event is encapsulated as a task function and placed into a stream, which acts as a thread. Upon completion of the task, a flag within the event is modified to simulate the event's status. +2.
**Stream**: When each stream is requested, a new thread is created, which sequentially processes each task in the task queue within the stream structure. Tasks can be wrappers around kernel functions or events. +3. **Synchronization**: Synchronization between streams and events is achieved using multithreading, condition variables, and mutexes. + +## Usage Example + +Please refer to [example](example/example.cpp) for a complete example. + +The commands to compile and run example.cpp are as follows: + +```Shell +mkdir build + +pushd build +cmake .. +make -j 32 +popd + +g++ -o out example/example.cpp -L ./build -lopenreg +LD_LIBRARY_PATH=./build ./out +``` + +The output is as follows: + +```Shell +Current environment have 2 devices +Current is 0 device +All tasks have been submitted. +Kernel execution time: 0.238168 ms +Verification PASSED! +``` + +## Next Steps + +The most basic functions of the OpenReg backend are currently supported, and will be dynamically optimized and expanded based on the needs of PyTorch integration.
diff --git a/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake b/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake new file mode 100644 index 00000000..777fc489 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake @@ -0,0 +1,12 @@ +set(GTest_REL_PATH "../../../../../../../third_party/googletest") +get_filename_component(GTest_DIR "${CMAKE_CURRENT_LIST_DIR}/${GTest_REL_PATH}" ABSOLUTE) + +if(EXISTS "${GTest_DIR}/CMakeLists.txt") + message(STATUS "Found GTest: ${GTest_DIR}") + + set(BUILD_GMOCK OFF CACHE BOOL "Disable GMock build") + set(INSTALL_GTEST OFF CACHE BOOL "Disable GTest install") + add_subdirectory(${GTest_DIR} "${CMAKE_BINARY_DIR}/gtest") +else() + message(FATAL_ERROR "GTest Not Found") +endif() diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp new file mode 100644 index 00000000..9643bc59 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp @@ -0,0 +1,37 @@ +#include + +namespace { + +// Total device numbers +constexpr int DEVICE_COUNT = 2; +// Current device index +thread_local int gCurrentDevice = 0; + +} // namespace + +orError_t orGetDeviceCount(int* count) { + if (!count) { + return orErrorUnknown; + } + + *count = DEVICE_COUNT; + return orSuccess; +} + +orError_t orGetDevice(int* device) { + if (!device) { + return orErrorUnknown; + } + + *device = gCurrentDevice; + return orSuccess; +} + +orError_t orSetDevice(int device) { + if (device < 0 || device >= DEVICE_COUNT) { + return orErrorUnknown; + } + + gCurrentDevice = device; + return orSuccess; +} diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp new file mode 100644 index 00000000..6f02eeb0 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp @@ -0,0 +1,259 @@ +#include "memory.h" + +#include + +#include +#include + +namespace { + +struct Block { + 
orMemoryType type = orMemoryType::orMemoryTypeUnmanaged; + int device = -1; + void* pointer = nullptr; + size_t size = 0; + int refcount{0}; +}; + +class MemoryManager { + public: + static MemoryManager& getInstance() { + static MemoryManager instance; + return instance; + } + + orError_t allocate(void** ptr, size_t size, orMemoryType type) { + if (!ptr || size == 0) + return orErrorUnknown; + + std::lock_guard lock(m_mutex); + long page_size = openreg::get_pagesize(); + size_t aligned_size = ((size - 1) / page_size + 1) * page_size; + void* mem = nullptr; + int current_device = -1; + + if (type == orMemoryType::orMemoryTypeDevice) { + orGetDevice(¤t_device); + + mem = openreg::mmap(aligned_size); + if (mem == nullptr) + return orErrorUnknown; + if (openreg::mprotect(mem, aligned_size, F_PROT_NONE) != 0) { + openreg::munmap(mem, aligned_size); + return orErrorUnknown; + } + } else { + if (openreg::alloc(&mem, page_size, aligned_size) != 0) { + return orErrorUnknown; + } + } + + m_registry[mem] = {type, current_device, mem, aligned_size, 0}; + *ptr = mem; + return orSuccess; + } + + orError_t free(void* ptr) { + if (!ptr) + return orSuccess; + + std::lock_guard lock(m_mutex); + auto it = m_registry.find(ptr); + if (it == m_registry.end()) + return orErrorUnknown; + + const auto& info = it->second; + if (info.type == orMemoryType::orMemoryTypeDevice) { + openreg::mprotect(info.pointer, info.size, F_PROT_READ | F_PROT_WRITE); + openreg::munmap(info.pointer, info.size); + } else { + openreg::free(info.pointer); + } + + m_registry.erase(it); + return orSuccess; + } + + orError_t memcpy( + void* dst, + const void* src, + size_t count, + orMemcpyKind kind) { + if (!dst || !src || count == 0) + return orErrorUnknown; + + std::lock_guard lock(m_mutex); + Block* dst_info = getBlockInfoNoLock(dst); + Block* src_info = getBlockInfoNoLock(src); + + switch (kind) { + case orMemcpyHostToDevice: + if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) || + (src_info 
&& src_info->type == orMemoryType::orMemoryTypeDevice)) + return orErrorUnknown; + break; + case orMemcpyDeviceToHost: + if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) || + (!src_info || src_info->type != orMemoryType::orMemoryTypeDevice)) + return orErrorUnknown; + break; + case orMemcpyDeviceToDevice: + if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) || + (!src_info || src_info->type != orMemoryType::orMemoryTypeDevice)) + return orErrorUnknown; + break; + case orMemcpyHostToHost: + if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) || + (src_info && src_info->type == orMemoryType::orMemoryTypeDevice)) + return orErrorUnknown; + break; + } + + unprotectNoLock(dst_info); + unprotectNoLock(src_info); + ::memcpy(dst, src, count); + protectNoLock(dst_info); + protectNoLock(src_info); + + return orSuccess; + } + + orError_t getPointerAttributes( + orPointerAttributes* attributes, + const void* ptr) { + if (!attributes || !ptr) + return orErrorUnknown; + + std ::lock_guard lock(m_mutex); + Block* info = getBlockInfoNoLock(ptr); + + if (!info) { + attributes->type = orMemoryType::orMemoryTypeUnmanaged; + attributes->device = -1; + attributes->pointer = const_cast(ptr); + } else { + attributes->type = info->type; + attributes->device = info->device; + attributes->pointer = info->pointer; + } + + return orSuccess; + } + + orError_t unprotect(void* ptr) { + std::lock_guard lock(m_mutex); + return unprotectNoLock(getBlockInfoNoLock(ptr)); + } + + orError_t protect(void* ptr) { + std::lock_guard lock(m_mutex); + return protectNoLock(getBlockInfoNoLock(ptr)); + } + + private: + MemoryManager() = default; + + orError_t unprotectNoLock(Block* info) { + if (info && info->type == orMemoryType::orMemoryTypeDevice) { + if (info->refcount == 0) { + if (openreg::mprotect( + info->pointer, info->size, F_PROT_READ | F_PROT_WRITE) != 0) { + return orErrorUnknown; + } + } + + info->refcount++; + } + + return orSuccess; + } + + 
orError_t protectNoLock(Block* info) { + if (info && info->type == orMemoryType::orMemoryTypeDevice) { + if (info->refcount == 1) { + if (openreg::mprotect(info->pointer, info->size, F_PROT_NONE) != 0) { + return orErrorUnknown; + } + } + + info->refcount--; + } + + return orSuccess; + } + + Block* getBlockInfoNoLock(const void* ptr) { + auto it = m_registry.upper_bound(const_cast(ptr)); + if (it != m_registry.begin()) { + --it; + const char* p_char = static_cast(ptr); + const char* base_char = static_cast(it->first); + if (p_char >= base_char && p_char < (base_char + it->second.size)) { + return &it->second; + } + } + + return nullptr; + } + + std::map m_registry; + std::mutex m_mutex; +}; + +} // namespace + +orError_t orMalloc(void** devPtr, size_t size) { + return MemoryManager::getInstance().allocate( + devPtr, size, orMemoryType::orMemoryTypeDevice); +} + +orError_t orFree(void* devPtr) { + return MemoryManager::getInstance().free(devPtr); +} + +orError_t orMallocHost(void** hostPtr, size_t size) { + return MemoryManager::getInstance().allocate( + hostPtr, size, orMemoryType::orMemoryTypeHost); +} + +orError_t orFreeHost(void* hostPtr) { + return MemoryManager::getInstance().free(hostPtr); +} + +orError_t orMemcpy( + void* dst, + const void* src, + size_t count, + orMemcpyKind kind) { + return MemoryManager::getInstance().memcpy(dst, src, count, kind); +} + +orError_t orMemcpyAsync( + void* dst, + const void* src, + size_t count, + orMemcpyKind kind, + orStream_t stream) { + if (!stream) { + return orErrorUnknown; + } + + auto& mm = MemoryManager::getInstance(); + + return orLaunchKernel( + stream, &MemoryManager::memcpy, &mm, dst, src, count, kind); +} + +orError_t orPointerGetAttributes( + orPointerAttributes* attributes, + const void* ptr) { + return MemoryManager::getInstance().getPointerAttributes(attributes, ptr); +} + +orError_t orMemoryUnprotect(void* devPtr) { + return MemoryManager::getInstance().unprotect(devPtr); +} + +orError_t 
orMemoryProtect(void* devPtr) { + return MemoryManager::getInstance().protect(devPtr); +} diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.h b/PyTorchSimDevice2/third_party/openreg/csrc/memory.h new file mode 100644 index 00000000..35851ac9 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/csrc/memory.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include + +#if defined(_WIN32) +#include +#else +#include +#include +#endif + +#define F_PROT_NONE 0x0 +#define F_PROT_READ 0x1 +#define F_PROT_WRITE 0x2 + +namespace openreg { + +void* mmap(size_t size) { +#if defined(_WIN32) + return VirtualAlloc(nullptr, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); +#else + void* addr = ::mmap( + nullptr, + size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + return (addr == MAP_FAILED) ? nullptr : addr; +#endif +} + +void munmap(void* addr, size_t size) { +#if defined(_WIN32) + VirtualFree(addr, 0, MEM_RELEASE); +#else + ::munmap(addr, size); +#endif +} + +int mprotect(void* addr, size_t size, int prot) { +#if defined(_WIN32) + DWORD win_prot = 0; + DWORD old; + if (prot == F_PROT_NONE) { + win_prot = PAGE_NOACCESS; + } else { + win_prot = PAGE_READWRITE; + } + + return VirtualProtect(addr, size, win_prot, &old) ? 0 : -1; +#else + int native_prot = 0; + if (prot == F_PROT_NONE) + native_prot = PROT_NONE; + else { + if (prot & F_PROT_READ) + native_prot |= PROT_READ; + if (prot & F_PROT_WRITE) + native_prot |= PROT_WRITE; + } + + return ::mprotect(addr, size, native_prot); +#endif +} + +int alloc(void** mem, size_t alignment, size_t size) { +#ifdef _WIN32 + *mem = _aligned_malloc(size, alignment); + return *mem ? 
0 : -1; +#else + return posix_memalign(mem, alignment, size); +#endif +} + +void free(void* mem) { +#ifdef _WIN32 + _aligned_free(mem); +#else + ::free(mem); +#endif +} + +long get_pagesize() { +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + return static_cast(si.dwPageSize); +#else + return sysconf(_SC_PAGESIZE); +#endif +} + +} // namespace openreg diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp new file mode 100644 index 00000000..30f50b1a --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp @@ -0,0 +1,313 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +static std::mutex g_mutex; +static std::once_flag g_flag; +static std::vector> g_streams_per_device; + +static void initialize_registries() { + int device_count = 0; + orGetDeviceCount(&device_count); + g_streams_per_device.resize(device_count); +} + +struct orEventImpl { + std::mutex mtx; + std::condition_variable cv; + std::atomic completed{true}; + int device_index = -1; + bool timing_enabled{false}; + std::chrono::high_resolution_clock::time_point completion_time; +}; + +struct orEvent { + std::shared_ptr impl; +}; + +struct orStream { + std::queue> tasks; + std::mutex mtx; + std::condition_variable cv; + std::thread worker; + std::atomic stop_flag{false}; + int device_index = -1; + + orStream() { + worker = std::thread([this] { + while (true) { + std::function task; + { + std::unique_lock lock(this->mtx); + this->cv.wait(lock, [this] { + return this->stop_flag.load() || !this->tasks.empty(); + }); + if (this->stop_flag.load() && this->tasks.empty()) { + return; + } + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + task(); + } + }); + } + + ~orStream() { + stop_flag.store(true); + cv.notify_one(); + worker.join(); + } +}; + +orError_t openreg::addTaskToStream( + orStream_t stream, + std::function task) { + if (!stream) + return orErrorUnknown; + + { 
+ std::lock_guard lock(stream->mtx); + stream->tasks.push(std::move(task)); + } + + stream->cv.notify_one(); + return orSuccess; +} + +orError_t orEventCreateWithFlags(orEvent_t* event, unsigned int flags) { + if (!event) + return orErrorUnknown; + + auto impl = std::make_shared(); + orGetDevice(&(impl->device_index)); + if (flags & orEventEnableTiming) { + impl->timing_enabled = true; + } + + *event = new orEvent{std::move(impl)}; + return orSuccess; +} + +orError_t orEventCreate(orEvent_t* event) { + return orEventCreateWithFlags(event, orEventDisableTiming); +} + +orError_t orEventDestroy(orEvent_t event) { + if (!event) + return orErrorUnknown; + + delete event; + return orSuccess; +} + +orError_t orEventRecord(orEvent_t event, orStream_t stream) { + if (!event || !stream) + return orErrorUnknown; + + auto event_impl = event->impl; + event_impl->completed.store(false); + auto record_task = [event_impl]() { + if (event_impl->timing_enabled) { + event_impl->completion_time = std::chrono::high_resolution_clock::now(); + } + + { + std::lock_guard lock(event_impl->mtx); + event_impl->completed.store(true); + } + + event_impl->cv.notify_all(); + }; + + return openreg::addTaskToStream(stream, record_task); +} + +orError_t orEventSynchronize(orEvent_t event) { + if (!event) + return orErrorUnknown; + + auto event_impl = event->impl; + std::unique_lock lock(event_impl->mtx); + event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); }); + + return orSuccess; +} + +orError_t orEventQuery(orEvent_t event) { + if (!event) + return orErrorUnknown; + + return event->impl->completed.load() ? 
orSuccess : orErrorNotReady; +} + +orError_t orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end) { + if (!ms || !start || !end) + return orErrorUnknown; + + auto start_impl = start->impl; + auto end_impl = end->impl; + + if (start_impl->device_index != end_impl->device_index) { + return orErrorUnknown; + } + + if (!start_impl->timing_enabled || !end_impl->timing_enabled) { + return orErrorUnknown; + } + + if (!start_impl->completed.load() || !end_impl->completed.load()) { + return orErrorUnknown; + } + + auto duration = end_impl->completion_time - start_impl->completion_time; + *ms = std::chrono::duration_cast>( + duration) + .count(); + + return orSuccess; +} + +orError_t orStreamCreateWithPriority( + orStream_t* stream, + [[maybe_unused]] unsigned int flag, + int priority) { + if (!stream) { + return orErrorUnknown; + } + + int min_p, max_p; + orDeviceGetStreamPriorityRange(&min_p, &max_p); + if (priority < min_p || priority > max_p) { + return orErrorUnknown; + } + + int current_device = 0; + orGetDevice(¤t_device); + + orStream_t new_stream = nullptr; + new_stream = new orStream(); + new_stream->device_index = current_device; + + { + std::lock_guard lock(g_mutex); + std::call_once(g_flag, initialize_registries); + g_streams_per_device[current_device].insert(new_stream); + } + + *stream = new_stream; + + return orSuccess; +} + +orError_t orStreamCreate(orStream_t* stream) { + int min_p, max_p; + orDeviceGetStreamPriorityRange(&min_p, &max_p); + + return orStreamCreateWithPriority(stream, 0, max_p); +} + +orError_t orStreamGetPriority( + [[maybe_unused]] orStream_t stream, + int* priority) { + // Since OpenReg has only one priority level, the following code + // returns 0 directly for convenience. 
+ *priority = 0; + + return orSuccess; +} + +orError_t orStreamDestroy(orStream_t stream) { + if (!stream) + return orErrorUnknown; + + { + std::lock_guard lock(g_mutex); + + int device_idx = stream->device_index; + if (device_idx >= 0 && device_idx < g_streams_per_device.size()) { + g_streams_per_device[device_idx].erase(stream); + } + } + + delete stream; + return orSuccess; +} + +orError_t orStreamQuery(orStream_t stream) { + if (!stream) { + return orErrorUnknown; + } + + std::lock_guard lock(stream->mtx); + return stream->tasks.empty() ? orSuccess : orErrorNotReady; +} + +orError_t orStreamSynchronize(orStream_t stream) { + if (!stream) + return orErrorUnknown; + + orEvent_t event; + orEventCreate(&event); + orEventRecord(event, stream); + + orError_t status = orEventSynchronize(event); + orEventDestroy(event); + + return status; +} + +orError_t orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int) { + if (!stream || !event) + return orErrorUnknown; + + auto event_impl = event->impl; + auto wait_task = [event_impl]() { + std::unique_lock lock(event_impl->mtx); + event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); }); + }; + + return openreg::addTaskToStream(stream, wait_task); +} + +orError_t orDeviceGetStreamPriorityRange( + int* leastPriority, + int* greatestPriority) { + if (!leastPriority || !greatestPriority) { + return orErrorUnknown; + } + + // OpenReg have only one priority now. 
+ *leastPriority = 0; + *greatestPriority = 0; + return orSuccess; +} + +orError_t orDeviceSynchronize(void) { + int current_device = 0; + orGetDevice(¤t_device); + + std::vector streams; + { + std::lock_guard lock(g_mutex); + std::call_once(g_flag, initialize_registries); + + auto& streams_on_device = g_streams_per_device[current_device]; + streams.assign(streams_on_device.begin(), streams_on_device.end()); + } + + for (orStream_t stream : streams) { + orError_t status = orStreamSynchronize(stream); + if (status != orSuccess) { + return status; + } + } + + return orSuccess; +} diff --git a/PyTorchSimDevice2/third_party/openreg/example/example.cpp b/PyTorchSimDevice2/third_party/openreg/example/example.cpp new file mode 100644 index 00000000..f00f1909 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/example/example.cpp @@ -0,0 +1,112 @@ +#include "include/openreg.h" + +#include +#include +#include +#include + +struct MemoryGuard { + MemoryGuard(void* ptr) : ptr_(ptr) { + orMemoryUnprotect(ptr_); + } + ~MemoryGuard() { + orMemoryProtect(ptr_); + } + + private: + void* ptr_{}; +}; + +void add_kernel(float* out, float* a, float* b, int num) { + for (int i = 0; i < num; ++i) { + out[i] = a[i] + b[i]; + } +} + +int main() { + int device_count = 0; + orGetDeviceCount(&device_count); + + std::cout << "Current environment have " << device_count << " devices" + << std::endl; + + orSetDevice(0); + int current_device = -1; + orGetDevice(¤t_device); + + std::cout << "Current is " << current_device << " device" << std::endl; + + constexpr int num = 50000; + constexpr size_t size = num * sizeof(float); + + std::vector host_a(num), host_b(num), host_out(num, 0.0f); + std::iota(host_a.begin(), host_a.end(), 0.0f); + for (int i = 0; i < num; ++i) { + host_b[i] = 2.0f; + } + + float *dev_a, *dev_b, *dev_out; + orMalloc((void**)&dev_a, size); + orMalloc((void**)&dev_b, size); + orMalloc((void**)&dev_out, size); + + // There will be subsequent memory access operations, so 
memory protection + // needs to be released + MemoryGuard a{dev_a}; + MemoryGuard b{dev_b}; + MemoryGuard c{dev_out}; + + orStream_t stream1, stream2; + orEvent_t start_event, stop_event; + + orStreamCreate(&stream1); + orStreamCreate(&stream2); + orEventCreateWithFlags(&start_event, orEventEnableTiming); + orEventCreateWithFlags(&stop_event, orEventEnableTiming); + + // Copy input from host to device + orMemcpyAsync(dev_a, host_a.data(), size, orMemcpyHostToDevice, stream1); + orMemcpyAsync(dev_b, host_b.data(), size, orMemcpyHostToDevice, stream1); + + // Submit compute kernel and two events those are used for calculating time. + orEventRecord(start_event, stream1); + orLaunchKernel(stream1, add_kernel, dev_out, dev_a, dev_b, num); + orEventRecord(stop_event, stream1); + + // Synchronization between streams. + orStreamWaitEvent(stream2, stop_event, 0); + orMemcpyAsync(host_out.data(), dev_out, size, orMemcpyDeviceToHost, stream2); + orStreamSynchronize(stream2); + + std::cout << "All tasks have been submitted." << std::endl; + + float elapsed_ms = 0.0f; + orEventElapsedTime(&elapsed_ms, start_event, stop_event); + std::cout << "Kernel execution time: " << elapsed_ms << " ms" << std::endl; + + bool success = true; + for (int i = 0; i < num; ++i) { + if (std::abs(host_out[i] - (host_a[i] + host_b[i])) > 1e-5) { + std::cout << "Verification FAILED at index " << i << "! Expected " + << (host_a[i] + host_b[i]) << ", got " << host_out[i] + << std::endl; + success = false; + break; + } + } + if (success) { + std::cout << "Verification PASSED!" 
<< std::endl; + } + + orFree(dev_a); + orFree(dev_b); + orFree(dev_out); + + orStreamDestroy(stream1); + orStreamDestroy(stream2); + + orEventDestroy(start_event); + orEventDestroy(stop_event); + + return 0; +} diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.h b/PyTorchSimDevice2/third_party/openreg/include/openreg.h new file mode 100644 index 00000000..a5e4af55 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/include/openreg.h @@ -0,0 +1,109 @@ +#pragma once + +#include + +#ifdef _WIN32 +#define OPENREG_EXPORT __declspec(dllexport) +#else +#define OPENREG_EXPORT __attribute__((visibility("default"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum orError_t { + orSuccess = 0, + orErrorUnknown = 1, + orErrorNotReady = 2 +} orError_t; + +typedef enum orMemcpyKind { + orMemcpyHostToHost = 0, + orMemcpyHostToDevice = 1, + orMemcpyDeviceToHost = 2, + orMemcpyDeviceToDevice = 3 +} orMemcpyKind; + +typedef enum orMemoryType { + orMemoryTypeUnmanaged = 0, + orMemoryTypeHost = 1, + orMemoryTypeDevice = 2 +} orMemoryType; + +struct orPointerAttributes { + orMemoryType type = orMemoryType::orMemoryTypeUnmanaged; + int device; + void* pointer; +}; + +typedef enum orEventFlags { + orEventDisableTiming = 0x0, + orEventEnableTiming = 0x1, +} orEventFlags; + +struct orStream; +struct orEvent; +typedef struct orStream* orStream_t; +typedef struct orEvent* orEvent_t; + +// Memory +OPENREG_EXPORT orError_t orMalloc(void** devPtr, size_t size); +OPENREG_EXPORT orError_t orFree(void* devPtr); +OPENREG_EXPORT orError_t orMallocHost(void** hostPtr, size_t size); +OPENREG_EXPORT orError_t orFreeHost(void* hostPtr); +OPENREG_EXPORT orError_t +orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind); +OPENREG_EXPORT orError_t orMemcpyAsync( + void* dst, + const void* src, + size_t count, + orMemcpyKind kind, + orStream_t stream); +OPENREG_EXPORT orError_t +orPointerGetAttributes(orPointerAttributes* attributes, const void* ptr); 
+OPENREG_EXPORT orError_t orMemoryUnprotect(void* devPtr); +OPENREG_EXPORT orError_t orMemoryProtect(void* devPtr); + +// Device +OPENREG_EXPORT orError_t orGetDeviceCount(int* count); +OPENREG_EXPORT orError_t orSetDevice(int device); +OPENREG_EXPORT orError_t orGetDevice(int* device); +OPENREG_EXPORT orError_t +orDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority); +OPENREG_EXPORT orError_t orDeviceSynchronize(void); + +// Stream +OPENREG_EXPORT orError_t orStreamCreateWithPriority( + orStream_t* stream, + unsigned int flags, + int priority); +OPENREG_EXPORT orError_t orStreamCreate(orStream_t* stream); +OPENREG_EXPORT orError_t orStreamGetPriority(orStream_t stream, int* priority); +OPENREG_EXPORT orError_t orStreamDestroy(orStream_t stream); +OPENREG_EXPORT orError_t orStreamQuery(orStream_t stream); +OPENREG_EXPORT orError_t orStreamSynchronize(orStream_t stream); +OPENREG_EXPORT orError_t +orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int flags); + +// Event +OPENREG_EXPORT orError_t +orEventCreateWithFlags(orEvent_t* event, unsigned int flags); +OPENREG_EXPORT orError_t orEventCreate(orEvent_t* event); +OPENREG_EXPORT orError_t orEventDestroy(orEvent_t event); +OPENREG_EXPORT orError_t orEventRecord(orEvent_t event, orStream_t stream); +OPENREG_EXPORT orError_t orEventSynchronize(orEvent_t event); +OPENREG_EXPORT orError_t orEventQuery(orEvent_t event); +OPENREG_EXPORT orError_t +orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end); + +#ifdef __cplusplus +} // extern "C" +#endif + +#ifdef __cplusplus + +#define OPENREG_H +#include "openreg.inl" + +#endif diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.inl b/PyTorchSimDevice2/third_party/openreg/include/openreg.inl new file mode 100644 index 00000000..851be132 --- /dev/null +++ b/PyTorchSimDevice2/third_party/openreg/include/openreg.inl @@ -0,0 +1,42 @@ +#ifndef OPENREG_H +#error "Don`t include openreg.inl directly, include openreg.h instead." 
+#endif + +#include +#include +#include + +namespace openreg { +OPENREG_EXPORT orError_t +addTaskToStream(orStream* stream, std::function task); +} + +template +OPENREG_EXPORT inline orError_t orLaunchKernel( + orStream* stream, + Func&& kernel_func, + Args&&... args) { + if (!stream) { + return orErrorUnknown; + } + +/* + * Some tests in PyTorch still use C++11, so we use conditional macro to + * select different approaches for different C++ version. + * + * Std::apply is only supported in C++17, so for C++11/14, std::bind is + * a more appropriate approach, but the former has better performance. + */ +#if __cplusplus >= 201703L + auto task = [func = std::forward(kernel_func), + args_tuple = + std::make_tuple(std::forward(args)...)]() mutable { + std::apply(func, std::move(args_tuple)); + }; +#else + auto task = + std::bind(std::forward(kernel_func), std::forward(args)...); +#endif + + return openreg::addTaskToStream(stream, std::move(task)); +} diff --git a/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so new file mode 100755 index 0000000000000000000000000000000000000000..04b3b4e1cb7232dbb845c2f33fe24d94c640b705 GIT binary patch literal 15312 zcmeHOU1%It6uz6Z8tqTk(rRqQj?!YQ?If)!+DdHFH0eZ}Hl`^e{*1G`lkCd=q`R}4 z^`Taaf0RN&d{bzB5Px0-1@%F#RHRBDT0sy6DPkd2(W)rbhmPmWoNqIo?urxr|frkcfRy^ z_nl9q2fuy)#V>nU74yk{IsPE;5**X5``+^jnzlib7!9PHpGJIk-H2}!e*wMXjpH)n zOrm$vYk``pj4MLnWzezhi9)GpS3IZe*|xHW#)j>TTXrXM70)e?4fp3uMR&|e<=s%$ zSYoHA9D6)hbn>}JT{Ti0D(1*rzseDApLC0(?!<5@Qza+)T*@nz(^)%}D`s-ViHcb% zsm{`**O@LAGfpSTH!RyeI<#eI8}|_=K5tm(NqHZJe4fBRE_2b=8(M-7`sl`x&vV7O zLOUMe%SR^=eG%bft+3!^gfCxFp{w2yE+xQP4>|g(GoUk|GoUk|GoUk|GoUk|Gw}b< zz_0Ds{%P&~q0QPi`$VTw){}?57XP@l_oKEW!JG5feM)S9`7ye-FYQ&VpJmDEZ+zb$ zKftuVd^btQ+m~)uf!tsIa-FvJ_q>2CBA2(Vbo^7=fA6qBRc?-$GB5}~>pA5%^J;$@BXB~-E^`@QH-kxvx z#@%}MlsDJf*K;NDr-vx;=?q;yo;{D~#QkJjAD`_{KSDx@C!lX2n!Ip7$W=W%#MDh^ 
zKxaT_KxaT_KxaT_KxaT_KxaT_KxaT_;D3;T#FEzA_`gg3ugf^&`xap@;UwV*;WNzt z+4m0;a^wG4xg{PQRf(x&V#(Y~+YZnlam$Ez4ZV*4zU2=^fQ4vN_z5FAF~7geT0W&eAe;kHAS)1|MMhTH=O~U0i6Mz0i6Mz0i6Mz z0i6Mz0i6Mzfqy3hsLw<_CTcBriTifJrv&F>Sh=a2C-f?*^SoMU)PXJ$8uguvg+@In z-%E0X{I_#{iRm*^+=-ga!&21A^`P83guXl^)gi$-f*pdW1)Y{UP}GbT9?3_ir#(>`YaQUe#E$ z#_Tq`RX63`+6bp z)2`k!;9&+E`FSnSU!dai@@BwaRDWOa@`3O%Pv9S;P7ANTSl&Jh6ez0(G($W^(4yuk zd@dpVo;nHo6$YON@cGdY14IOq#BWrOzPO2gcN6|d;3wi(x~V4mDqZ#}UUhujOsknP zciSu2X)biHDBI4I?1_9S<>c)QRjOBPr#hw5rNU(1_1uiv)mVdz-*dK8E;}{bEqdje z8ZSEq*UnT6g&LVeD4UDo&r_v;X%Rt<9q<2u9gkwr=0lx7FT0IJjeA%pU98v~_^oHY?Q+D*qc- z_y5f^0T&kd#~vt9W~El}oD`u~4l#>fvE;etM6qg4mP?av*{dnD&Pr8t`ONBEMgrdA|pA$nzHmfgI=OzBPLJ!J+tF{x`#l!)X`#Z4>IbEk;pSyHB(mHEhK$P1{@ zHk_DH6MY)ODdflrxnz$Nf#W&KOjRn%q@_`y8NYA|UKEG-HQpPrpEP^!2hOo?faYqY zsC~2Nf1oeh4@lKVz29lC!T8uOf&4oyH`v9;zmsIVSHd3qDG>V`Z-ev^t?zr-=XMZ} z_c+*Ne+C{RS+uc_XuFpPjt%zMUx64u9zVu+#eS1G#eNNh9^3OCkM|EgA2K*5oG0qv ztMdJda|@{i4RL_xeI{yeh+Q3_D2T6ZU^QmnD*_<&;082scg%i71VHFDvwu2f|DYHE zKY^iO+~59~J@y$O&V_Ij&);)mkNH=IXbP?p5)ijPL;}t~?7NIm6ZlL>)~}dYxtevQ8uQ;3(A>Yq5`u^Z^KVpyCzp*x?c3~h9 V#z@7tOO5QW>kbW0iya_t{}-!tJ*fZy literal 0 HcmV?d00001 diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/__init__.py new file mode 100644 index 00000000..a69151e9 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/__init__.py @@ -0,0 +1,24 @@ +import sys +import torch + + +if sys.platform == "win32": + from ._utils import _load_dll_libraries + + _load_dll_libraries() + del _load_dll_libraries + +import torch_openreg._C # type: ignore[misc] +import torch_openreg.openreg + + +torch.utils.rename_privateuse1_backend("npu") +torch._register_device_module("npu", torch_openreg.openreg) +torch.utils.generate_methods_for_privateuse1_backend(for_storage=True) + +torch_openreg.openreg.init() +sys.modules['torch.npu'] = torch_openreg.openreg + +def _autoload(): + # It is a placeholder function here to be registered as an entry point. 
+ pass \ No newline at end of file diff --git a/PyTorchSimDevice2/torch_openreg/_utils.py b/PyTorchSimDevice2/torch_openreg/_utils.py new file mode 100644 index 00000000..1c26f475 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/_utils.py @@ -0,0 +1,42 @@ +import ctypes +import glob +import os + + +def _load_dll_libraries() -> None: + openreg_dll_path = os.path.join(os.path.dirname(__file__), "lib") + + kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) + with_load_library_flags = hasattr(kernel32, "AddDllDirectory") + prev_error_mode = kernel32.SetErrorMode(0x0001) + + kernel32.LoadLibraryW.restype = ctypes.c_void_p + if with_load_library_flags: + kernel32.LoadLibraryExW.restype = ctypes.c_void_p + + os.add_dll_directory(openreg_dll_path) + + dlls = glob.glob(os.path.join(openreg_dll_path, "*.dll")) + path_patched = False + for dll in dlls: + is_loaded = False + if with_load_library_flags: + res = kernel32.LoadLibraryExW(dll, None, 0x00001100) + last_error = ctypes.get_last_error() + if res is None and last_error != 126: + err = ctypes.WinError(last_error) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' + raise err + elif res is not None: + is_loaded = True + if not is_loaded: + if not path_patched: + os.environ["PATH"] = ";".join([openreg_dll_path] + [os.environ["PATH"]]) + path_patched = True + res = kernel32.LoadLibraryW(dll) + if res is None: + err = ctypes.WinError(ctypes.get_last_error()) + err.strerror += f' Error loading "{dll}" or one of its dependencies.' 
+ raise err + + kernel32.SetErrorMode(prev_error_mode) diff --git a/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt new file mode 100644 index 00000000..4ff321c4 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt @@ -0,0 +1,24 @@ +set(LIBRARY_NAME torch_bindings) + +file(GLOB_RECURSE SOURCE_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" +) + +add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES}) + +target_link_libraries(${LIBRARY_NAME} PRIVATE torch_python_library torch_openreg) + +if(WIN32) + find_package(Python3 COMPONENTS Interpreter Development REQUIRED) + target_link_libraries(${LIBRARY_NAME} PRIVATE ${Python3_LIBRARIES}) +elseif(APPLE) + set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +endif() + +target_link_directories(${LIBRARY_NAME} PRIVATE ${PYTORCH_INSTALL_DIR}/lib) + +install(TARGETS ${LIBRARY_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp new file mode 100644 index 00000000..38c45633 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp @@ -0,0 +1,99 @@ +#include + +#include +#include +#include +#include +#include + +#include + +static PyObject* _initExtension(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + + at::globalContext().lazyInitDevice(c10::DeviceType::PrivateUse1); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* _getDefaultGenerator(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(arg), + "_get_default_generator expects an int, but got ", + THPUtils_typename(arg)); + auto idx = static_cast(THPUtils_unpackLong(arg)); + + return THPGenerator_initDefaultGenerator( + at::globalContext().defaultGenerator( + c10::Device(c10::DeviceType::PrivateUse1, 
idx))); + + END_HANDLE_TH_ERRORS +} + +PyObject* _setDevice(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice"); + auto device = THPUtils_unpackLong(arg); + + torch::utils::device_lazy_init(at::kPrivateUse1); + c10::openreg::set_device(static_cast(device)); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _exchangeDevice(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice"); + auto device_index = THPUtils_unpackDeviceIndex(arg); + if (device_index < 0) { + return THPUtils_packInt32(-1); + } + + torch::utils::device_lazy_init(at::kPrivateUse1); + auto current_device = c10::openreg::ExchangeDevice(device_index); + + return THPUtils_packDeviceIndex(current_device); + END_HANDLE_TH_ERRORS +} + +PyObject* _getDevice(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + torch::utils::device_lazy_init(at::kPrivateUse1); + auto device = static_cast(c10::openreg::current_device()); + return THPUtils_packInt32(device); + END_HANDLE_TH_ERRORS +} + +PyObject* _getDeviceCount(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + return THPUtils_packUInt64(c10::openreg::device_count()); + END_HANDLE_TH_ERRORS +} + +static PyMethodDef methods[] = { + {"_init", _initExtension, METH_NOARGS, nullptr}, + {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr}, + {"_get_device", _getDevice, METH_NOARGS, nullptr}, + {"_set_device", _setDevice, METH_O, nullptr}, + {"_exchangeDevice", _exchangeDevice, METH_O, nullptr}, + {"_get_device_count", _getDeviceCount, METH_NOARGS, nullptr}, + {nullptr, nullptr, 0, nullptr}}; + +/* + * When ASAN is enabled, PyTorch modifies the dlopen flag during import, + * causing all global and weak symbols in _C.so and its dependent libraries + * to be exposed to the global symbol scope, which in turn causes + * subsequent symbols with the same name in other libraries to be intercepted. 
+ * Therefore, it cannot be named initModule here, otherwise initModule + * in torch/csrc/Module.cpp will be called, resulting in failure. + */ +extern "C" OPENREG_EXPORT PyObject* initOpenRegModule(void) { + static struct PyModuleDef openreg_C_module = { + PyModuleDef_HEAD_INIT, "torch_openreg._C", nullptr, -1, methods}; + PyObject* mod = PyModule_Create(&openreg_C_module); + + return mod; +} diff --git a/PyTorchSimDevice2/torch_openreg/csrc/stub.c b/PyTorchSimDevice2/torch_openreg/csrc/stub.c new file mode 100644 index 00000000..4e02f9fd --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/csrc/stub.c @@ -0,0 +1,20 @@ +#include + +#ifdef _WIN32 +#define OPENREG_EXPORT __declspec(dllexport) +#else +#define OPENREG_EXPORT __attribute__((visibility("default"))) +#endif + +extern OPENREG_EXPORT PyObject* initOpenRegModule(void); + +#ifdef __cplusplus +extern "C" +#endif + + OPENREG_EXPORT PyObject* + PyInit__C(void); + +PyMODINIT_FUNC PyInit__C(void) { + return initOpenRegModule(); +} diff --git a/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so b/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so new file mode 100644 index 0000000000000000000000000000000000000000..272fb567b8daf1c45b8dc0f7b3a557257a8b68c2 GIT binary patch literal 59728 zcmeIb31C~rwLg5lC~+1ng{7D+2yog2h_SL^0RbzvlB+~cOq@UP6ezTWmeSC&C?vsQHvt}@Kr#Cj1`g%ybl>o_ zn=zv(U4%rJ;Ik07tlsY!R0-WbJhc!4E}53_tU^4}XHb;qrdEL`)7*NB9Fd%~T-1{( zwL#ZteCR%3{|JXJi7(TJHzUn-pUa=jTd5K*EBmV`lFxckZ{R9kkefD(e#=z0cRK1x z5et%En}}gh7K`@s^{-FJNv5PN)b$omF`JpC{N|LEWdgC&Npvw z4HO=?_(S#V?_L%;*p4D)5;?<&gx=Ian-_6S1lf> zElAxqruG=ZWfhUqbrMgMlaW^A;}H2;k)Rr4&0U(g?VZZe zTlaWIpS`B~2y5@c8@wl5k1Ow5Kk=3|g}=S(wcpIFEWG3W3g_!rt$)4hvKy99yLilL zzk2o3BgXA_+;bm4IPQ(xjw#!EK*_}oe?IK42hJ+|!eVcGv-72^JRSFab(nJA`n&Ia zH#YBsGY(vI_1uql4-LGa4Hh17c;OpQK0N!`tIsd7x79?iT>Of?>HXQie*E*!r)U23 zH=k5|`q(SC9b>uspkPUD;Ynf zN=FJ}nSc+DrFIWi26Nz*fbS1FgAtC{Cg@)+`B}~3djvd)ai@0s zF5&QV1pIFpF9yGm!!O{8LY^+s57DBDYrT-ek%E4GD~De!;1_}(;a7@<7|pTkaKYyQ 
zDH1NTX!kOKUoG&9gkQNsz^$}sq2suc9CnV|2=lE|yzI6RUz%4!4qW;qi<2HVth{!@O45ysq;DDaly}tkPpeNehr703;gTExLds(U={FN z1^sS8e};fh%%cy}gq#Qb9C5qACx3?cPYFk9qJRfLhx9}he)nVn-zfMQY~q0Z1iX>i z0qAXj0~QN-3HT)X^V= z`^(^q=5@Uo7dh`X%)XT>R-ymj6@2~ybbbjs8?NF2sh?*F{M0HAUm@@nk7reT zFyQHkc*7BoNAb9p)OnhHVc#l$N5mIiQdiR!4EUCKSGM_h*~lfHrY^6i#UJpt`Ooty z!SFm^q}JEzZ}Qayqk)JKfCeLN9iCObh{xO997a)t#8o_JH@DSywj&L!X;(s#*09gp z>}iiid|jTlVAE<8G(>Dwp2aIYk+9DPKAXb6c3&V;KMWiQHv2p)y&b+B643x7((Y?- z3awSzL*Zb=*A!tyr&WTj)^Ko*r(-Sp-R|*)!@;m$X>a$2l=f(Vl2EY2-{nDVbaGWd zX#occu$zK`7WAUd+Xm|B9Z6zU)EjR0c$?0R`ooYLgL%;lKUy;&gney3Vl$^({=heM zi}0t}bust@9X<@6H_+?}`&N3}yn!a)$~+=zsPrsuz%bK*H?8(GwXXKGc>Qevtm^9W zgnZ$SAPr(_hHg89VC$cu=^Y{ZTK_~b+dtpPw9)zSlEp6UaP|EtSFXRpTJiabpQ1X*_)sk9zt|2Cf4RVot+TGq9vZ%(4r-k9_P}wPN&D|T--3r<6OuxGdy#bE_8Ns zwV3H)l|4j;&qCI;&F*OkdD^|JeV&e1G~4WncvsEzGn<+m%= zS)MviI2w>dM;gf)YrOsl^ebYsMFakGqdsO!Y0#KuB{9OBkjgLvxDqB;P9~;xq==B` zNTh9*fndbnvepCTlq}3@0+~Q?y622(wrRqeHj&|-9tf@>Bu|@vWs|4FA6V7qqc&mV zea)De&Q4hJS|1cMxK?IQkNU!T#@Xf#b@-Z>_?a!@Gau8tKJ167ju0A$NgChGHkLM> zZO+MzAHZxd81>j-Vx>u5FbtkuYDkE7*qe4Nf)QV*&W!8d)sVl7w(<0<)qD2@cg4RTrd!Zf1J`g1XXPf46V`My` zV4J^bt#g{w3CjnC;_DK`AV#GDwwOkhFYauhYA|jQ-s;8sj{G`OL(B*|yb@P9 zeZ_6_1y)5``HEY=ph=fpwgo9H-iWkEGr=i$Tlmm9F*21x!$XlUnJ8ftsANX7uf-c} zi@<4U^F<(L8$?=97MZaFGm*8k&{1aDu|{;Q)tv|ep@i0V9~Q+XZ=|VJ*er~=1Myp#_`T1n?e&7Ot3}9=BZZ}?CrFQnM%#FWsW&6kF8>Qg`J}>TL#KADy9w3 z+H3~YVTh8Ts5f0O#k0dOrwP^srp>ODFIw!H=UR9II(L$|rwvjeo--8Vp9!B)cxF?A z{22T#W*^Fr!To3k8>@_C<>LX_51&$e2-nQ&jbVQYzXW%qu|pe!e+7u`QJ(Jpr~mN{ zAGKl{&!4ea0jmG*T10#%DAj<`Mde2r$`A1Su~oMys$OnV4iM#y!2iRQKfw;$tSlC= z0~vm)vO?tVX656RRU&^4%a2pSA|E}}kDYjlav}0`6+jjrgCn_bUHkN%$nB@Z1^(JS z6u}y+{6yrvEI&rMS>&I&b2WC}qm{cwzL}MeQXUlfYL+)CPl|kG%2kMP6e}-_e3+FN zDQS`SvwWel2YI^Wb4XNT*9NhV?-KDK2~UZKN!>Z{kihTDfme!n$#psKZV}g6p9Aj` z@LO}>iikIC$bqjH_#1QJ8w7k?4qWNu{VNlE$$nWxTw!t!yi&lca^Q^uKDdd~lXPT0 zCEBHM%7OUEahLJ6T)0Kh&xKp_z;_tv*KrouUm3?FIutXL?d~z)8{u+%a`80-{^OAK%kDq`?z1N>A2oq7X&wgJA(06)b5Z#2MH7~ri2c%uQ{{UFas 
zxyYhE0mr1rOwC@`8Q>Hfl-GI#or5G0>8%1j0ry7u1_Pay2KKhW`OT9z-JlYdkpXs4RCD}&yZeCHNeYKJcFb7f;4=111$_p1DFMWFU5eij9kYN@vOdK#1UJr&TMcm1X?bl(@jJrbUjmVCOz{lC z58$`#Zz{zz1pf}dWq-G&c!uBy^IP_JAjLBTpTlq2-yJEQA-Ies6J#(&f4Q6w;kWFs zmf{)0Ka}6Hzq?XAL-1;T%l__3@eIKa%u1^b@JR-I)d2sl0bXx_f6oA4W`IvNz#9#4@o0~g zwi@8)81O>|xO^5t#a#yYaR&Tu1H9P)zsvwX-T?12z{L?RR(hQQUSYsrZ-C2ZQ&fDb z0bXgq-(Y}GGr%_*;L{E8lmRZEO;PbS1AL|df4~55GQf8j;0p}!eYg80uulT}B(P5c z`y{YW0{bNJ|2GMIXrB0)8hf`yO_*N&v7)HGsYqdFK#grF+043+6_ev_W34dm|hU3#82j=R&p)bq4q+?~Es&yQmHOY}T# z3U;T@)$_E`+nx66c?ultPM@ylX=ArL?a=eIq1&Cd>3Q17?M@%1=V=4CJ3T?qm$H0; zo;S1n2Vcql(nbya*YmVN+ns(|&(p?icY2earwtkUujeT+h5qY#3Rs~3dY(3FyVIBG zdD@_%|9YM_VCcV|r;Qi-ujgsQh5qY#+EAhYdY(2?=)az)4HWvX=V{}#J6)jXX~VQT z{lS;A|5lcNRnH&A@=xn|+8Ck#dj1%eze~@Tv-~ghJZ+57e?3ndBJ^L+(?$sW*YmUi zLjUzVZDi1YJx>7_^k2`@h6erD^R!_>|MfftO3;5jPa72UU(cVw@*jL5`%fDZjK7|z zfDOi9&(p>P{nzufA;I|TdD@6z{PjF-K+u0ZKa1rr(eo7GLI3r970Y{N9+ZTa`JEHty;bD9kyjJ70Uufi@BoRO~AwjB#TUB zl$scwsV`Au_uoo|=H9ztA*quWaTc_7NYf$oj2bx$trO3*zLC5vS#i4K@1q)|1u0nw zr%vEHky70+S+k6%&uC9lf6>fu07k>^ z_*>cn(c;y{miS=r#oe8v`YlL_JCCu>Ufk*SzQw#Vr6v!Wa2e#8ocSmv{idgo(1kvz z4#;qd=`KLhZ6MuJVLq~l6}I%DI1Lq0lcv`wvG0i-1Xu9MUfe?|$7BZ0M|KnHW|0E_ zUWPyO50p&#N{#PQH@$zFx@k|5TJW&?^jDDsLEuI~pd>Tc!uXW+Nkh74_oA*6J#MKQ zn>`koQ2RF{XA5C7TEw!1q zw~>6(%`MD7rSc;i)cEH#pb7Mx=D{-%L_&!d*;31iz7ao`DOhu414lncpx*@aco`v& zKtD0soq&#am8eBsm5z@$6-1Cbcwxdl77D4Bsfl{4ng~sD4}2B{CtIC^TH0H+~XXE9GuEx1`~HBTURmsYCrdVGq8t}rm%=Qz`G zhGVH?33!gZJ6VmxD1kq7&-IWun|$s>sI0E{V{^|cK?nDpWiFQ@Zsi-$i;ODcRB7JG56esa#!LkAio%yq9)ES zbH`Dn{Q%0K#$Qnr%PkM6B%~MiVD?ZgAV!}@4Gwq5pVA&c5mooQ;x7#S(Vg(|BAT+l zMiDi5ygUA8ItrS6oWX~zzEitc;5=LhoU^H2SgaD(s+tH|G!G;HUSzr&Z!0fJ9{~&( z86<{*2~E8()9i!=F!)PFegVsxE@OxuiI>s7!w9_?DRISLNt0=ICB839KhgGl!h3_Vk5I*w$&Ja%$n^o%~HQQEqX zu)fi__o?w`sPr;5QG@<9pnpy1pDVsYjc*=$v!xHDi=tsRf9MrZqQ}Wc|3vjR5Y^ig z&N7;EFAzeiAnq(1TBO?pnt5-5sccPi&l*x~)IJ8aIW==H2`Bx16mU7P77-F)^&&q9 zwz&s~Z4}fn_xu@fcYHUNKU%cY{~&7bMaEJS=AM60CBmZl%)0BeWwdL6l-`c@POdYk 
zU@`LaI;0^5hqj(YmZr2!yQ4J825LHcCdOG!4UolZ0+f!ifd72P3hEPQaQl5IV+U^g*A@#WTU?A9D z#2~Pm_lTNKvQOsT5727*BmCiVb=kL>d;S0vrs*(Zw)aSJE!qL-0;wysfJ~dv*_az5 zONW@B5Jl02CCq`*ZU#0@q-GFC!)X&zHPHc#^94psU@RdFbh8vpN(|d}tyvT=7sVD< z%$q>5Qxw|)Ci>)ca(%xU6>RUN$AG1HzjotpbSx`9u^v;rDgKqy_6TFvmT_!E^~}8# zHqg!oW!rW&wkM;SYagOTZL@&@vp?z4plVqE<4?y2Vy_m#2st;!A4=Z_!RzC}{DnV% zFbqFE8W@K94^q4%{Th%sf6~4!RpTFtrMaqEO%zX>3_LPalWi&UJ*HP-6?!nOY*2d) zrQ<%N3M?RL<29+0hb~)EUJ^NhEdAtW^*CPJ+`FY1*s(2DwedlNMb!CqCuhC@y0mS$ z3%#T<{3&WM7PZsBB4mS|KzV;{c{vG9jc-zu&W!rW2WqmlKsDc))?B+X8OYWkR9i;4 zweM?(qw6l1M#$IPLoY3|#ds8MK$tck{hD92TR`?UG3i)hCpr5tBv>EKJ%1x`{8J%6 zS7J%Ig%-K8I(u8WwZqKl{T`CQ!Z!43pNdiG`4zJ6_^VhihMp4j+{sD5ypW)8Yd_=d z^=u>^O`84yu<&S4!)?FmM*O8qO@{WsftwG<1>Wo*0Y!2j5_r3pNSwIoQi8-yn0i#u zEeo=Av{nd+8XnwT;sFt7V7+Ix_B6X!GJR;w$XQa2JCBz0H3 zda|p(nmw3CN)DCc=&!D(6lBajQ=toWCqp3SOSc1NUUxYFZu_M7E>IM5G2Z5r`U-Hs zCbiq%YBjI>9z^SkT~J=8(8|B%VJwLSjH_|L6*Z^n z43w-Ne@IQthu=D<%w>N9%kLaZwfU-tp^ecupm)_hADerx209AYv;rVpvVQG$^eO!~ zZvy^wUHpr55h-HgAe-LKwr^^uKif{m+rc-@ z2}V-2AN1sT=t$3%sHeuCr!j@2f_bG)1amZ?(os>ZDhwPgIN+~ zsP+}sHM3$DPAwFqFYi!~nT9QC+72+cZ?iAPTGwy79{{?rgx$#y8a}g-uh0RNyw>`7wMo?S?e3%;s!j(rnOVtK7RVnppKFAx{pM)gAwe*}d#~ z=duGV*1NCJf?>VejVzpB<9cWQ;WSXxt0H@Qw=YumIYZl+GOxS4X@Y0&os^t#-6QIOe*Gelh7f)ex6LoBtcq0 z<1kqZM~zzu%xkb@o?j<6mO#Ihrg2UrNQs?tO4XjR|{c zSFIEXC^5%=N;)SOpK9~9Tc!RJ(_^Z8un8h+BQ36vz6F{w(@AKWFRtkt)@3n&VzaF% zQ83By7gqW62okJLW74kzfc+cnahR=3s{I4GE)8EV{z^6NADDkg`za_*`Z?HJ=_lDI z-3yB5o(70abk;(<*b?pZw|Qqv__hF@HLoMh;~s>$hn{ajDVd>T6ByRCowy^+*y+49 z{W%{$=(-(cwEWcqqOZ4X340!pbT#u2pr14%Z2fu$V7`8pG8S*6e(1{zMu8Tw674MT z$-EM=+YUh*tni`qC@=w;^>=WYnTF2&C*;*LfOcK>r=wTb#UD@8@(L0?g2dRN>D)j0 z8o8%qL;!Nmpi5ZKhW zXSm}3jmTL13C9v{2xIS-Va5Avm|MLV)QcNBijJ{3i3y>x=tOQ|u_Y*jbsoMYsD)UP zvmXHk?ItirM(#2)h>0Vv1x!us_Y+EJ4ZV>9Cdrwdl#jox#y=S<#Sl{cz2YMm?iC-2 zoJ2kxwq)bWwL8#rH8H-N95;mKD6;W<_)ue1iz=bt;45I0i4X&QCy@wEu=w7(@6jeyV)NiyEzE} zx`f@_j`a}+^JGAg)Dn??(-?^pH;pAo+%%3Cl*ocC9c|a^Xb26rB2DjNbDTO^6g`i& zj6J8KgDm2~I%)2uy*2A!C2m;%zK0tT%XyEGc`RoE`B{v@2jJE~q309onT+EIs@DN0 
z{dW`y{h*ew67?d#V(smrEJbKeKqE4skfUPT-yeWB=kZv;0w3q<~UB0rIk`{s;=-o>_4tfQGV?V@` zBZMN+%}3}NQo@L-KiJEMKD?qW}WtG6_!-U6v);yuw$}9F)mk_<$cq5i73auqg#{-87 zx3!qtHc>-XZ3<-Jh;Jd+)QX)*&XZaK!`s#4I~4kWF|Rhncj);N;yQP=(QNueyX{pr zc0GsEh$Kx90W9p!BHZ?yeuuww3A=MP)*l$0&440#90}~sl@cdzx{4ri)78A7PZnh9 zXkD++9Qs7t^>?W1O&CA$!*}%QQ^A9_4dt}1(Pq01%+Wn<=Pnlac-n(deFFyT{ngag zhcrfy5r3AdgK9_u4{!#u@sNbGnq*B%1e*!MacA{VDr-^H0IWJfy3$k>yYU%+R z-uNQM<&PK_+8VRXGGA!4(;y3y=zib`9Z8>rRQMU_N%~_{!Jz#N)%Epdr1840b=Y{l zfGi2{E;LWnDL#S;^eccdURNXe#__s{kl6DW?M1ZyPmS09K>Vl1i#^4-1A@jXCOdt5ubSqjMwo>A_0Vq=x7#%;QoX!DUpk0s7!AiENi6Rys#?c9P zvT^dzf!zOK@!GewnXJ_!^F7tedQy>hSo}D846RjV`dWp2Y>Sn49ERt?YJxZPtX2p{ zhn~=GM*_B=1wpnu)*i)Oc7E#5O8!@#2bc`?(jEcBIp+ZgPUiuBPhx_DY&Z`N~Zdz{xoez(~+}fJKb_2~n=4J~d=RiM7lJ)SHu=9tZh+Oo5MF?~bxDZP+*{`JO zm)|EzkEXLMP(isrW9L?M7z|&1^dh41MJ8h;ON6TLfOz?Fka%Em;OB8Ft#N_ z*R=E*^5cgd;QJX?hnSv#bBWxfB3^C2CsmNzOyiHUF6{UV4#8|H*=Ft~*Nq*Dq7zNx zs1#H+fF-4rrVkFt9kB$;&t}c%z==6a5L#S`@@W(z>ggu}i6bh3Q-_#!2}67hKqpNv zN3y^qKFO zOQP)g6C*dbm3c_VIa)td&Bvx}sSnNHHyv^Ug&8M|PX7crdgsxn^a)Ie-QX?T-uN_` zG`&62&;C(_fuoVaUNPN_6+$xlgPSjw!g1RGeZzT&opiEKo4%dC*K+TI5T```)P#;DiP2l{9zzB?3>6Afb9paKzt_1e=>?ju% z;1nWMhDud;(vN*VosWsbq}#R5*>yO1>`MNEe)oZ`3pj*`qizs5eL+1Q>fqF_E7osi zUVUc3Wq&35vMaW=#BD#me9dNx$Kc80_;S7%Rq;UI5}C^5s&w9r=o|w&?BK*se#RoU z)r#}RI16Ekz6g1{?TDjBCb<%GIrHuv4?@~J{d^S7hS^ln#cO$N}PkP zC0k2W`^%k2%lQe1@B!+K8rxi=?R*LfYkzsoJ5-Y@A>ttpSwD*+%msc}yPxo4TX1Y; zulO=`9=$gAz7FM*JvaYwD{?HVrJ_I78$50yy;6C%Bii0whHy%vucYajG5Sp%W7Ar$ z>w=EO+)Fbj{RaNfJmL06O%^X!Hw_lM3i|6XKy}F=cJXNzCgS6?dC1J?IgWYKxv9v3 zbPz850fn18ooXD1qc;z|jfeX%#4h`yGIMY2%S?tQ*7-`xl9corK%&!i!Aq5B_Lo=#Si!FoDMj9C)_W$-qO z1cZYgs=cNXV2AnIE!q^+b{4!rPlE^b_y~^&Nf_FA9ptn2B62Vmxyb(gqe)!06t^g3 zPx$y{_XCU{{e19V+Zj3n`7}tl6G7Tw(NSMIZ$9@-EG<=vx#vBUIcP~xFxfbpOLylJ?WMpp;Rm_9K3PC;gR9QBd#<{#{4J$bi8yA5^jEu|dAa7+(D z8P(*47I)GIn~JlxNz=uPsnCgm{!qOgWmTF94dbTmVP|VWzq5^)de|Al4&B)WLW=g> zBNQKgvmkl`_tu?gqs3(oY@t}{6L8B}ICTPvk{w>BQwS^`1S$nXk7f0T>l1Z}HbT?U 
zrM>-a+sA3*hskf4zF}``IHiY%=%`#V!p6mn$q<;rfwsG#Y-&@ayg0jZ;Q`{+fV$&@ z>Xd=(8usv-u8uQcn>xuhxs%5i*4f{UINkC0vi!1B%CJXAvZK@QeJ+J3h6N7$9q%-HIOt!ITLdG44(+|8bdw%&8!h?XuLqOx1fXv&VSIm0_ zh*?VQ0t_tqBo9^tL%cUcdV|n;2_T5235W2&DZMoy2%w&vd-C#GPzL zSRZ1*>jXHY;gMYFG20?`Eb=lqTaZR#1s zfcImDN~C=w<49*&xpBm~b(~`rN~^_C2d3$chgotn5^eofZk%W)--5o#S>0~9zg^rT zE}XvG09T-}68{arrT;@PdKv*A)w1_&r%8>-X+L-YtHm}tzoq>Qyb$L!-2~^Oz`13` zX6;E7?QLH$iy{|0mbjCv>g5q&xKi3tOyYQ0n^;hdWff19P9Ie51Cev+y#waA_gAw? zgEwmM?hMX8E``sxJ#rwuUPBNIS6?U87w@*!R9cDZ^$?PpJikjT#7%a;FV@vQ=4ZX4 zZQX)k+yLb37BZ#hGZ~EOF#9*MJva+`iW3R6OYNy+I(sDER)J}#S;ijVtOd-SnEKEn zEEsfL7U%x~p+(rXeVs*sdIX5s0_r7ADArqjW|^lHmoxroDsN_nUs`>>={?AUCb~~T z1K$QZ_V- z`%N@iXkru#ui-h=p~;~?1jJg`1DIUj&6|qY*BP|*#nbCb(zh^YxSPuPemDjI_jh@o z-gYqNJN3NI3uiZS8h1E2tLEN=$n@##vhy~a4)-gFk5NF$u4fVscjE!(pOf+2W!~f* zpC&u1-LN@3FMo{TkaP6UxF>w3k4v~z@Ot07doRhDT*uP{hVK6u#F#@Be_wlD+-v`} zi32gvS_Qiwwo;PO!AY4xiWzF9sI>R|b)1RQOs|tVXCFlGJvCtR4~GsPPEywnLPPWd z=WGxpOMMv>6uS8HPr!?`)aSanh5Z?Enp}c~B5oneyH>z2VLdo#C9*Kr=TnlLy##Ck zPb+_j6Q*ZqRHPO+c!twRoS3o^vo{%7W0p3fp=|sqlm1}jp#SM4iMk$ zvcDgh*aH7`F3i#=w7pd0HG8zj8N$P=eQOk_#WRbNkx6iq-qYSl!61#p`=i(_Q`P{v7$#?C>qusB8BruNNZT5|*|rYuw55;RyCL1au{>=JdwT)nr5EE)>Wj|A$oB7W5Y#T=9NZz=Aj{e#&np*dcya)Z(OATa^eV%s0Q5v@m`xj%1s{KoY4fo_N^y+2yw zp;#i5)Wk*DOV5059%O9)Byv#qX~#y&v>KrIrf9C?g$w5(wl=ia z?_)20(5;A5u!$4vuSy>SCSgBVK4H3$h{aNH07s|Kx(|#Nu zWB=@Z12k4mjWHGy^UJ86O1xA8LAc9R@;MHJH+M8EhIy+tPe2jOTeW#gb9rTpIS6$8 zb(dphFb7`1E!GJ-_iAMv6EFpTX0v$;4ttxYETM`hM5GH(1+XIV;{|v*&Ve8zW3k4H zoux&MX$Zshs|b#nd#)q?QKJ-iB5=uZ`mHS3p~la*L|at6NMwbP8HEQrnKu z^w5upud$3Mn4>GDi&xmY>7>NkY&1JO z9^gu}v!^u^U39=NwgrOOxm=y{KKvWUiu8VrYs7`9L~OOP?1=eFVSE=8A{}M%@-s2(n74k^-Z5helSeSZ3?cf83^rS$kGUyE5jmRTX(PD$2vl1c zM)Va|;#3y(ph*qqMa7%)&?_D_KMekfF<$;493c@eKh>4!#3qaad+MgQisYN|=^KDT zc3h8()A@LKRIG>QAJX@sq?uDq;qJ(#vTu_{?A!35dGtluzmjNwx)vJF+m~&#co^v< z8CJ5C7^x$UpTokHk$H)E6se4#$pV#QxH`a|g8{K!!85Oj&FK9{|0W7?P^*kr$P7yV zeqPDYJ;VL`p~v<0i=3gq(g6NNJlQF9oax#qAM-fQKwO%X4Nel@A2U`j#v_;+cnX2{ 
z)>sUBmpbJ`bqd0GFRPnAD|T)Ave;$5bD++Ar{f2w)ZQt$!CN;@+sCftffLnun)BmM&ME1iQ^MPP2>i?$qH^xK+mK;h{5G{H`{k!Uw=E7SW_i^LMJTWtqRjoxqzH6Xswsp1;c- zcVK+g$>z`rIz9)+FcArp0V?P(Y zi-~V+(Kh}GDqZj%`KLI!g0GHPAm!;U*hNgU!>k93V{cFs(IdkJ9)}t z7a~aHwx3R9nF`>@uPgp=T>&0=>`K!M;jRZ^2xEIiTIdLD_M0bpk< zf4QvA{#LZ}L1uXGMr-V?*$-;tFv*)K5U1WM}3%dBgz2QO3uu9a>3Fm%{!dXekobIPp_deL(&dM@p2{2%U< z*OWy}YGUHh2JsyPXJyN2{%0S{())C5as^d2v1T&aboID-Rj&AZ2vVC|Q$E5|GIi6Z zMQZFzJTZAxjcw1Un_k1Tz`}c`RXy%Z3+A4QvRFshRfLEYzVLyxKXE2H`#F8VOk7i^ zU?NfEyT~;q<)V`@>#l+jSOiz=`Xdc2o{Mm|2agCS`h^G=pX`RH7ouGUf9Nm=4=k7A zE@%;r&jc(#4d?^ZH}(*l82EN2_(CG~Yi>hWx2BSTc+9A#I!E<{{M^ZN3l}7dXVC-Z zAJY53n8Ykrrkb4(xsw+Z7P{kj3ay=jZ&pC*{NyQLBEk1d(HJK>CS#Kb1oxC(>eHG# z8QBUk)TqoKvG0zIQ>WldD?ZWIpu>QppLnD;gElXU79#@my7c+5bEDA7i$*O-9$(>( z6V26XHi*50`FC()$ohV0z6g4z2e=1Owgo+{O8*M8g6Y+i-L!>PpMJ-kobfzaM6h80 zxbt8Yo9dqsGw-0`q06QJ4KSW*Vklk73ro}~&$|mg(Y662y$Ue+1FlUwXGcuh(z~!% zY1{yo@qJC>!TuK9Gk=HF2}(=hf_>LM3G9==J_+oTz&;7=lfXU+?32Jg3G9==|6LL| z_H5gXVA$=&uidvU@Pz}uHdoz3{EYKF_o6utx5u;4W3x@?zj0sZ4R}}i!p^yN{Q7b@ z91MFR^_}$#I@@j2*pJ>fh1M?c2b!Jp>+xI2iyNlnGt;xUo&6BK6TdfHum6;L&X3hA zoxY|>FnrQU_?cq)tIFO`2tSIfC{5k~eiGb@U(3d?LtF79vnQRze%9LB<_&`xYoyg1 zu(o@<@LRg)`6f>p37=EXSK%9m`(ON7VFmbh`ibmz?`of?qtzS6&tylutMaz;@B9X_ zk=VER+rEq}^kKUAac%xf&d#N6old-%Aug1kD1X~o|IOwc9EnRQeo%Tk$wU5RIDSam z;}3+|yiGor%_drMwTIg1SIH|q&aeM9aU*4MUmn48PJA_s{2W$vax)$WEkM5hMm$8o z{jT+y%&kb3UuH5Xq`=v!{hWa#^Ys*#r9`AP`sWTYFAo`SUT zHt>bC^7c&TampjrkgmTolbKPB-_%3uM!E}WBT@_Y?h&L`r1bqCl}K+uT8(r7X&(Y_ zwdhCa&za0MNE`7mt_1xFAw3*vH=Y-_BCW@Xf@i6pTQV8TXtakj8*Zf4+cKHU5$fE4 zbR*Ib&Re~Uv>N9nd&VG-!qaLBv+4afuhZ$P>pX$tAxNC%K^Lpq4`C8U*aW-?zOtww4=zZ#L^ol2z- zDSZP#qn623p?|xOhQMzMCy;g^tscr`=y!TIyqC$`0X|bWrgRPXqU%7OC0oDhM{e)DkN^yR@WsGvnbVfGUTTompBk zZ!=-C-N5p83gx8ci__o7}ZsrrIvMtj?%JN(Y#Wte{5-4O{v9ES~7RsLc{%h z{eC`VrlJ}hjNAbl57VTaQ)>Bf;hfU4D~jfnT9d_dN+(}#a+FrC8|5giijA%<$yyy*hR1}0J{vZgAFhX=KD1{c;$fg z<-n={BYHxO*f@TNtb2@8#)3cD$$$ zi$?4N2FwatRp9GxF?WjJg3-z81C7&7m_PORWXFzlW+Py3z=|MCE=MqWMUj(q(@DCS 
z<)OB;J}@E+^T2*Bs?%3xTJU_Bbf@?S;32P^YSEof$Cg&sGSTQdF8WdoS?|GGRSlZO zHr|&}>r9syCyTB?ky3?E2y%|@1+ckwBRSu<~(1NSIr_oD6{ zsJjD4C2kRQlQfo>7h#ar(OAYz^GYiVujC^Ms(k)A$FX)b)O9}yTlO#3r8!j&8Iuh! z$34v{SSB{7P652+ z1?}rlemv^HT(gVl+yU5Rz~p=-yO{!P24GVN%T0D}nQm9EfsUvs+XCSi{|=UDq;Z9w z#75PY_7xPJQ;3m;93fAAWF1fuWioxR-rfQE`KX<&J;k_%F;EU>PZtBZ62(hsZ3gZ= zgqs~>(2W(>mR1)=$Cg?#KB$DzT`-Q>+RISqg4BO*YiVwfZ(w}|-+Cn4PZnZs81)2w zCd)!IzEv2w`QgXwN#yH_E-z%d`7Y}2L0v1= z#dQk(fm#%A$z*;?6SPUEC&sF>@Mnghn$IX!3W{9c$?s@}|0VUb6=cRk{*8b=3H>Wx zfXt$NYnsaKd}Gmb1+Y*xd#!5o#uaw6jy&z@b1vbV)QoG>@8>{?Xsntsey!m7V&YHX z{Ox4}VeJba2hjCA|FE51ABp`vR>)>GFwE9kX>-tpiKuS6-u&w*Z!&q%t` z2b{;Tk8Pti@8xZdpwEp(-FfhYFr4{L@%v z!&sOnNIMV0hD~q#`q>53XPv4jzbG(mGbx)2OwXDMHWiu% zP0G84rgu%sWyPlZOv+2erVS?L7bd#@4K;=yvnzA`Aq5lc0Gy3av&moZs{+%hq7-e6 z`$>W6rUD{9th?iXTwv-(3k9Zs6f;&%;N{<&h4iNdrca8L-xr#Oij*%4O|fF-$s&S0 zN51@9MFb}9?Qrj?g-+98vGNS~&QcP*?MwDaV4no`NnoD@_DNu$1peQZz=o4}dnr3l zLn1G^sJx%wb+6%RpGcK1o?kEW8zfv-?2|X&PHLg!uZF8KC*PQpH=us)t@MvRhM!26 z%$u-3Fy;@)$sZx|GCnp<iY+T$*bcDkFu{oEHjC|3fgpo` z1VdBccC%}lAbg<6V~fqMgGH(x#cw3MTL?n(*(%z{){R|o3fc8lhVx%7I+hW6fv)Tl z`8FY#PeoqJU6A=dCJBl8mfNAqN)F+_afSG-$Tx~KB+_n?_K9@8NH>TyCDH+r4vKV_ zNEI7z$0AazNGnBJEz)|CHi|SP(r%IViFCb4H;6PP(gBeUigcGqm1&}Vky=GsDbi|@ z){C@Jq#==Zi?mOq>qWXjq$!aOh;&e-yF{vphi?{Q({7+!ODj_g*k z@n5HkKL?=#AT~IPUR^az@8=I z_O}AAU~5VjdoGCdsXXw1NIC+aJu3vh4Th%x(LL)!L?vtA0s>ww;LCWiavR{pPveOk zAo={Wgj0|Q7kgfY^br9M<>0?4;N1dVO_gxH4|oAZoBMdamnX_*7`Tz-+zEq4{8*NA zM4!N?pBE%}w}7)}2`Kvx>?y&k&*6wtp4e@3I^XAT7H7xp`+(E9ScQF%{oM}+p3xCU zLL~kgzzN@ygMSr+AFiZ?JShf5*Utrfy?{&myiw3k3HeaWm97B+e=bL!?@Ih1aJ=_< zqLe^xG`<@JeQDnh0G#-LQ_!K99$nJ~oc-hg^0L3@3iwl13{9!yiP9tBhYLUEbOFCn zz}p2};?ubVYFGL_GXC=<;56>i-;i=h3%K+<#B!z3w=RvO4+j8F^pzZc>L3BH%z;l9 z@Ld1vRKQ7YijW)4bGqg;_;+-_fI}7KTtP?rbrOCd!#@aTP$Yz>Hvm2o|F-}>NJ{fOfPT>80~ZtS`Q@B-y9CD(3WA@HT&>k{}G0k>Sl z0W9{9+eL7+h>j-gKf2GZ)qoc&W<~mg0<4_N@DEm`|1I@!3&Sr}2Ip{0)_vT*B;e9N zm;EBYw2-{MN=ccgRe%@3>&x{w91MP#vP&zjC4M*0v*#GN{fB_d{fd;&65Q+$ 
z`t?gVKAjz)Yo&mP8d+BPJx`Q#8C+xx$<2Tt4LT{IpY-gFuBQN>z;B2B{fN;iRR+Qw zOfhx3#+Zqpy{|I?C;FByUMA&yOCJ2qdEoS_C-rww*q;i~6rGT_ zd?dMr^1yFlboQ3#Cx9OX`=g3=bF!d67sLs^N7yAfAFln5~PcrA3-{h2miA?@cj2FLt5lUHJAXK;Ie9{gQ^GyS=c<6j`!nh6JHB>MC6 zz#H?xw=y{F;c8Bpo@LTCm_U_3@Klh)>G?EWw=g)r zH~zgZ4}1sUq_>S-9GlK~(KQuI`$+S`!QiEeRjfzf<>*Rd9{lqG|0VQc!&Mwn>f3Jw zodG|GZx?j#&qL?sJn)b6z^5KIGXM1q4m-ArH@HHy`+dNz;@+5?mxs=kdEmdw1K*Sf z{yyL&|4JeMDWdUH59j(>J)Z;UoG4uj0JpNc;eXD~1CIbs^mF~9p9}gHp+6fWxprTB zFub;+DHKvXXX9-O+lijJco`<*4+cDVx1_nv7k16<>~#K*yeA{xk!d)Qy(d%UaXK4j zYDK|{puaRk4}7WK39EbLu5wC>kqpY;GLD6*QI98X$v;3 z#z;oc=LTOJ-fgmVL|k|m3ZNNnzK#y|q6@w60+18k_Mp2RK4(2@Kp}V=o2gHV0bPDq5^quo43vH?Er+AtHF+l#KC)O=!c_YZ9r6m zJ~jNW(Wm^DW~4W;{{6|H?|rRrqG-@}{mnhR*++BN)$9vI{E@XTi8)si44So zws&}77x98*r@zSub$51R3>u)?t>It*FF#79XYd;KzT}zyNbB4-@2U=7M$~NnfIs5L zlsV7m3Hw(0LE7)bZ^5H%)+$d6kU_eEwU?z)ANB{s7;Z^s;fXNO0aUlm^fosy@pi0U z66AH9*_S*UB2_J7A!Q4wa~OmC4+7a=64>k zQc>QaZwf~`B2j#oaFf#PnT;7V_srx`D+c6hweE~P1m zHy3>oUo+X~kuW?hkhjMh4tv*nFkIoaN=w+=?(;N9+uPToiXn%U-5(i-%7t(k7&S;N zm(#dmLBb5oETnw3^UPbg)Z3oB55AwILtWttI#$hzqM%?%UeVcBS^rimVuH|IAjQN;f17=zPRK!<$$SK7eSK$p`?oL6+DcLbnx~4V z_N;}DI#&&*-IBT*ebVWS%xaGY+Pxw8P>c`BUsZc39E`vvqyiEKwDsGR1zU}p{T-no z97^U6%9dxe6FbA0nb86t3XFXFZ1CAB%=DtUa~qsXJWCvN+<1|kb@@MWEo`%(cMy-_CxEvFxbp5S?R=A4}AOGe*lz*35d64t7-}cn&H}D+urF7`@JjMd^6T~ z{ShZNn>HI)6c6pps8Qx!4HwJwN{D5iPb{Z$)SPYJP=~L1iNAd$mx~DvhR}nK2G>9q z%^qfda`}T~XBv^DU*y5=rEM5;g>43i($WnzZ^(Xa0*Tr64d0F6@icXLJuBhV1wc5` z)LIP{a?EjgY!%bFkiJQU*whL|J6h3|rq$%(xaKq6a(32+*vw%~z>YHql8?A&TE(=A znY@AFdPgH?)QSqI9-ICn4VJAE-}B*bS?htjl~c`T8%F&<)hE8rFF7MSPyR!S@V&68 zeT(Fz0a0Ma{aqex$06@kfno9vhUaB{3|xk;ii6nhk6{^cl$2c~#UKCKJ@= z!wfJgG^{3Od`k}h#?A}7$~x~icD;fr)G%A-58xYS+Tl&-VboZiM*EC~u=$t_UEY=c zP8*qkrZ#U!he!9UoJL-0P;9UOK^Rds=gB%5NfJjPZY}xF4=Wqls$sojT7g*Q(tz+G z_+VGb!Di-%?KA?uH6A|nIrC2}CI9jWvjt8J0v0$PAJUm4o5c(ROmmk%qQi5|0z^=U z-I^B*H}5NpkqOg`FNUGDK`vPk*bMnyK-_obfrWaC#SRl4ThH?CSaqqA=yk@XlaURq zX*bdVq`3`FnPJ>2NL<6zq}lhcsR`Nf5x)+I0%2Ium_pE|8_Q~a*cZar1$lg3zNRRx 
zV>L5eXx&b01k)VXV(R;Rr?V5&Cg&@ZoLJ_tEn3Ky%^9A#OBXUfgo_0hLhj%(P#6H% z_NLa=o))jaO_#|GEO{I~*DEkA`PpfL@UsC_{~IPQTZ;b`e$Y(A zx4IM<;yL;d=>ZWPBHc^)piS)|z2Ev29BIj|!Ah7rBvJ`@@JGz+u#Y)7hN+>?E9OZt zziL>UG8zm1bf2mwh(o6UunwIb^&uF-2Tu)l5FF83Shsq-b@aU}aN z)6?LI_~8bV+sHz41})H|dLX{{HrqHzsNB}Uw*Lszg>~5pZ*U})(7iAioxPRADAXB0 zfcvpGiqs!Iz8=>S3KGb88g|MQOSc9*N^DOptMB5?Lni_0(DizwM>YOt5 zWlIb91H4V=M*ZPoen4J_lATheF>TgyULaVo< zRjFuR8vrq$M#8+L6QLb?K*!KT#)C5W2wtin?m}%5rGmvvDsWq|Du^uIboiQ-3Rrk# zSx~4V9Au9%D}1dYB-+{xYBGmrnwqdMHu*wu9hml1L{@fmaN0a($|=cvqDIVsFk{@? z?njGgpPC1Ro|#rK3JBqn-zOgj|EDP)Z!0*ui!+4$8WcA&wF710YzW#!Y zIQJpX*~n`0oUF_{aPu+VS&{9__jF|1EzVQOdb#b>ITbpSBJ0cdc=S{p)SzP@L`v4j zu@iPR;4iT%>&y3oWGdeS;@F&qyr*Mez-4{;UXe`YdquK+Szq@55>bDSASmBclBs;JiR$Ahhu*%dp8$+l z#IbgMRSr){zl@jUL&DbK##rAf>dSP!&?AX2&!@1Kr>1jEnM%BT_5UlU{(z`2 zQ{#8q5e-RCroYaqFWE&*!mAzy=3Aqh7M*ihFXPL_LXc9kHes_xca{T4{7&0Bu z3pfRb%mJii~hgICJM&&|ILG-6SzS#JH@@sj+@^h4A&8|usV5W7YD5)UZs zl6jfZ3rW;P*}gp2f0?MS@oM^CSx*ASqcY7sSzn%45B-i~(D(e&C62x6^H=tZhRfJ~ z_k#>w=_bX()y@89|H$@bdJrlb>nkaNyMd>JRA{(lJ(*6-sW0Cn7`%&Dl6*=MvYyPB zqdvi8|K)oiyF`6WB8t?Slb<5mugrnV`cre@vOC5!XNs^I)<5uygQ5dRBD)YD$-lIp nv^?p&^D6xBW}ZBcb700xwku&1a5GZ<=U?F%({d{0WR?F1mZhy& literal 0 HcmV?d00001 diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so new file mode 100644 index 0000000000000000000000000000000000000000..144e6dc6d88bfba08b0424d1a4b975ae430924ab GIT binary patch literal 166144 zcmeFa33yaR);Hb_1cCw`6g1;T7~`0TOBx6y0+NsfZc7IPArhC^grtFJHj-`#Dq}DS za;NQ?;JDzp!MF^L3PL{Ucs6u51a%@J?`zTY`@@9oN^`*ZYt|IhRP zo@d&J^!=UMPMtb+>eQ*a?Oi$U@!jn9?l%3?-FA)5ZUB5|7|#p48Bd_4+0Mk@1lu6a z+rvyb{bBYEf@XZ%;smBPV(4)?{lRBC|24lKyH;>ve9Psyy_GtbmhVI3MLov1T&^dQ zmVGD6EjvT>$M{a`E6Nq>-$psgMdb>A5akN{h;qhvhJZHXJ62RE*27iF!HXo7$jH{$5|{&uK4DFBiW_^w0YJxvm+;x6$5Ylq0$PryqUTZBz92 zqK(6JRtl(r{;iKP%aZCK@`OnRHi)N%crg^0o(Jfd(zg%Xf3T0z@J!uP3+DCmH`=d_ 
z`~06sw)MxqPw|h$+YA4eBrTiKcJGr3wv{&9AL-y!^?&td#q{#n}M-<TQm2MFF3X5!mO(nojL0MU+uYW!;tMq-(22r*SRmQy+PLkuDZ*g zcYq9;$_WS2!Lww7{``Ctde-@o>-iT18>#;HFh3&IAA`)1^56W6$oXetU?b(zpk4Spi!e;3LYiO+vWsYi>_?qp1&NcGH(LcbgR zjg)_6lyN+7VC4KmqU0ZXcI5o+$R9}#-J{51E;7?xvwo_fSCQ&@2Qx2H{_c=Nr2He# zi%fqo%DDG~;fPev-YEQ!jKb%>DE%4?dZhaMK%SBM^=zNW`3Id7S^f*6@To=N^ScWo zxBCp{Vm(c%6d^SeW^FvYWk1tBQqoeSD zBkGUTZqF$7w?^TAZj^bt_sqz0$c&;_=R(gT@$-DY$n~s`BDZ@`PbB$QqF<5b#e^vB z9)dh0@$*;|{(DE!t2d(9|MxGn*?ws?s=&8Kk#i9`6^Z|+Fkd3k$4AlU=c4GtrBTM~ za`2Q8fzN?a=INX$c4=Z1`P>V?2R#x$*G8F-|BNF45cDdNK71Qxo$VWCovJ=J@;rJ!O1sZT;s5g}^N9Ky zNv}SO(%hEV!EI~(kP>Bl--DbZ^|v#M9`1;u=Wj)kvp0%942{yS!YJ#` zq9}5!isGNVhE7G&!<;B`n2B+Zl)owpKmI8Cwgml(v|cnsnJ@Mza#$IK&tp;O=S1P> z?kMB(rzrD!U=%-jR+RZTGfKbqN0~2yDC^zzQRdzE7?eo***%K>H%D2oAC98uKSY_Q ztE2GwW)%B#985;y(-y_f*I=GT(zlW*`?%gw_K8cPjPH&p^W`2i8mZk8s6P_DFp7Ro zi!$$Si=zK~qUhWGQRMc!D0cP1D0ZnJN3v}QG8ynboxXRCjS$+qjomu(cO==G=Q^(%UQ74oF~wgDpJ2*H-E8F4D2Tk{`Rc0dOh1%&(ZEZf&ek{XSU9NhtB^5ou9X0mzHCOO>Yx~&3gVq%y*Jco-QA#2Y)Vt zFe!gVd)NHqbUs(UDDo}P^T+A(*`@dQVV!gqfo)K*arkmsh@JU5n>=gqA^ z=A60`&lG>9udKqGQ&Uq_L!_D-Prlce@2e@RoQ15D@_o)>H6oA4OPR8V=FGKu^5#uWxo$sc2o#BRIKnQmU`2rewBrMBe;q#Wj_vUMa3DE%(-B zIjKS;J=O0kE3b8qDD}=QEAe{Di*KLrDXT2=0}%q{65+Z@#;#a+YU&O;rUwgqt~?jcjIv#-rpmPs!}tJTr^S${|4#u(z_* zQ&Q|JnXTg)7^O;saw9}E(rDRRSK_Vql~q+rT^gG1t0}ImEid+YJ!ZPcSLLam4`Eg2 zOqobhPW4EM)xH|%uy9^L?(L6fbEbH5JvH8PZ*eX4U$9?;L8+;lXY&+P&MQM#rg&@p z70?cg_=XCxc}l7<-d%-PP+40xtI}IaLKK2zxfGX|S3xdPxAYc8d%E;l0rKXLtEwtD z6oW@4vzERxD@(7N>#dntP7GQ&8V(JeU0gZKn=J;k*vJ9(@L_nI!&#@YW_wF+8xNe5 zH?bstsOMJv({=f}5+0dSDtMYIL-SA|Vv^8?8>hNXE2Q^uW{p?wc7?a1qe=uIf<*0G)i?&r*r6x;@Ywjs31)~ zR|%Cs_iH>*pE6&qt0X7C7=LooF~lV`UdTOX3fF;*=^hWu+*4jP3svw;rM{uAl8S2Z zTwXQxiQ(GT2F*rXqKq<9ebGBh{(C@v!IXQFlN^YI%$)S8@voM~bw-)OP zm1J}9Q*|Oo*Z1*WvQom1PpYb^;B^7BNEjtbFrLZGPpv5~@y>w3vnFH}`)g@A>5@CE zs?z7J^Xaw2h+y(zVA&FsdMhwbX|4J>8m(Kc>T9lu^tmw{YoXACpE&UWPxVlRmT^m! 
zEzu1B%;+2oci_gU*UPN`=f!i-@YL&Jz`{km#T8Lnq|9h$1T8w{Ufo3hymtR*g;UsL z>DuRXA+joAUGNwEh%Pqn3VSre#o9*pYEp59cU&s31v!)QVLQS}IWws5LyNH_6ql4> zeXl9T>~jvE<@IHTvB(8mR902%0YBa>cGg(JI)6xs#1)UjR9Tikqd)n+A8BMbdvbY^ zha0-xWqftN5Sd4KJS0{K6s}+`+}nsfn1Y3Kf(h&ALfBK$A;Pvj;^}w zL;i?E@iT3*7&4>op%|>Jg6vEMPIE155Dlo2nvT@GnzFfA)(dLA&d8alirlF#xr(1| zBgua#t|f;KYfb)ep8FCbPP9>&R}h&2O3cm6kDjQw^8PQ0U8&6li*Izs`Rs44o|F-9Vt&4_T^-CU3tic zEk;gGcJ*~rX{S+8Syob2>h)yS7EB$Vh8<15*o}{ar=M9<^t^$oGBHs3$PNBeVZ zip(6DwfSPxEfb}BYEgTa9JL-jmG~_&!d6Zt@mFdoZ&Vm&ST8;S ztBTK8V`g_FCLvm5+5c4MbVS%a2Up{3u*`YTL>5?*vAe9&JIP-$!&?J|`Ei%})(9@O zRQot`YkwBIWFKKYXnz&)PfH$rqD&eDhOT%#m$b%BXJ3e^trSmI`E3joODXygr7#AFl9UJe+g+at1 zVI$x%w&~C}#ESH+PT>yohEvaXeQ~*84=CxGhRUw9p37BP>aFWi)lh7d&0V3v>Wt!2 zkJw2ICg5#X)lir!%)*GW%DLF*AYaWa#5F3-ZVWB0swhUFMlc6m_!%3DktJ0X)x|Yl zjuhn0pWt%mlV*5wDr^1N{eUU7P1-!pUsmpedje8+8K$5f_0VTj7Wy#WUs*z-A&@d_ zW>H4NZcu#6oNQbg6}qq>O)K-w7Jbc|pH*C4i%kW!l~+}Utr$&llb zk=LYne6_WvC`BeFiBLkZf=56Cbrf-}lb5x^;kB+T!U+Qo8=gA=M3)CEGC zl+UZJ(J_gaSri25+J%D35(@8lOGRH;+?geF{AD#>Q4o!rQb3-mv&(8F5fGn}K)Q6r zRcnMTEx8P9DwoWLfPLO95n)7rW+cCOuE7N@;OQ8Y3It<)UJ5-LqbpVgb9|+74DYBV zBJBdGepvNsHCS^BDiQvv6@g%~ir6eu7|BNz7w}KD#_&CcR~MJnxM=T}HC-0C7?yO(mT&1YzjqV$((k{Lu)^WX{DaS1Q%BekK%tOTI@N5XfsK) z6bFhT(GikF@l1Im3pBbsDJ8NvJwGl@mT6fX!U`jJCjc91hO#2p>l}^+veJu%#TW#H zN)W*lfkuj#&+-@7U=YMC)gwfLriZ4x(28eN)%?uZAG1>9<-n1r*k6YQ8v(gmUri~W zO6Znninp3`DKOMa(&Z>nq0+-?h51jYp@$tI9NQ@on%zJm}pN?@WkJ zmjc^GN(2@0OOp<>ORH8Md!2gT30XL9pa3HU7b!+YOJJ4m31O3@FY8929WkIVQ8$0% z$?v9xh?hj&+vVr1Dg@hgIt(){m8N~=3}RfS`m3>E(z6In`&@{{ddWp#Uzmc_KW~6U zm=VF|pI1D!Mha76fnqlG{s{nyQm2eQP7-lfV;CSw@rZQ2u$~|MVE!5rj^QqZB#rYm zD_uxloa11tT*huwKCGnElgoQ&3Lv@M@?K~t4*YOBrd#WwIUF^jBa)w34Z}NoKhOx>@V-kw;NhIkSo@CSFE(@~g#x6wJ*}VCd(R>&g`!R@9b7VvJ_AWivZ` z-t1yJG84h@YIug8xn(szteVh}sjjZw#V%kbjzZBPKcC0t*l;+5ePzjPkA9MQWo=cM zCQb>vkV>lkI>zy*{#5HLEg3Qd;K~tHc2)_Fdk8rRm6&wV&J36rEYp<1HX$p^GwjL~ z+XT03T$X3(mCoVj$p!;HMb8t7ykN zady1hM(Kh|PvSqy-yNmIf3bRccUx~RBTB?@9{PI*@=;ygY-i$c_`hy=3y%<`_R*ih 
zsr1wX*MY(m%6+EZLO+RaC@aHFtytR%9sAEK z!wr}owwHkELnAdwPa{10_(Qn)(%-gLr`2qE0qw`x-qEpoS2EK3**?;-pVQB@X*w?E z@@Lq-(Qz%O_qO#IB>Gg%>Ah_Ibv&Qbd)g9oyzPbCa04aAc7=|8$@k%AOLtqEj^}Xs zZnkV-`WWx?B9r*ki?0>>`@RM};Vwa6X`#31^cD+!rGD>blZ9TrRMgX9p~vYLYj#-Z z^(RDr%|hR#*MG=Dum3^hKW?EX^wcln=;e%lwLB>D$64sRbR2J?cj);OEOgsLqMjrR zeV3j;#X^tM^QT$pDY|^BE%bzqg8#)9dWKGKw$Nwmc$tMhUC+P5LO1fSve29L{2dnh zN}aA*=q)~eb)0O_s zi$b0m7J9vojc=o#o-gV0x8zT_SB(2I3%!q?e}#n}7ZmwhEcA=@{F^NFxO$O)mxX?n zo?o-jTbe|E+qqr)J6rE>tc9MiP~=ar&`0U@Ct2vrbbhKWbYpz$EcBK7_|{wKOLcmN z{!P-o0{?JblRt_?Kd95_D^H@g0vq{__NgA3Zs?blZnSTu8~j@7o1(P4D+;|(q8}#4 z_0M#P{;5I4cb!DPL88Yy#5iJ+vdQ!oDL<|jnx7p}=q>%i^~m`*>ElBEmFb71dOFS8 zZMJxQ-cf$t^mD2bg`ObMub1jsD)B>g8J}em9alch&kBitj!CuIR!VeSZ8krvB>I;o z)n?ly(FaNN4vC&9(RWDnUrF?3lH6qaE-8PKlz*v|U#4qPe&Y@)6+0x+he`Dum+0dq zx=qp(@@tGwtVEw;5b+%+(F-N|Dv3{-o*?D_Ldu^c(TgN{ibVHF^fZZnn?zqBwJXyz zr2OSlenp~JOY}U6?vv<+65TJ+r%Uv?5`DHrzg?nNOY}P=dYweSQ=&IZ{gUbRQvQIH zf2EXPrZ1NA>q`!oYL@8tN%bt1=(HCW-!vl)powzb4UlNc1%leV0UkQ=)4U{T~wjkVM}i(T_{?of3VqBp;bR zTcTG<`8y=~pCo#GoUp^V=8nBl_>&;f@rsH0Ns{PzRmJ?INOU;-<|j>})0$y?G9)^+ zZG04oZumNsmM77%;W0mj5*@E}n4jqq{Y;aJ{g6cOBhjlRIyPA5r%s~d>b?1?m+1RV zs?D}oq8q*dB{xg-{!;#>68$WRzD%Oy6&&-kLZai<5c9KAq8slhP~s|yey*7tKfWW; z<0bkgiT(?T-XYQPii!EzA<^*)g8A7c(Xn+jKbk~OFsWEyB>G^9eq5seQli`XhxPVi zi5@G_FOleR68%z%9xu_8C3=EHzf7VhN%SESJw>8lF45B@`jrwrL!y5r(G`iFBGL0C zx>KSTO7x);eY!+9-|ay9Y>A#K<*%0LyCr&^L?0p1>m~X~iN08(r%Ci?iJmUemrC>l z5`CFOzgnWNkm#c%`bvpDTB5I#=wl>$i$uRhqHmJuV)o-O4+F41!&y6vp6{!ftTu@YU8=y4L=CDG$0`W}g%AknXt=t&ZNqC`)T z=x&LgCed>xdWJ-wB+(U#ew{?mljxHrdZ9$mm*~?a`c#QNTcQ_8^lFJdO`_LH^cy95 zy+prBqA!-{H%s(piGGVjUnm+)TM4vCwQzZHViJm6We=X57BznC>S0wsEiJm9X@0RF=61_p9PnYP8 z5`DHrZ<6TM5`B?GuaoHaNc4J%zF49!mguTPZkz68#Sn{kTMb zM55aUg!TVXi5@G_ACu^D68&+B9xu_Kkmv~#eVIg0lIVYw=qVEYDT$sY(Vv#+84~>& ziLOZW6%sv9qCYFq3nluWCHi!U{uhZpTcSTN(W@o;XA-?mqW@K**Gu%35`D2me@UV@ zOZ2}<^raI0Wr@B_qQ5H9S4i|#5`Cpae_f)llIVvddW%F~Ezvhg^fx4WheZFoMBgFN zLlS+LL|-e>HHqFL(GN-VbrSu!L|-q_ZF*e27ib$KdaOj>DAD62`sWfoUZS^2^aP3C 
zF42=D`X-5i%EYY`1^k#|vzC>Rt(La#r%Ov^^iM~Rje<;ycO7xE;`YMV3 zu|#i?=>L-FnB=EK4qVJT3FJqNJ%={PZHl?Y> z*X>k?(y$>`cuS{J_keul)Fq$c%Q47C`FWpyB((d~sZ-62?dZ{w&=wQZ*5XL0)x_Ny zzi8qfjF+1@hVc>;_hh`##N?zM3C%HaEaPGm_hvlV#Ah(hG%*G6kA$2i?!$PniTg6{ zZ{mK8?Iw0GK77*XZye*@ChpI8i;2%-+-l;p8NX=a0gRWMcp&2?CO(JpLKC0Mc#etV z85f)Q7mO#H_&mm$CO)6B)5I4r9&F+Z8TU8wMU3qx9>nK730G{82wLSyxYW1##>B0lyR$xhcSN9#KRdcH*qTCB_yX#8)%UH1R0LP7{x2JlMoz822~vHH_^hra<uB__U}@j??%V?4*i zg^Y_$d;{ajCccqzripK2>@@Msj0c;zh;e@t-@@2#Vh`iP-x~d&&Um+piy3b*@eIbT zCN5$8qKQixFE_E5@e&iyWW3PCvl!1YF)ctxLd7O7V?5c!w=&K&@okKqCN5_@*u)i# z`w~6O4-eTe!#;qoGq#)fF2;wC8U3$kyxYVJ8E-N1-HcmJ z+`#xn6E`wmZsI1!OH90o@j?^d!+4H~7c(w4vC4R|iGRa5)5HPBP7~kDc(93sjQgAT zw~Xy3{vG4PM~(hBGv00DC5*S2_&&z1CjLF+7fpOWDM6601A|BdmBCVrXmaudJ8c!`N$WxUY9@V?hP zN!~ZsP4c&ai=OI=3oJO-f^#f*j0LA!@D&z(i3MM1!2>P0uLZ|g@Ci$QzOmr1EO@U4 ze`3MgE%+S^-e|#AdA?#vf8K(hw&2Gs_yG$JT5yvEFR)<01y@+G*Mf^IxWIyQEjY)5 z$5?Qx1z%yomss$H7Cg{``&w{}1)un1SN;9Qg1@rhy%zk51#h?DcPx0L1+TH-S1kB> z3x3*yAG6>GEI4SvO%}Ysg8dd;VZmMtF0$YP3(mFR919*}!KoH}g#}+?!53QaKnw0` z!7&zmV!37fE%+-7-fO|1SnzfWe#e40TJRbRe#L^Hx8SEO_%RE9z=DGo++@KEEZA?s z6&CEZ;35kyu;5$^&avPz7MyCqS6J{R7JQ)v547OE793;2Cze^p--5rg;Jp_7i3M-B z;CC!|qXneINens7( zG-!5zhIR~|IwjnNh!K7FLgZL!7@cVI#S)AKSt~)|9Ovhuy|Ff%)`kLY*@;PZ?Ou?z z53th|fq0ZbW1`p=#9y?_k&d>smyOM+&dUIumw$oYg_qlOUcN?YYWxmFaV1{vq)I

FWR^b2ZNWCdSF;U`TQDSts#2itgn^=sI|^7#3)mF^ifpt1++=a(KY8C+H)2t>Io%qSDeyx%GXB; z_!E@mliHIaZ%(+60n z@$>~AW6`K2jMpJq8s>!{OTtjJIJ7|tONIVx3Dh@J82hyZ>d74_?Q991I#cXb6_v#> zko)&(Xevfr(=bq{31Yvt8)~JEreOt78<5AX9@Bmx9LUlX_2$HCw_4t#8YfjF1>KpIyGV&vqQ9NLR04M<<|(7@Oib|gQv%tE)0JSgT`N5-cz>JgyE8F`awLfy z8JuGra!||HvKpCnYz^0ufE=CcNCK}4J<(1cGUo9Hw5`j~rM{>6N%W{}3{|Fn>Q;BL zLZxVLqXZ>PMQ7>zU&K$?RD6PpwKNol2)AHnXn)6u30cXef!fz7UP;V6j4Wcn9aL#B zby+HnE2i59t%y1m7GrF2elulO6W+2ZVP{aQfbpr15KF6`@>#fxT=-( zPyO@V>Su2Cly(_Pgyxb3)l+=^l|W-+J!QaB; z5iH2lCZbxmx=*))kC8Y71FwVu#rVxyA|~uKR6+%LEjXyH(wS)1$zAIEo+pw~Uy-P9 zCsh-A-TOm+R64ANw6H&1`mbdXMod*4`;d2{HoEAYQYr-#p>44G4r z@f(~bCzUFJ9+!}%Qq=6kcqK3a>tJ?bER{QhdLOF||B5=M)8;9oa%}!SBIP60)ag}% z<%yWt+4Se2){}DMdnZmN42whfucU$)mROsAU8mC!iEff8L(t4%G<*jrG-4$*5y3!w zsWL?Hicz|SMyX*#5_+_LrWtQ5P=Z%Hzf_3+0?0-QjPW-UJWzy1?+;jLLK{&V=U0Mb z7E;z=%W0lt9-4%PA zvgM?204i9ZR}gz@m*C4NPkpW*(-&oJ{vicQ!;@L4MgX6841wyk)D1}{wqQJ)?MI~GD zN0LJz2RA7Oo(j^?cQ~3Tkgkjx_yn+{kxmJepf9lx*lW*D>_gqtz9hb}Mm8Kj_Bnrmc>qb4byG4fP-?V@~0(*3r`r3}Q|f&#Fqok+ zSf|f0B|HXmFb0MC7-W(fWY~_@Q*hkw>#K~4IestLY`Ou;palD#d8OU9+DWYkQ!m9c z6gBNCXwyv?W$2H#jhbv~aWstpR}gZarWXG>N_#s>pVHbwu$w-5>Vxc|Yjwef=IIam zus;OYJTpAlsFzGlzM_7u1heAAy_6A3V62kf=7UMdjZ*^ou}b6`tfs}yyVqE3uc+Oyz|*=%jSU@$hs-c+Q|XD%m~QM-eg&JIK0l=M%0qqvo(t-iCB zhVSgY3$zKWNN72*08^_VwynpxXna)XA7(Z?Ud@VI+=~k!v2(Yg9_wsq4tGh|;>F-Z zbE1}+j)N`ux7oS1e)^3kmG+DjViMnq{+&nPGu6Dp_6*oM8~o@ViH`m@NB@jC{7Z23 zSJLo38~^GskeEsq`H`@&;Pg})uCn{Ot#1x}zn^4m&L`sMJu=e}^7jyMTo-+g#Uw|+ zdn-*x9gX$WeDarWH8IiE@PlJsHoA-lyMLJD)g0U6UhY6*B1mqvBz9a|k4qsDnDel* zC zx>v~3v}jmk!22QRR^24Ty0Nsl1DCpv9m)+}{$6e%7Y%K8G?Khsfn;>irEber{{c;b z+Pv$#ghvqaxm5{V7wg>WQom5tO(^Y3-{!xb-0t*u{U0N@EjN9OFJEce>^nyZWKvGf zxsm2hA% z!=T3L9li@(?K$8dYJqk-9~Ac9t#)+o#QY%L?%Ww3Pu*VgfHr)Y*JU)>I+c=j+OPFs zYrxj2SY_+vzY~@z2G73Q)2tHPZ^J6l>8hCEj;1GJu7u^e82xjs{a$+uBclW^ycrdt z9i^eWT~jdoX>$xyM#g6VdjkGK2qMqa2ifk_FDAP~ixBLN4a2?q66$?0^@U`%J7(Rri@6*t~cc7*Oh(Q$VnW94ZF$uKpP-aE#PA;6O|NGsfj|a35NPwWa0~dETYn%511;>1m1Svd^ 
z6v<%D+jM%9~)=rcFYk+OD<{EB<@nj2(oKL~eMQDN3;B2QEf` zQZyTFLW6%v+=Zg1_o?j{x?c?-e6C$2;RY7aXeN3y=V zX=urCB$pFgVna&_w27hbW%$+{=q`;4_;$mQsH1LS891s08xuQ_ftbNp9R5)f@_|_b zNZ5pL`m~9xHti)=vZ;w_v{6YR8&MCvU!8zFn0kNtmDnnl(_Db~rz5y99a8gQZ;h%G z=uaMPoR(33xHEJoiF{l~BPK0$Ot79ygcKsAw>a*(6c0+^-bA{;;N0pwiVeOid6PDq zIf*H5WC0CNR04k&Y_mXoVS&(|Jx*)~sk^)#4D4pcE9jj11BLK-yYPw2nqz#P@}P5V9Ol@hq0XNqam98GjpLs__i z=9QTHnYUzmr1`IPqWaD@bN*}3?E!w(#7gjG?kmMQcA<&T*Vt{s&LMsl+JZ-I2nL{Y zk(pon9{ml~!uElL`vBVcp8=PIaVHib-WzL6kdv$-OibuS_#Vdd8~QUMP@%Kos0)j^ zG4zQ$2wlE*D!e-TYPZaE}YN+Sz^0Li^4F@z>5O;4#K9!^4YpHxNo|~HrveEm z(79N9co!K3ZHnG~a1~O_uC<5AnUfZYQb5U~QcC7DvY0OQ8}9hRFXLgN+<{9TUT86z zv7lpOKMOi-S2MV)rt!r5Gpy%bcb{%OPq&=fbnx#A6v54mSwnTEuXi-jRTa4AMZk_m zx`#t+^=P>Gm~wQD)296a18(jh`a*VCc1P0=LW5<~zJwH@0Oq(j8ZSc@A;(=D?jbok z)_#2|4|b zqVE=bYl~>4U=`t79dietVuMDkl3}CZ4eaqDJQ9!iio-wC9`7Q61P$Nxfjy?IHtm<- za`jbsg7tlTFk3V2094r>D2a#n>}Z^f?vk1(?<9k@kF1@#PJ0VkXp~yGWRV@dLGq;GSnxw&3tt_`FVF?;1JbIG7K=uOM8~po&tDo3g8_J58=H%nyH3B%y=pY zwJ_SyHi99yx_k9G+(PQ2K}ZjdaX@U^jnLq*|D_Ef`Oqq~7g5Ym3RVTyQ?jpTLo-Qq zqLDfc>Jpfj(D1HZd;cTkeFo;YIT!wdFh~crJWL1}r0X*suVVZ#4H{eg7l^6!0Cuvx zl_8Uq31L$*jM70Zfc8W8qHSTX$=>e*(=?EWFp#0A$;gv@F!`SNmoQDC61WVKje1mv zWXboLN$W!=q~})YxhHx_#VNm0mds!>=#s6YeOcx-nhe52)FXWn!yJl=;3nx+2AIc0 zfImfhpB_p>FFQv)f<^Jdr|$-0pePtmIM zHiQO!iD@qEyRRTm1n$UGn9=}thM0Lu`uqO(C@u&G<%l*G7iFmwGW)kF0pbCfP)UUi z9%m{V(MIDgw419zN?fjb$UOu(T!^MQ8lMK|3TACmjuV50&>!)ET4Jvaj6P05O3{IO zM`J%WG;Z>sjyt!y1Fj6$vBTK4fw>+XE_Ew;^~p*AR%Z_OmB$>7bc0k0VBfr%+><%j zFk+jEGOqNkzMV?og*;T~3Qn*qD)zcfUpbm8iJ-PO%N*vWA9vhy8!07+G#!oAyuf7h zE~*FbqTp~ajY7+qNa*^8!kdZ^Hl%F_S>ezfJJ^2!v+?dg89IP{O!`LOSly3t1ujo? 
zK|ZlqXt2V3<7gx!fCI12empNQ%h01X-ymgFy#I)+VPmX2{qjWrCRandJ#-y-)9a!0 zn551#nib?w(hvCNVZ0#$gicblLa@MXKuVvw(tU|>jz&L8G&ml4(d6YU-s_Jm>N!gA z`r|Gh#PsbD9X4Rev;|UxB6vQ6An-(!BvO9o; zKtY@0tr(Q{TAALR9lE^;oKKe zPvrM^PzJ7(_gR&KzJiZ5)Z>lMOQ(IJ3FH*-}6DGH~L!zeTZcBvsgqj^lfEuQCTt;@h+v zz#j#R>%pIhhfmdmhp>OYj)l824HFt>(AOV_dZ^9US9=;O8J5Z(?qCTEoPy4DrlR); z$0HMKgaq>4fmx7iFA5M|ggwJ;=u-VCoA3HevdqD}Q<*B}1_Cg?0XKmw@E=;Wjm(hH z4_5$T5_Lbuh6*{UP#?WeABx(%WoC%)d=qk51+_hODAUo3$~&K(*(~8co$pHgJvS#Z ze{aE(QB${^#@_`bybHh^@wqmhMib0-Ze@FHvcY_&pT_5UGedmmYgRs&hc&DVzis;b zX~Nk7Em`_O1p??$6y4tl^+48aeF_O}hgRWC{1tBCqH4pVfJVu`VDpt>qUY3c>;T0~Cq&Yg+9V zG$#`MpTXwMCU!1R0(J3Pl3)OoQ@%5JGHS0-?!bb0BfgqMS0I*wxRye@BG!SWnt}q3**IMxPzqo zMHo)3<;QBF<~_EfX|xT^QMz|`2hws$?sRwG6}Spfbyonf;@dGN=tQDM_yreYR4^kP zjZYz)_PY;hRydmIeU98fFE?HD5Vg7BOQa_`8uw8Pa3b6kB6bIE!{S0=z-q9WS$_?& z>9V)c?kN#$@(cwaMqdUdPX(*}v2R@I@A+^h0T=4>4yswrwXr_p4vuzH2aSV>OY7lE zefL(6wlUtk0Qq)fLj?xzKv(KWM(6_>2pELXiQ1odbZ8aDTHU6laoetA$Iz~Sg)7k0 zjY?eD_oJu_)hOVfy`+8^CO4uEbfS)QJT`MI9UY!#bU0FbBlY%7lhnq9T=}>Vsws``}hjin#jwzR^lM^$?tC)GFoN14fK_0@sDc!j5$UuI*;uCPf|o zLFYO{uPN@nc4o7P4e+TBpQ5PSFxQgYfq8L(3Gu2sv4?xqyaY$%6&QCVSZ9Zem#ZGh zd;`$tTEoplAr$qy9QAm^ew^}t*E2WR=Y2T0O7amUIJ^U4^W-+fq<@_y_{t$?fZjjK zgm>UpKgw2Hm2G=S@rc-P5VRfYx5~C%u8Yrd1$z6gQ+jW5sW9~i?Edwus9E(xq++M2 zoQaV=wpF>fz*c`}BEr{>>~#v-yXoacZq9>PG-6q_@i%i8PjkgdTYlc|2>3i<|OOT5+5NDjJN{Q$~> z)27#nfzESb4};g*bJe$kl2|KH*HB8#K(OX_LhoJ{P6@ zFP9^G8ysdAIE^DGIXHZaJNbQ56>lb2)HwkBHLSNc93Sf&-_XK3;B%?1ZgpKQdK_>- zanui8+xC$5yLxY7WjNP8cnh@M<;dBhsOL6(Ki0p7TzconsKvipX=umfXJvYgc7Sm( zeh5Y!^&!^c6Pxwqqgw3SBtY73*$?KbmnY`ZUeSfDaUjN%2C$QuNrf$=->d|GjTS<^ zG4c?B5DfPwbd)^P;FuNtv8R9LU1;2!$5}y6IYNVwr0b0_ANhPIEgMbXmI+P(U#8&; zc;eJ;fobtB^&NOP*xq8YC_{R=)D9)sXB~EY2|!BguI@_mCTx-%_xu*6CItsv1`Tip z$BlJ4Tt2p>O3|+h0WnY)suxG3*8S3=G{8(4;_6&D$(s7!R zA7-v4m2tg^KlG`0Cu0s`OfhWhl;n45*m7w9&GY+7>|DkCJ`$eaMCAE>NT1(lV}3gt zZvt7G<8PASs*; zH+{dO5&zp48?F;H<)KQq`kgy@4-_rdhQ$hmwIpnKRxDDj;4NBhk7kpJ^bLGH16@+u 
zdB$cZ!ysv{-n{}jSWKp&Z98T@JyBDL+tGTv%QjOn|7KVb%vBmrg@$vJ()ulI+VFoWu=}mwQim|6ePS~kc0-1nX-r!&rUx*l z@3_)WIvU?bc!11TIlk2`N^oARK2Eg6z&<4(;W1VMIA%LRp@iOOuotGDKHj8E(9>A0 z1ig2|3c(2gjZW2@{>GsPx_6Ni7bu7)l@1PA=T2USI|IM2;1jvd^YnQMJCxaQVyw@V z*{~6o=nK2govnrtPI0N<=fd;awu?;BXRbh>L|5+?h{knsPxs&tg(+&m?wU-|YFEPs zm?Emk{~9eIInJ-BF8|B4=T6AR-aQF3$#)hUjs*0WVYJ=q0$Y0y8y`2; zbrqRFMo#dLUAhX5?W%4wo4c%cj>gBaujMtb2oJ7?PAqTVh1Wbmq&4rjSo5YKmsq#a zJH)!e!v9VZK6M)BBfVQ)DtfQ}JJgI~#yUuRLn$&44S?g86BpV?b(%XigFiZ*{x_YV z=@ay2^x2r82pplz>64VCJh)dVUohE7lfssnzSGf2zbv7&ucH~yrLvO`D=>H4H1Zwx zD1$qolddy+VZm@+e6G+RI1euDZ%3je$6?rm`A*o6rj>{+5O1b3odfQ=DpTX%kf$q) zAvDM=_6$9UL?Oq_^ev9Y0HA)gNat@eG>Q1zqHNo*493uczn;wBITC+8L-d;SX6e=znM;dhcv;ZVmmKq!Ao5@k}f{S8T%bDr4a>Oh{-IPM`#Tw4c{c*Bic^ zEuE?3L=+1QwhcGa!O+bJU3Uiu6$SeoDHZq5cAqh zGy@BXylL7VHka@bl=OolFIvUT>o3+*UWh|FLt03!H*A>A{m{2-LZ0yU{PWP)kOSSL z{U^LMnn<__utgVD3J2ODGLlCIoKOTkw5MpG&DeL9ejw;-oO2(`YosSwZ zuFbtMt_7$|tEOHA$1Ldu!c;1cd-^LmK%;K#>`e~{pm#zrB#5p87)YIhfG;GTkZ$et zH<(NqW9`I7dGew^tS@xmmoK@|4=**G?Cn^D`@Imz$r8sRyeN(1wUhg?&Nvo5CxGKp z0{0VmbK(t)(%#=#FVgniM&tXBE!73e4d3Zq`IH(oM{(>a}II10>cx73VJ!_P$}N) zShfC}q?>=Y?wcfiyr7n?Uvxvgu4B7gsJda#53bfwH@E#CSa|Ovt!>@aO~D$W?r^Jb zWvZLA9j{KEW`|l&8^3r!zwDsvR70rSI6Nw2sGswmSH@81=Z$VykD&jq#R%$ROON0Y z_cK@>V`21mj3V)Zc0O?D zhh}@)nS@uHnDxzwfwmCEi}k*F{d3J$mrjmH$@|xuRQpvO>YsVLfP_T1Vm-H=!Y?xfIT1prW1;q5{x7TVjD0I8` z4dO^xw#$`3ZTwB3hW-hvJ`P$wo;%x{wTbk-(zIVNb*Ar2{IWg6`nko8Lm@1qN?;1K zw;!B;1XgfP*8?l@VG)&e2c{uR;dG~W%snSJI2pR~zN3-u)8qySbj+Qo?Svw@f(WR% z)B%a)8j+iXo4;k`nUiPa(p@Apg{=^IMY=;oo)Q)oI@4geh|3cH)E>oHMyS#MS2!X{ zT&ANxT&~W)hWo_+3CJ+8oex2OGFLmmVJMr22lCc&u7SD#;%*Z4UOk2=kXhn?kF2`i0j=6t&g%t)>#|WIs9we(}0_Mfx&B z^CiMC;)wSriI2`_!tz7ZY0O!tuwZ@=)_WdLzMrcPXD+;5h>7m!Z_};;pP5(;NZ0ne zVCIE09;l`5Laqv@$KIwTldUj!1g;^-g<#gaWRA}Bx$iM!tqBYxr&I&H3CgRle(S;3 z9Vfw72G}Bguu8db^o)3duOG$v5VCEvYrjMjydEQ$MZQzvj1v37)K^dsIT&#ErJL3Y z47_-EXbcL6d8OS8e1PM{jz!xrcB^OL4=P!TN~om_VjIgL?(vG<3zl@Td*NG``lbcK za?>fgzI%cA63|H^CP}KTolT1qyCA`svD4@UAXqZiQNhQm 
zSEKXsDAH&QpF)L(J*0`lM}7(do_ImXHz+(lw1NlD=d)#PI@R0svA+`m92m_7Hs?{q zdePb>sP0NE8d$BOVUE5U|-en--M&DFJsV zUZ4vt(1jN=Q;6vut7 zZXEVdM2+>Bj#TIr(fP3x8<}3Yc;zN|=eePUv#Hf_D@ zQmp-VjYGu8e;MvHV)0Uoa9h2AcGdE!UNf9(Z3ntY+0TXm;XWR7BW6F9y1j8G(#f-? z(~=p`DAea$*|~%Km^Y4#?%qWjgw;{2LAny4mf98j`p^_4Q$A4n<51E!!BK2Qz?ZeD zvmLtc{2sa=f~uJEWqZf>Ys~{$VeIH-BzWsMJDiRKfz0~RiQ=?#18MsRj3saMF|XUS zO+fmgYyju5qdWK@mVWe&PA_xuIuP9?p|eLaFXHggH-|1J^w#>oSmNNhj*i>u1RD32 zqMUPLd!ns{lz0chwzN%~#4Ynl=llNATt}e5mJ4&k2fukR%Nz8wU;poDD4-aI2Zd|l zkQcb?+=p-Y_53~$$wT;4W65E1!Z%7F{eJl+wA@bnNn5*klMQdQ@gA%_OYFefvl1w^ ztAix`qyNYCH3@yU>g!W4TJ`lMsMY^XU)Pbf`-%Fx@Hq4}9fRIQU&A`y<+O=TeKGkj zb>osDLgDnnZu%xiBi=xv$nO%u}C!6A;-PO`qW5bz0Y* zAJe_WKsT&RBi-qHaUGH-Cf?15ZGp-N`${VypCaPeK)v?3lCJr`bOqfbaog=1-?^@2 z>$U# z=fGzBUF`zhZ-X166;N9gguyNshbLqqaDovVx&pQ#EIzA=_&ZEQzhe-28%7)=hrdhn zAza}9DR~bjWAPu#d;4GiXY$_rx1THT$G?HRug7c(%Uej<9juBoR&Kn1yYS1!(8(@t zT%EI}Z2|(sfh+M^lhQsxIHS1D+IS`DJT23Kdsd(n{T?Uod{X?H-WYnSJu8xT<9BuLWtfT227D(zf*J0~J_k-aB!oNn1G~-RVh+plv z0b_|(nPv^H>|nXl9?C>BaD4M1EOLbJnLrq42#Shx|J1^N10r0M0VSer@K6VqGi@j3 z{*cIGO}~iLBp7p@o8t5cm*B{ToZAG`xlQ1%PWTITm7?58%rFx(IPo$)F^>|bn~Bpo zu`d$oI#LsD!GZ%%zE5ANmrfL~FT4OwjB_n8*8CiWqQtO&K>`A;5BK3_R7@j}{*%N(jmxfIwTc}Lz>WcdcFC4 zdaYZ3A79<6mBIG0a`C2zR7Skt4Sqr&;FO5%67iaV@e};}?jW3^%U`r(@{VgIXj{8a z#Huh7+NB&^L48m$N7KWoGxSHUIK!shhDt+U0}1{RxK5l-FO);si66d3qqRYDY#H%` zE%=#?F-H$Womy!xMWY&NNPT}6i7AnWbef(B#!ny8t0__aghurRiUq+noCY-#JHiw4 zjxdgM9@OH|1;O3M&K<@>yUwrpDVN}wT4K3hc!$~fxRId-R>7Pb`(w2hh%lA>S=pnBj;CNdr zLYKmuC-`a1kN!Rw1<&av+QqE9xWp}4$Hv4uP$`f(A&!kCW}0Ck>5?q?NKk@}i7hY$ z1gaIhF0_&W-UV8PZ~EZ9krw)fCy0O~%&T^S+(2ZXP1(upu+4m=7gYiWJ7xi-N`Z`E zOd~(FX=h?EDDzUvj67sxALx(g;QfhYo0Z_piOc94?b0`W=$X>!qx~6EntzSr1eFTt zx$)+sDBYMy?=sMfT+{K57ToIJnMCiP*^g+Cyh%EPIj~bp;VrjFb*G;aFK%)vvaehJ z1V&K}F{EwK|mdiY8x;kNH9r6UEIGfQR!{% z-j3gA-ithGD~Nq6FQ1m1`$ zAxRVU*Rz#C-$gDM>^F^*Hr{8eU+Cv-IJ~0^OnlC!;hYVFPv>k%*3a2A+^fWu6`ZrV zC(}9GuH>`1#@r)a-#QKYzzAiM1wFrWp`Hf;rs=KYCfRDF))}_WH1A| zx*;%V!y)Dlq~`FQfEe 
z%oM!jhATfTZLSK|FJ=;mRBfT7=?DfXbXgCaw2~icP8XVHKQd$LXdLPHTdq2joR1=G zJmEhFZ?h}unDvgv51^ecyhWmX6n(1E?Dj+R~EOLYH=l*%a+~0#$=!>)+<~<0nkDDfDC0(Xn-2YR<2>X(Ai*>mLG$`SmXXtBhI6$JKjOtzl!AY4#dU!JrnqP$^Bb4t$%_ zY&wmoLi`h1yWtsP{AC6WU*USz$&z1tp!0*r+ikdCX3R1^s>Vt7u|0?q)AUkW4|Y@R z--+oZ9b@B%BGiMJUD;xGWiRe=O>n`fhOfFU9&n95x1J4VVQzgd=GN;uJhyJp=hh9F zTM*;)%vBeHJH#=9eZTC)!%=NIt*4lFHd15`c|OYTh8j`N+n`>Gdaq8SYa929yoi0$ z^KlY}bUs1hPk7q=!A+4yk)h#uH^(9`8BE9xrq;Haq70bfjo?Cin|OeSl_40omu-^4 zzz;ZNq=tB=@x76Ig`pz`pgeWdF>DC$O@ullC)Qwk!AW9C>+?sd|8&zZmD|-#L-;lS z=`0FN;-{8VIblyDP4Y%PojOF0JkJ3kNR*zbr;2&?~7axPljO zj8zDp6qs2wOq$8>tEdVVBQ|r@l_*D7sc;LX?;#BXGzwEQv<1Pr%vA#^jekagQa6W? zdLdF>4GZWJ=bbTVF+fWyUX2!R08!v92<-p2J8(O0B;h4a7rn$8Onn|Tpd~E{m2gPy zZD@w>zf%Y6VdwBocOwU40l<21YL{yWBFc1WY0=lvk=58Hkw&T5Yf^>th^xT(Y7#^q zYX+-{a5_i|>{`%qGZu8M>@Xw`Vg}{JycH$L^kYWz2qAGgSHnw3o1^hpFx%$xp}s6~ zA-AYPXd62v_v05bA*Ow_enFee@fP~P*=b1`@dPnk!G_!&^cseI5Sk%Fj+YG__uP$6 zw)IF#B14|5dU1RFUv1e4{-~d1wthM2*e4F96RWFZ(YJ8c8cx7OeusyJC&a~`xv{vO z`NhH$!bIKz#Lh_P;D+yza~|Zr@H>(74t^H-?!xnFmeJ~ftf4{2%!7zCgTpV-t)f_~ zwOlB0j^owO;dZ#YKgFU2Z*#}P9R3--pV_Q_fwv>dwhn6QoD6^ClG~1yCLdA{sN2<# z>@9eo0loqLjZV*8JbFMdpFE1p#XSZrJW2e`TMv(d912|e=zBA2vws!L;S!mPv$6cV z+9Mg6;h~4eLmyr3dN$F~Xa6&gVZ5x$1~Yaa1ZSA9qj-0Ql|B%xuN z1HX1b(t)uu)JoSV@fImvVf`s;^n~oBklC=P(ZNTFmDBuzO=!mS2k_fQUHk!d1zZhR zV!5^G()Ba9=7i)I^P-T!k+e47$Ce~u?m@NPW+%4Z}|^ukI-@tl?+X%J!LbM zKmG{1jdP#PO2gm9Uc+~zITf)jVA~W-?Y$Q><56h7F+zrvNUATRTGPic=AU7=rh^Bz zk0+@m%qy-D0;v`PX<>7Kd9{<%#a_wWkU5%GlhH(2h5M^X0dwO2n3jf8#_yuY{oP57 zJ@=&E-`8l5-;Dj-y}EH|b0pu(79f;?B;Af7T{j+35Bz+Quo0mTzSsBp!NQo(8<4CK z?+h093_aHc_X<5CVR2kPl-g#$QQ(hJUFIq}$q(D-v7bY{3wZgHjB?X$-$7)?xHuZu zqNcTUmxuZ`=nv@l6i~DrngCdqXH)FB@jNOMfIg5a^IOAMhGK}Q{!=@H3LXCz0K>!v zj@C`uS^s`_?5qmWIg`}ex?0U>S9;Fau!VA=27%38AiPR|9k)P2DArnRQ4|o z2L$fdbPTj(H)H~Ra`0%~O#8Wk4>AqM1)pjwn9Q%D^SZoKo4J&_cDhph6@H0n1u(z$ zA7cv7|A$0+9H{})ty{63dOCYYP#3ar=7x>|t-h=DTpeUKP>a=DTdJpqG(??o@`8R;dLLz^G2lNC_fKZhwo=DLl6C% zDTSas%1&>n=Qnr-lW*%>jNG`GiqoFZEHs8Yru29-9(h#{-3n!YgM6aUbUfk8DJ72~ 
zmLUU{m6)PWoQ)1+in94c0OcTuUUjDzSrcS&G2s0#x^ud-xPB)lcfZHc(}xbAbxavs zXwNY`VHVgzJMg4)+!yPF&T%(>WbTf^##s!A-NM~uojV=|dG1$Xf_E8={SSbR8$jhI zzYcv2j#vs`hdTHX^7=Zo5s!S^mbZ$b3n5g#WV?nX*N-6azJ@z+3Qi(lyj{aRJJc1X zgL^6SWdx}fQPuebY2-yieGDiD>l$K%GK&$riXb&91eZXNniLZ452z(!01%WNiL2Mz z52TyHF+=G2ka!+M&!6Mj2zahVaq+ydbE-i+Y7*Cna6?_sus(!MnUOg}Gej>KI!}6@ zBt7HuGklonuGCHU)8ItkNR86dd5WI4k-CslRa_tuTkc4+^Sj@R2F%%Maf&{obSOE; z{ci#9Q#=kfkkD~(qp@GwWlePY?G@I{I+J}UHXT{O9hm^iy1LAPvC$i0Rv%Lx~;6p8I z_S1qzhKYPiZ1{VAgc3+dro|e^fGMY4le>#2l6bcV(IV-Z9DbJrcT@J8x8&dqCm3-W z3fd%T>)8dMEBjq9r=8zS0*rh~?$qz_BjZ_V_Rt{A2H}@U_u=Y6)P6v5F(Ml~2k1ki zzz3ve{Fh?ZN2~iu)J?s8jweMn^%mnE1hsQ>_{4_qYl(}br@!P-e^vwbF)Snz!YogWBcvD2Yywiq>mq(@emn#$f8g9Xt+* ztZ26ob6|2V{3NY^xU11R_3m#){`a9phPOC0Cf>F7LLA3`K5-OlZlC<+;7Gso zwcbV7Eb*%{^rI*TwSNZrEJk~cWGDV76GW~fBaZU`1Z>WjJC^TP5g&n=hhXy1p=|09 z-srNiwx@g=eEUa$pTme^dIP#M+mGrn9H{7l{wJXd}s2s-B81L(~cfk;T zwuajYq-Ica?X3_bgw~Dh$p#Av?GS$W}ywD2?ftVxs6>{Qk2GT-B zq*JkEu+gvPu)srfp-%IN=8~wLK&tDs<_1}s8x^6sAzV03`}%P-cL2OA-&NgYV=|>s$C;$?ImjMz%`DpwVq^N!d*EpLapCXDU8R~$587#u!P(d z`hU}Cp9Yxe6`Uj0c^cDCSgL;mRahlOze7*e*HZPBs9sZ1t2mUPUyK`gEoFet=;FZt z@8&YRNRn--db(V7eCQHX#CjUxr0Ho<2ZeRUD;G4^`GVmMo_42G4|@@xF$2l4sV zqu{d*tBkh$H{AO_b3!Wjqf4#JEw!#eNo(&P(`)rpt@KteGkrcWeYdE7BIPjq{zQ?V)T1BM-*W$;85KCb|cjo9-_uztf?H?@Hf&>b;#Pa(T6hqqp+X622 z*8@fXne~^EFCf=1?kA1IK?uiZ5En!^UJwy;qS~&9?C4DGDP~Rd%QWb?7L5G{;$hT% zDHtZsB?Aic-K4-)D8a_WdSs#(_i&*AN3Zm!8W;TVYuc!qO#AmF3T6a!5CWoaEoX)nOF?6hcK{$IoU_{0o{+&*05s0W7oF9zoGN~ zAfm+d*6S&44$lU@nt_{GBrKdD)KiZj6b#!wZ7iEhN&%%h^d4-S%uh3&4w2XdKQ5i0 zhZl$W@iHyU&lM&=Svo&~j4pc9v5!LO)j4B`?UcK0Ag#Vrsw z>e>n@3aC+j@6V~a-8~2<-{0$b{&?o4^u2X!Id$sPIj2sYI@Kc8)Ez0E1XdJ2UuB25>By2B>=yj9VA42-xLqNsRSoUmsiVH#;DGy?V0<+szm;3gqh7(R|J z;4NO&7T~zUvJeri{tF!-9@5+Ezog1UUVf8)SGg}`&E@w~e%0?w!NjJ4+$0=_7vR_g zCiV&+3*Aek&xHwSs`xH;Awx8hq`1Ww!b4OY68*9>S^}x*Q~quJ8$A+Ug~|QnB9&G( z$ZV`%;Rnk%a$_5tlREdenZdl6@A~KmGtsVXfoNVy3vaO`Nn*ugk2{(aL|gRzkfHc6 zq|lE=c9Tu%`tFC&>7>nzD)ZvhfONwk-};!ja3KuXs 
z_^c_O+G;|-G;fJpdm>$C44gCAskSfS*<+ZfJi)WCr!YLbyLQ!xa7zx++c*yJeut@i z33~)Fd1L2=@_s(SW28xIn{W9e*DW^fFJXoh(+GHIbpTzM9{KI!hr{ECDvkZdy*;1WmA=W|j?hQOW#ki{ zS&U%nsJ0TX`|%&Jw(G2eJ*gqdmj8u~ORdumZ_HNfL;>;s({{;3i5|1VmSi12UsY`6 zE?qd&>$2jnj3u)hFC?KmIViKh^%HD(Xg5)o_+N&4^FvDCpv^)>GexvC>GY0|+FYGz zM^XUU!0Y!e!$g9-?%UyZ7tb=q2k$^BgP-C<6TG1wrlQ7N-QG~3Sm@rg7<(CZmkSbV zJ;=Ec+SKMN%qfw+h$5z4<#4Z%O}FrJvjcZQ!b| zk^D1Z3OV#EdeN_pmHO=;e#h~v*yG7XM^^BN9)F~)YWmeDekaW2Cp^01V>L;ZnwcFPwabNE)J+<(ihEOo2|EQp#@GjR6;J64Vr@;Aw49XWNuN$=C znp!G}`|=a|Vy-~%uq4spL^=Cvb<)}+O{z;Mo9rdgG zb5gO3vFM)JD}XeL)UY$D5;3O zt%9@6{*5y1X4n&0VirLjjhSh)MIPB(_2X7jK_H*`OpVO44OfYJVlKE`(ubi`u4Z_K3VPIA1PmlQihf|L!gZQkXHNB{y&eJkr$#zzdk^2ZRvPZG6<0c>AwC zX5J2Ly3*ZYOMao`dV4B=phkztZ|wjUL||0&xnECa%P*u$RQ35ns=^@jo7qg%DgiRu zI0fXlz`Y~^lk2tePWr=nnJUdf?hj|Y3U+_azmG!>yn^Cre1C(h)%&IKJNMn_{ZRkz zj$-^ZCE69d(ba{w^gsCQTi1Lt;$PrS_N|Zht>3kiZ@rAxrz97(YJ~I`?eTAvj=ZJ; z^`?&m6~3nF#hSM9HSOnXIwREd5ZDd<(CsL#R3>uk_(Z*(Q8Ufn-=gEUED*Y}m;YL@ zt#XGWY{UAr1djWGqu@_blK!_Zf$x|YzQY+o4_`$QzVi|H;CbR5YPs%-9eq!CD#GWs zipyV69+!Vo$P=4b;J)q45B23Q59MD0+*rSRMso~3=tAH26~(e2;l@MT-|}UT3T1Dk ztXn&lLfP_RY~lje(z#@WQbx>*$7*P?mb-l|^W~0{tNXH-@4-nzSZ?5c^BC)g!y1oS zSKNI&PUyT<1c$U}a_?&W%TsUnx~+s`ox@ZRHw=78@i@W|x6BQrb}53UN1z|78nFj= zOISu%xv4zm?+KamdEfafg-egPPvB;Bej5+=&;a%=P;6YrG+4p>ojB#BWcD#54y}SE z`wNBxd(m3v<`#kSSWTl2ZWBASWL9*u>&4Ps1`%#HL8WD{_YCZa(hWFr3Lm31^T<6g#xwiR28_8 z0v=_jiF`cD=2RHUUYn-6>$cy)kMy~aN{k>BH5SdEhSzAXVzbe};&Z-tod0|3`wh@i z)P>J_!BIWX!HZ_%5p6uhQxtvaGn^MPG`R-OT7wdiukpbPjbgSU3+cwmFcCqg0uO&S#ZRY_IA7jG# z1eSi$w(3xB;A2GKb=R^A@V)p@^Twn5c;C{wzNI_2^S$^tWE11}nSaN{W)+L=>WlUB z#m*1K7L*jbsR&>+IXv6=0M4ZyfKvfDJU=DS{*7|UbGx2b^BlK*iwFNrr7tZ>Zzv4N z-bI)uNlN8Z zWz7WuJJtAo)o9b}Qq}m{7lmxmbl;ER>PMyT>xKBgxNiRWXL?@Gx+f;t(h*%Qvao9R zQ?)}F4!^%Uy9}Wlw$}-<%7~yIWrkI_!#`39Q|QIL&j15N91?pT=C=Elz6}t!xVmHV zQ59U?v|Zbz$rR-TmIX19{laZ!ta(7^FU3s^cX6T4mKn+H-xdRjd&SE9&6XCrPk7;? 
zR@JLNmxpw@Bp8?D0r@M!{Sx6kp~WP!%hg@o7%ff~x=U9}EuSq#RIv@pF{iW;FRT1r zCW4t7Fdgf9kL##9m|X^)!F4Hx^|Zy|Tjv(gD!pD{udCcFdu^G8Uo?hnU9f{LM>Xgj zUGfhVOMCc2Jc_pa3^~nPmxh>dH-g62i?{63vb$TnN?~5(Wl~q#SV*XfQwehc9_K+T z`8RC5RG+&8R@{0q7qyowLl?I_;BH;$iyQi&e);PqHruRxsXLbT{$v`)!{L-jFL?mj$Kv zI#w}LAM%RaL-cRc`#P*^7%J~*Qry?Eu(G*}QS!sq4j`hUHy}qeWG5m=PlS5ZhmFm> zGtT$vScbHYi7lH=Hey%u(fm!mqK95@EZEN)^6lJ2 z7bi$(dM!Nw%&t)1jzF9z!vMu$ONZ-BwM3kv6OATw*zW`Jc|-pE^VH8$;{6$~7Wc># z@7cU!xQI#noW~ms)vMfZt)v<&jJ~^Qv@JKCg)(O3OHM!6LDKy#$dT=*x!0Kxp{15< z@;A6={JWZJxjKKndzAP5AKYNTHza1@DJ$H*@M4kH?wS+@=EJ;65s8?)$Etq`%%4>}QLVl#8dAwm7q_MIWqbW<5ox*Ivm)NyHs?`c zW2Olb!kjUJ|3_95>uE@(dZIQlSmbx|04~b!;UDJZ_jez_SOuuIDg^j>kc1l+UcN)! z?prV_z8F4JnK#^_>hQC1osXCHZm=v^@GH#z#<4K+oMz%+h)hb&-Hjv9^O{DRdZBJE za8fc6&t=!@r|wYe$tw4UfKfqSh;j*cUd3HG9_Cu)qF|m)&X~Vg`#Od$W&&TudYwDU zyno=Tz?bAE5~A@{xU1-vfZByUf3S9x@j)&t;WD1*+B%hnL=(W+g2%mso&N}D5gy!t?<_3en?=%u$(I6Vd)t^>?3TyY_i_R z!13U^s|-i%Pf)k@w%B<$mT&pLd*Hs1c<{u~SBEHA)qOEpJKZY=Rb#Te?09$cEWwJUQ$(K1>}G*V&ipR} zm81f1@xjl%e&&wZM2#)?a;%L_Qmn!Jo4*%1+$pZf&8|Jhhr$2n0d|HA3iRaZq3;q?;1z_Ji2j6UMYy4ZZ z?e%yy)5iVrr=mxHYa2R(U8ZRCt-=R(F8s6#r?Twkoj}Nm#mzlai!hB!BJQ8RH<@`d zkMq=-mHEiZd-%~ObXP@|Xex+WN9JU0FI|JsIAs+&)B@1dv>(M7vLxpDp7Se7rT z?1OfeH09j@5^QN7L6_11zj-U@e##2`Dh_(i{wpQ7`kE8M6;TB059=a*xlJuwj1M4CPT@2T3y z&{Dc2zV)CJH#MH@fc!7V6Tg8?jHhh{yaWi}Jf6kE|LgJeE|kRc&~>4@{~G-x+b|9( zz$Iv@b*ptvL-yp+M`w3-v}}{FjrCKNuV3B#LbCq!N{#=b(e5q{2c-?uR1ox3smv`M`In2r zMs@?EN7EJ_R-Pcx>^n2r~pax5{IXh2`M_`B#HOCEP+-%oXA>{BEMZI*_n>^AhN-1szfTtjC zOV*&$<<3$xVyrDI+}zLf&b`3e^v%^>a*tYC=Pss3WotrhxZKiosgK*?CnU264Y{78 z`g7&K{YtZv?y~GbBa)wDyc!;DtKspdlaCAy`Z%KSafE$5D||$*$}=)9i9&_tV;CQ$ zSU#b}Os-e|jDG5V@+A$D8Udk(MGKWPJOrC)eT#2u{k9=-(Lo3Av+WI%}qWs z;QsCPA8{Ut-g_1MU(lw%bTG5V2k^A32miojS8Efqxc+-dqJ2A`Ezv$$5Aho0kx}Ax zi@MeD1IInAMlKE*B~HOTX^^c_c=<^+|NI;a$e?OvOXa$KpG(6VNbI?2P;Vvisn&io zqZ|0dYya??s!sGA zg^gtmF*9I%!!w@*zPM&k;+Lo>`q^4AzYfVK_+KIP@sv|Kq@0Sosdr8sk>NBsLLJwj zTuFVdR{Og1&-$Sh>xfdU3rcBF9!e!uY6O*>Q_VlWtkpN`58sIT*GLFFQmjX$5D$;Y 
ze>^m$pz=~LwR7l-(vl_KY-fhFYNc_=kBO(ax~H8g(Z#xjWPfR{K2@dK=8xiva|}2M`r7(cy zydU;=VbBps0ItA1AHiRj@TPz1w8n!L}xGS+^QF6nu=Az(Q#~A;9Z8AK-PZJL^YM zT7IcLcN7h{xjP+&V^SDTf=o$E>FklF~e}KZn676_`T zSGn6y7n^xsZVR^T{S_E$%$-!LAlb1kYKU&e8IDDHZSz6N%tr`B#9MYyZF1cjChb4i zv=w!6bL>vZ3?h^EpsTele85)6(Xz(+SDLp}#B}%0NIIDLF*4$3G8qsvU02V{n?g>E z8TH8wHz!iKt-ee1*2iSFo{!3ypNxK@-&TGbbLYS>hS^-=>Os5lL-H~?CqJ;c+`Wr_ z=sIAe!Ur94XVG3t3y;i@-fn+{iT|*`y$)9uS3eUfr(VO~$#$`Bd{MHwv}(29JDI%* zhsMK8B(Rj!4mS)`4c8=+hOj{4u+>tw+hXwhlJG0lfNcvR0WVHwk3rl#N8~=GXLr@Y z8&n-r8?*~94PM;*&nLtiT!Id8i*jRb9H%=}4+!PrO(J>chE{StLn6HmvpShM0jPcm z+D?GVe+c{Gf=<@_7I8VXXlR8B4q7<05}p7XA{#sm@rTL7|b|!jdwu3Xs6;;|eELg#NI@zN$CTuI&lNwlr+= z6aknVZ~`ZUyP$A_Osx)dGaiJpr1@djC}sng6=3Y{=#9cWvIV20;G>W!;z~Cio5tZ0 z9R0x5SM((q@lY81QB~1u3ez6_N)eWG;fwsh(@&|0?L;lChZD2P$Fg6S?r8ol z8(Gaf3U4x_#dyPCQ+O-A_p`9RYRv6hPZ43_U!V#sJ@SM^gnQej>u4rLLj~o%E1`_j zN~t6P8QM^#-$f18x`dlo8-9Kpl$_|VORtga_%eD@T6?~M}~5GOO=ZpV#@DcIne;abtZ z8KEF=z0c+zbfQSMi4fgDA>6;1=BvZOtZwaT+aR%x`JH>5nlvx4lOCkPRA}9Vpxn>BDaFH zG(D~%5Et2oDm69>R06-LZ2~#jh0DtxuOM4vChy)mfz^4sG5Z~V5ydOK!h)kQP@odk zfSVfT_m;QdUfOb1GxRdhpR@^Y~6|e*g|usAjLGOoCn*~P3o^%7x3a< z$2!uTM@QgH>$HR~-YxL5x=Z;@-YxJ|^H|+2u#KB(1e!=+M{eVY;vY9!(aqd3fXVSx z8L2i`*$P{Hx#g#xuWs&!fs9Rd@!EP#sc|J!>fdVB2deILf=oTYWYDr8Uaz^W zmIym`2C(TC9;t21I6~ZA+@-m7m-xOm(8oFjF|`7%g|(o+k6t&>T+1906)ZE$B6^`; z;!L~E6UlW9q-1~pL9_P;E;0U9#;o0C>U{1rEq-*v%V}tJ=J?+{SgHdr-s!)I2Y)vu ze13#JjVw7J<|?rL#QoteY4Jff^?1#NaQ9qmL2!bj#k`fgz@#o~sNU*L13nXv_3E67(Z0wi8v4p)V;vK2qc?w3US z=`Rflbr*D7BNbOeHUN#dc6tet>X_FYOc%slQ}m@cKlbbWJPI?g+JwrLjp$ zjkZg3{)4UjQN-Qh@NOx9O{N0d(ir`vpX;uI7<9x|RB}{uz=PIABHevgC9&dOSdqjN zRXI`;U&H-w#cAv{Gbp1|^>4Nu6Wt4>K`;CZ_U+Ot#}%s_)$?mrvO$INGmneZ-b|!N zQA@b@@DO+IM)h#6CkCxyQ}y7`ZP$z?(wK;{Z>+f9J1?ka&>jLsKr(8L$G8EU)aD7*Z}qm=`WEE@}}aSO@Q)KI3u$f zh>a4R9{6ip{?(R?btdT)^6VkhD2u2G70gF!O~1~PYZuOwqN7mOFOmyOXXEd!UtF9> zU)~LY{l%q;^d9AU*f-IBnf#F;T=bG8BiG0nu#TusW_QDxo?0*3i|=6wMf^8;M9`yW zKBMCr_y1AIQ!Jdh@+)sv=DNxHVJa;(@OcKx#R3`7l?FbNnHhGbMZKtU4G1@@>hnGL 
z%3%0hAvY5#Y;@esExd6h3Y6_BVWBOER8JMYFo02qXb5dgw`OlLgQ~2VKr&NFKsTFT z{kgd@0pDKjTh|xmwBKUu`_OtLs6g`oinvY-Z*}YKYIsHno6>9QO_I*A{#EXeBA-C| zI=xElg`YN(l%4A77T(Zg^*vUzT)_O%q*j;`#aZCbPTG_ZQLZ{hRXi5jOZ$jAn4Zx8 z!;5k~6>@oi(>@?poU77WCD$Dx*G9d)v(h8Oa10zV?*>QF#yi_PFqUvNT%!Gzw zPqza_X!i@tAk+3{DGfk5MJ2K`3Y;NAQoOS_hXpM5Qg_A z&vB8Sy*qwYr~0f+)h|p;S_l=`F~aSMP$oO0__4S? zOD5_{l`Txwf0#%=roelBKJ2?BmY>MX#$M>gTUYXQVlP$}Cstx5mYdoJo4;|v_VFrYV)-lucXF)p=fjOr0Vz*=*M+UbGC>fOLuwr)gdcBGbDF57wjNnDL{k9%{2GNN0C8lSnIKjqH zal+}UjGJ#z?>D$!A^^z46LGIumK>j3PP#uCePBRxkD-xkU2PQ6`o#hMMv*~lVQUR7 zY_QR<(~|?b)g;pI1H6s}flv)?pT(FH%k9&|{ky7PEQ#kVFHdIsb;Tx-NN=Y^fYrEb z%FUI`o$ZHBC+X8Q;-q#gf8xK1)fflS-SZDV{~>4{#yAbM=mZE9{Q?b5{$)h`Ed?ga zIyL~rl|&Qmynp7Iw&EeJlgLmAJXy1dE!}d?K z?aYQ40_3NAGH@9y=nhkX8r&u*-2nJ*Pcz<_E z?DK(t17-B^j1c?$jLF>x&E5j+g%DP6UhSvn*=Q#*#rZS9f!4w?;O{X+3M+2YHv}FZ zZ9P@ch{Bvw7O!Sin#*A=9oc7KDGIbw*(MZ=mT8>_bd!KUFJ$M02SI$HZ;Qo^paAhdNuFI4&> zcYzJW6*a9Yi&j^q`8Y;->Yq^rs4KkOrOk2VfWxuUko+bn;O=XGuQLC12QU=&K>cHe!a8iGlPV^x!mYFw+&Fpk=?pyI-(rTV6k0K<`8$fX+K zlT^MG!6UrCmH<%1phZNfm9EndUH%5 zXqo>d2Y6SqYqK2S`-A+~0tfIKYhm{9#CHv(M({>lw{#k{lOz{`$X-2ZS813*LkX98 z{eq^4+wFVn-09y~SQ$qZ~lH3Q?4Qf3cLi;06tpktELlM}o?Bz(k#IPsY_faAS5xzH| zc8R>ZM0=9x`HYvTlBeliwBnKmf`DaDE788C63{1>TZMI7qf@*2$YNyQ&@xkCGV>DY z%htnEFa1PfS8`DrSJ&pPyi82@(%54;*M5u_@K8+A0JS3%R3vnItD3&aq+cTfTLc%A zR6&2qEVUpTwr>0cwjlA`0cbwX#v1sTkF)U+-Av5dVzOV-H)hr=<0xv3`+|RH?}AW~ z&~4(Q`>kmin-eZ?94-ihjuLA;G9NUr(T5Td^8h)c(@8aaXdncn+BULeg|_&g!fPlJ z*9kAGh;CX{%8DOx|=yxksYb(_OzI^Tp?6gn#Iy#!o*Mj{$?rVrJKggUo z-1o%S0d_exMBA+^$#1dCrHq|9(8wIMah)4`1lcz`NA}mkyz?9xO8u<^JEU(xKAyrGcQX3MET3PEqr4p!?`3PXi`mvN%7P@I>nUjTskLviA$ zZC-^NEHJQ#Q3!PD6VA$;`=CWv}d3HR-XU!eH590M#sVd0-rDf^3=p~VU^1tCFFXJ7NvkW z=q+L#XTuGmxk@V>F(}a27IZaQS)%8B4V4E;X^IgTV-N46R>loMvIhI?fk#bW z8K+mevo%W*=t|L=hH>h!LNPDk2o?96HNbTka{ELUwL%^%>TKPaZvrY;O}0Nu zpQ3U;7~fzO??aZ47I4pkP>)~In+Sq*r!DfLhvB_G9Up-i-lhrtaX-Qg(|Mx-`B0zn zi)3zU%`^aYD@E1xTrbfnF?AtlZH``gp>O=NIl zJy4IejaMKWI#y;*$kq1fWj#qYxV 
za@C)!J2k@(P1gfi^eeC?{Ts%@RaYz*;P=s1##ma(PX2=Y+LLmwn+ko&a2Dw4F>fg( zUx)Hzxmm2=5(A?<;(4KqZysaXB*V^1xLFQFP;Q!))7Vl-FZy!_XjRKs*{_xeL8FFD}xx^oK3GN4Jsc=Pe)3#egucj1P%46pCfx z@lV+qp1{btizK1cmZCJGad8audCglJ6gZe#FoQ;8{rXgHc$t|rMw94G?e#AtCLPkX zOBX3T)snEM#I|k54;L<%~0K4%Leea_p-vRH1(es-l8lEs38orT? zX>mL?l%g6fdn=5R22VbS{9J)3C`*TO;oyq0F3}x`Skq%nkDMB4c%@q-@?~65=BX^m zM^tZSzEMpw8Wwq+CB4!pnGa8JpCh{@46%2aknC)4OS_<*}Xj6)0kYivEc z%6&$Hr-VfM0p?s_oQJ?TaQR$V3J?6pU7aMl}Ip{tQjxF ztaFV3Ax3sHpTgSs*AN23zk(b>#xMM9KD?gFD=@SGLo~_?IkU5|9Lct}3JN@IJ`&6q3;pw_7xDave~$EguYbOkzPQ`{^9xG9hG(-2nYJFS z5_ifCoTX0}^GW8dT-}vGP3rrk;stX@Z734mXVYci4b9~KF@U27Ld*H~Ld z$B^H*w5S*}C*cmJlzW(wiw3-_3O6Ssb}WS;W;v|l+s9F_MTrOTKhdb|hB^~>3EJ|m z3&}n!Wr;;ju|It<4Y8Z@BFI;RCb!h7jciVLSiV(mSAjnRGK*`qv%~l|<^Q#Lc+Xbh zZ>W4|_+Aa+YYZzB={__ejM}7?!m~GdBHeo6>HCg;g4^i30)Z{YD**+a@BAZuA-HGV zanF&UY2*F~_4#t=7RxQ3qjJ4;GZ2~gw#=*CZ^-(!zTq0P$}N^E5VvEq5%Eyvw#3L8 z{YHX>JTu4><#_Q`gox#G_}MQ;kE$50j>h_;y{L;ve49!@lc9lJ*7rD90TDZgTwPQL zxhhNRvKQ60<{$yI%$+;f*J%{^IK27mTjYI;WU*KcDpopLJZwx|pm1Oz&3~9fe?4N_40sG?diD zXv4q~>DNA2LLfYi_=^m6G@sPCqq5h;wS6akz*_Gs**wwymWq_?n~C8I|1r#vN0 zU<$97Ueaf_YeG%mML#F68h%wk>IIrOSR|}`&zjlP!T{PXiP_yG26p@x48oAgH z!;aEzF=|Hh0eDH;A-)SawD)N&*iQ#l$&H|GeMQSdjoAZI1J3W#a<48l#SEW46Y+IgP4%GG z!)gvlOzlXO&A0gsO6@d(<|F>JyFniEXMM<-aOmcb2cT$C8o`j-V$yOxG-fM@bEYaS zx*MLweHu@6Tr1H3e?<>*0eFLXgp+L}1(8ih@aB$$pc=_kDP8*8qzumWxejPiG zTc@%Y;pFj^+jW2RsNRAZZFE~+Qf>FG=e2Cm+IFpl*i$76BL3-%MLo?ZliZoV)Co`o1e@;I{ zpas|7BQ&H*ozg)Wa1YVVB(k>BMpnZ*q_-c|wT8Gv`vw{_O`VbBc#7go$w}C}dVZmA znNOm__%)0Orn622#K^w9#YegH2oW5QD2$K|wyc}(v#WbVasTeqf#A3PX3Av7w7 zt+@uUsE+pMCI2_EcGl++J_IJB;M7rCBo5p=k|P?x2Qi5)_3~5ojxYaa;zU0QMWmMQ zqSW`4dV&$~I`^txexsU-MQH>#y%*$*1tne^bgg;>H~W)hFxcCE9zNQSLyu>$4>AMcX2K3X1Hx=0S=qy4ZYigWD<&`D^=pZi$07v=fu<%6(C-It{Eu z`>`Ni$CI&#E$BllA+Qem>}n66EZUo=uv?{uSkxB;;N3x|2CZPW28 z`ZYXzur@-El+ZAF`|V!zC8_g9-9#Y!(35z7q7&9yj)cCGMQqLU$C0iSb$M$Klx_5` zlVZ}B;ZkS0Tfr&Tl2eCg4$fy?U*3%Twnh*Cv13RF@RollrWc!EA^z9@?!4(C#yyJX zO}hg7e|g@t;(veMw0RHp;s4rs(-z3fjet7~AJT9>*522hXw9jF%&r0 
z8oJATH2ekqm-r1PQa7Rh-t6^X4!`RF=yJknz%oq#Ej?n&^>4kCnOBn8;T&9iHj!Rw z?7`-`tS7^L1Pc{L8+h{xX>IOq@tEAjBkcfEAa?jLvE?`*+Idp*UTv%=dsLIAZFMqx z!U%X%gZrTrqMQyOWL{3<4=q20f$yw9?X;K%BN_jwUm`lWj+hz?YP>K1kmx z)$U%EVUOv_yx!*gN5c6F+=g+W!Mx!u->prj9D$!5+Yj4Jz@B$@X2j|6kjr@laNO5m z8}9edKu|D{YKuyLH&uggmJK6@XYXs}O>?*$ZD0es?HN_Wvtz4?1JIJlA7J*-Hk}Rm zXEPTV`4clMEmWLykM0U|{$tDA)HX`h>?Xsp8_irwWiKa%qYpk#&6ngR2?=2;^Vr_f zO&y7kPX+?1BCqozrnT*G4ghM}c{yT{+L)l)A-GPn+VD+N=PGj#iw4(xQi|oMx0$q5 zx`6KtjJoI7Dv3$2sJ8^bL0K$4XZnDWJD)S{+mU}&ipER3Fv)LoUuot` z2Ha0pZv}XiPKI@B^?f|%deslpPu;JGV!X8qmm*B4jLx`--d<2HVq=ybz>TmnS)+P=8j6*5#8s>@16oEOsrD3$3p;A~Pb|~HRs1Th6QL6E-?4?TPHo3sO|%aqgOX7kd|#97OQ^34IbiOtm#HbE=y@%D@LZ>_ z%{9rEcm1t5?FyBuaejLz#NJ1|6#D~vqd$wApH9I#^V@nmKzf)))RjvRI;8P zeXTp3oMmst4_nw*Gz$x}+50+hS)wqIYlyp~(Hq7@*1=909&-H{!e)H*2_@9mXjGU& z`uBH_$|?gQ*{!jtPsb)Xk6*zjeFvN-MPE2g_4u1kh@rtI^%jYC^;HJh_jjb@@9<*% z)3#Yvbl*mJ_@Uw9k2a*8@$mbA2Y-KgKt+dWx!$8>6WFkAGzcu9B;HQ)8)0In{{bsk z)U>LB0%F2BO>omsm_U=ZfBL&yF+C||l(5V2qvi3%a05~L4Xc(HWEb-l_VC3GLp>7h z4}b;tOfS$geItCFZ1}ieG>ZT{wjxicFRwqx^nP^#FCD-d@UoOgwX38};$@UJ!;5&C zru^(e*FxVL;bq}Q)G`!mxuV7j7+$u9V6O1qD)mod8UPFAuz&iyN$Z4{wBM`tWOIa2 zQ)Q@OQ_uUk$R{Fr;Sl}IYKT$u7ole64xr{Pp5SXRY)H)0$%A5wbmh2``9H0I4XUh7dZ z+v>NOG*~F9Hk4EgN|p2|F(5ojs{MR|S{@ZjZr&c0+;1oe^iYDA`-ILZYCx#zJdds_ z;73Ra`01if(z22^i64Wu6MmMfdhpZmpYX$_EWq)gcm68Dk5dIP$*?#X%$aZSbK<8S zKdY^Nn@xtFTEkDRpj1hZpXJKr@q;;4{098!0Is`qJMg0exP|#*`0?|}@Z-Dm4fqjK z0)FtvF#ITK6Zk=dO6BRVZW`R6Uq#$l_(F8Jbge1U8C*|Zw}y`z9S^OF=A&j{c&mzD z7tE(Hrm?c{{rDkP5t2E-A>giJ4W$viSZw(&y587>ZDJIrq~AG?f0El z7FgCxRu=udbsCP`X=U+TB7G871}nm5R~A1&uxxT=G3r08EMn)Gu(my?G2H%o%{wX`7Os9#y2VavlpM5RA>sTL||HXXe9qkOsDx4ah( zAm^hu`qc%iii3V3-Qz1P5?X46^~Ii+I124-WUV{o2~}0}5s?D;IQDZ@SiX~gncvmO;thdl_ zaCofs--2y4T9K_U8V%E{MYv*)Lq#xv^ z?Ozp%_7{1Oi%o4=%Z`azN7i<@t{m+n`VG4?p;`u3WjCC_Obxg~mWjzGW{rURV4Vz` z>Fnn$exJUkmjHNnYm~+mtHiCJs~~T(%!5Q+DhmhJ=QCk`slLqlYztUgVgg_w|S+Joi0ikFQ$Wt!vkoJv`8f_8*hX;T=I84dL12#l4;L6KCAGRHmab zY)LhYPO>GXttjRU0iOW=SeD|?H77$HNuUmKB;zfqV2?`7YMTN?iyh4nGvJ6sdV>JW 
z4Ls&2knMYW#O*4YB6Wv2lhK=OUwOS$aV8fw&u$!WHgP6XxpAZ2pcxxBthHJhj~Ub! z_oxl777y6kBWt@R+RwC<Sf4Mb&KhgEGQbcBzgRN@u!jng|&+--Vu;AA!lZ5g6Ocy zGzznnZ|;_A>%l7EWTIdVixuWX`xlTciLwkPbBCe*0B^E>ee3n#4Z^e(Loat!oxH6Uve$O=r1%zsYUpx``yT`xFNq ze&iNpg<_)(iO`W+%d%zJi9Y&4AKJee>51g&M0#vDp~*tJa)hiiq5LzRkF1bm4bimz z;FHWa1hrYwTfk1Th@E&jaiUfUrW+rRKC}Z!(jk3%2B`*>`MeK>DG7bAGMiplP?=es zX*r>KFeg}5Cf*^TKZ=Ahueck(1q!e8D1?4I{=9rgFZ|j%YFf~HxOz8WaGYl4$mSoq z@xSea!*QF!Ars7SIIe)hXNow4!)%PhDM6tr#o=i2iJ1_WXt{vH(VM}cPF&J)rpn0$ z9F8jBkmUZZ_Xi$_m^DmM8q-O)W;r4ps-BIt>M5Yj{b)-NIFY9?uT5~YZ;eMvv?u=r zexcZpl~GTYww^AqL{&xqE`Be?d#hRlQz%v2g!KtQWtic@{Yrft7Zj?=!UE_Y*L;TS znkhlQUn^~D1c?8~2u$^UuEy;kskvM)wVJKpixf4Yv z+K;j(!bb+Ax;P+HYg(-zfv)+|-%Wkrk4ckwfUN^dG<#*&m`n}Yxsi6z>>+L4B_`7D z=)oA&+Q^`1;4i5q&+*0@FnQpay9vF%xDFI<1DdZh@H|eNf9$TC)Ctr2x+zTe3z)Xj zF{ZD8?FCHlunA1_O=bhs6f2e6=7T?pcfaMDndi}`)|Xb=BcI8Y-F*Dw=zOXT<0QGl zSzX!hBwJRRY`xI-8wQn}VMr^avd8C{!8dfcM(Pt|OUR#yuIjnMc zjxc_iC8_N8gcOhWcS_@m_aLr#BK;IF4F{55@L@A9(&Umjw0MukOqsbcxTCw%p|-oF zx|b2FZBXl3HMOe<4nkDtgIOKchC{jWXFWo_5dChG^NB3-8 z?mU)OCFO2bxskTq@_gnI9nC+d`8@_ZxL;vKwQDH%-`c^u zM|&xI=ILl>;A_lSH>2HocK0%d@#1jj_@P?5+MQb~>jw7$iz#iwzhjH4Rqi}d`$nsn z^+SA~j*urw9#uw1wI)3uwNF*{4ar)72B+Hv`NNXA;4U{orFoHLbuHm#x?VF8=T?ElgR%%L=e75*!2s~FpT&)Z_%*@))g@h3h+(RZ0oa9 zi>~ksxtxeb51*8ov`v|@pR~wvzHcT}ei@(i$XSPu) z6f}Q|dj{2+Ant|v#RrhZw`hD>xUVl>pJV^ed_i2Y)pG zybT)xXkqhjI8sELf5YoPZP_mW?O4BRTR_Un=H2lu!zXD&b7B7v*RGWR__h5%NO$A? 
zKjr?~{vV0O{Xbsh=2Ow7d3ym~VWDgb^rL;i-Vl)X0jDs#Z43;rpU@qVaeOnB@i=C( z*%NZ?2lm}pI3-%MHiG&*>6iF0!-*i&!YZXNr;64yYriB1xmv3FeyBC%&bI$aq?xu3 zjC)#OpSKNx3s$$}ler_kpMVA7&RtSo=Kc*)6N^u7TZ#0i!f>veBcZDGm|6z8vulc0 z`OK73M++S5^7u9J_d%|ED#dK4Qq{&309x1dQi z_bRzOC*IGPhjHTUANZQ{*4seAKg&xz{*)8rRqkNM_lH+Y$t82gc_buI8=KcAvO z2p%XM#a-dR&qtwSwFRYYi>`;tpuF0TOy!2KAK4>1gzWB62GF~UOw;&edzIZrFSB*W z>xTtEwtUJob(?*#Ob<7si)T~u=4H3zq5d8T3ev~znNs4cuaF6Tw{!&G^4k{qT4D2Y zSj^WB3J=K~GfA{jl|L*bsiJ$0yED_=C$dCAURCaWvq`OTzfvQbDe>-QSwoy(oF2o(u%Coz1 z1f|#znB8^u0cLmK|AY~Xhw$*j#r@91;mC80(>2DT3PbXIkW|Vs{;Ux!yz2OTBjZK#aFmgH7_lI zk$D<@x{Odo3MM$`a}0Sej_KBzJ!omNez%&IR^c#NVp6)m2mvM!pBdT7sFW5I^!JBV ziCH`B0+`W>a2#_($(>eH>jjTIxZ}W3J7`Ki3p|{0El2xpRrw@CR4w1Xd2AX$%=2$j zH*`*p0Gh|@S+aYgaRTLx!|2wUliJG9s7*}Xmj2yp!sZATGzgNSxW4lripHFJB1o=4JBgcdhIbV(Eq`nqUvTL zBUJb|CDlF>*MC>6ua)vI?VltwpQ5F`C@l>gr*klEMp`$Yq~+Dr$khOi^@k+VTkzr* zj*Yn({uYf`*jc#dVqgDPG$SifLi#7;^9#%|`4}|Tf0Rgzl?C@=uBzI!)7DxB6W{qx z3Y=+)qc^*?LVHbW7zOTR6u7Ys?l$9xMFr0InL;)470}S=l4T!BK1705c z5M5J){#%iaI%06?0KbT82bxZ0AD8lzlJe7UH@!>w>A7}@Ox*LBTh)st{#<`H$Mz1g$1|UsUuLDhJ~2URMk4(-|FUYUf3f9N?XAq!KhLK! 
z2XQ&fd3eb)eYr%0an0}N^$flwb3-aq@x-<}2n0&=LCOsropqynAeU~094%p6Q^X?k z&liQTb%72heHWUK6Zo>T*1wq0op0!pT!m7<=pyWHbVjMei>YJZZ{DNtlKhfleaaP~ z!NluS>;yzOZ&!p&3w{^kHWa$yJ87GzI=mrO>a&+Lu~thx8rx7Lb8rzoVEWWBbKV-B z`DN0$ZK*yrwxKa|(90O|QMtWuGdqNQn|XWQ$1t* z(YmiTxW8h`is!O?!fqbMESzaGewBMngboL2C1;!|+^f#QYK5EprDsG{?s-$dIKQm5 z4JD4H1huV7Qrl0dEiV0f{IG3!Fj~*|>_oNqf6w!~Ux&)ds4VD5sm!{*YYUe|5UXFa z3u8RT7_!~=)sjdy>2ahYOzOb^9*U@snBVP;5yG|QGHvQDCa3<)IL7-@|Jo(q;N!YF zHaM>P7WHMzcfN+7T;1_0z4doT3eRy^4h&RktwfMk?yGEwD5B=`rGzFuy=CUXQL5sG zV?auG8U=6v0whNPN&DH_@X4M=C|p_d0Jx(B;F&xpvu&;Vp=gt}!;0XP#6!oNo_Ofo z(=fSeo!RkzkLeAG^xnwJ*3a=RaLKmCgsznWY)q>jKh!b35M$NJ1G%_EQE0XKw|Waz z4@lNFTbfOpalkN&U;XKMbjkC$lIPa&OqR(d$y18S*OesCjGq}qEnlstAx#LGrZgD` zXXs(hyLe`C-Uae#WE=#Jmd1Gdgzw2nB8;7fx-F1{I`#`2hH7|T7divHBYgZlAcCm^QrWZA9 zL1N-IdTCp5L~_oD-D5B>=8$V+_Rbo0y^*$55YfYituTOTxv|g3!|Ok3-c@e*|9(4*D@Er!VcZ5+a2Tckh>vz%eF>KXqpRm6sj>6vv3IV9$)qL;JFG24$280ci zw`Jzw_@V*JL}mkSs)vTW0*>=k?60UQI zO@io`BxNTvn88IzfN8YviP~pL+pYA0K-ZM5})V0XnlI;rky3X;d_6Jy`syj_IGsAmmP?eZYBJC9R zu>R9Td&`ai%0uK%g6wYMI}bj{Xw;twl-{0nOm^@J*-OB5r(rA%-!s82L=I?F)Dr;MTv1BuY`*Skd;3}kSad0y)*zjstVT@aKPp(^%+-Jvz z^oC?&FpRQ$s%?Im4Rw1j#Jh-=Q3-CUFbdn~OgwZ{6Q=<+@RM$!$BhF}vJw-H??Ioz ztL+(q*IkpjP!KL#>l`P7qT`0cmiW!Xd$Z3}Rm^o6zA8)ei86v^P`<*ocOupI5C{Cc}#mSpUr?eF$?~3*#cybfpXX#qU z(-e2N&>`ZvW-5H7GB=c2=Gk}jPd=JnpBu6vy@V;Tz-tIn4iXbzWCtvnE%JtIc#oFR%9Ld8F&js> z&#olRrWQZnlewuLyL=x-gYL{j`TpI)cP-GTaeg;!zIE9gTtENZ!{x>(^HwzJ>vw1B zeY?VY3`gIm-1Y2`corTFqV(qA=?^@wv)LHFYthz2POaH7qag{FGG8_KbkEU>zI^zp zzFa)oUGj_`nwi+!cIOP(Mg7nnKeh>koKre=6w`y`%Igm_DKojU8+S4w88W zEl=#_1O%JHmWsW=uq90jqO-trKZAP<0A=!W6Q0YIv-g>n>!O6|D^8pO|&bDIHNaSbt*H9EHuV zOtg)s-{F1%ZH~P7R%)M((dQenKX>`e)`?*0z74oqg5aGN1Tqm*!Uj zyNkIWd@S&-!2d{i&=?rr4w^ZE->FB_c-1WSGru#t6UJs+Z#+Bv8vFCiR-wJ7s2awk zgY1d)7dxrawDD)tyxSIzLKa@9G)*L)r|EgLf1a-AasGLx_?@2h7b~lYo)^rf4>*4C zA<^@)Csn{8&kS9v=jkd;!PZca$I0PQ$ttEl9@TyQDOokxb7tsr)iRTJrK`4(?vy?| zr0aV~U#;{Gr3Vo&JVg&O&~0zwUAQ%ywb4384VShg@leTV`g=^4a^fn`sKo1C?-doQ 
z`n{+Y&RfW0!O)azdsJD|uEf_yV5$wGb_6(|Y_o%8R`W^92T2YgN`kc<(Yd|P-W7L; z8u%u*Q*B*|!Sig-J&47rP!W+EI#+QG=SM7b;RV2}exl9z#>CQFwt<}=Gg;KIuOJfx z+KmU%=x8+tX=<3!q8sFj_Mj1ld@B2_*0C1^7YdDf`Os0PdmAv<5=(z-te+9I{3%Dl493>O&U&+qeb}2WNq_;Q5Iiyl-LV2UP($+c=#AJ!z@Y#@e z7nj-m#}Vs_*&td96_!$Zsfg6BP25ml8W2@|xMHKGUaF!gyN4s4s1(A_LViw~z8$H+ zD1d#@-tK`=behOWLwrPlQke?MN$7^@A%V0`!G5ga@uV=-2472l^Gwz)GAu^tB-%!z zd&eW3%yfG?Zr{sw%1>b^15+!G?E3`Pw~@UE=!+w(X}mE**HkySio9H)cOFE~5!@eT zJYin;b~o}QVHIuo1axLG(IjRyoHCt>|2G@i@}IU-M=boV!V%T@`Q`>C17&!<#ZSP* ztfyjCz>iRtjiacIT(KSyCMx=gA)3pNn8Ty!i_iZkClRC1VLu^%w%44r0P2b~_5+~s zq3PvdH#X7d9dTJJWW-I!Su)5XOjN)-Axg~05^Y~vLc=&$*tY9+zQIGM{Ld-r9U z2mXS=^c2f62Ie=<0`qjy$NGHB9R*WN2RVW%=IwuqP7bTVKDm%V4?N}{oR6?c;v<;MIz$Rx4QpJ+8J%woE43{_YO# zT50-Y%fnIAU8On0lFW|0=X(7{=!V!%Y0~0Vux1iRpsxs@8DL`qxpx)Y_}T03@027@ho$DV4Ur{#Qa$G znn06G=0^2M4wzw}CMJB&hxgC+~figkFIpzej8UMwhyd#(>NaCn`~`WJq)S| z`oBV|skVP}+tY2>0T|#~yH(*wyo6g7hUhV|KO;D?mPf3_7N|A(_%WwU?EmtD^ZVs@ z3+sFCSGcI3zgyv={^LJanTrm?t=F!4dM~rIZCI$Z@wX~gn7La$A!+$mif>#y(#op^=qt@z2Ds*e&AZ!u$=|IDpTiME^)GEb+7 z$=&CDqR^!hLtoGZ@RP0)ugmpgQ*)L3*<0XQ`#yT<|ElmDs>q%v|7}lVa+L1#X0i{y z4-2q9W73$bCMNY$O=ZR;cB7(B1Dr@7W)PK7;V9~l_ET!E?yG}+n}h$Qa!{5I*RY|g z12>X~iCH60X?wfOeF7oKnj}44L4PMsq2pS|h&NeouX5bv)i}hNa_Y}zgkC9xpKVWB zG@(*u(CW*1O)@s;b`~@^Ee4|YPC@sR zl7=@7TM@n& zM9bB2A&w2{yTBAJwt*;~vWicY^}}j|-eCOEJvv_SVK;n0QxT|}gmEx*oql;YJcAV} zHA0NA72{gZtxDvN$I15S?S@lAva+Zu{kdalEVdDPyWxB8X!xD>X`w_UDIL^2cMm=Y zSSGI9+YrF0iK801Ftfc;lO>qPB?z0MyRfDC*$p{)$0IWinF@@(mbJE;zT=fJN-LO; z>DM{Fo2wgtkjK_TVz0Tnmhd{=xUF^-Ktq3Emz}v*zx1Vd=!*<=dX{a3WPR43%(tQ+ zrKzY-)5&0Mw`xMyy6N;-^m3`0y?s72t~BNbex)4^TiPBWCY-&=ZwoOY)dJ@b1@|79 zGBt5GNL1toE~Jcm`(;0T(+voJ+uMU+E*?Tnu+agq!tg0f+&rzV+O{Y&Bd3mDN?5oP z#e2b>NL?F)f;X+4>Z>>!f1>sNoK3BA-LcSfdR5zsH)=LuCs%>Hu+cYW-cvnl(j8|c z=3)D(?Du?_sfxg}&8c8tce6gA62eWbZM~b%M4bFw$e?NE)}OYstOQF2rMa|9@M7(Z zwc;rBHXGg2Uu=2`HkqUi3eF~`;LLvr3N5gJi?rAXD1-1JM+ zh`zg1H`Dmr;!?48wTZ^ROXHgDcbhFd94*JJAoB#pL}Ts@AP@llzEe*~NE(q(`y9z1 
z5Hd|5pG_rwmpqJ36I|Eb*2k02>X!h4%r+M;MmUM4?xs$65zGhA(vvSDKcu&q#**1X zt+QVtwcLK&`^v?!-IfL+XKIy;kzB4{U8%L>a^XEBNS!$2MYvKIr509B8w66Y-ISV8 zJvqdCmFRF$hLwv6i?Hv&5D3pgglJm@+-0)+SId@++B`xuldHQSNZns)7YP>8gy=P0 zV-6%<(Paj!bl3q_o%)&>_syOpD{p+U6c1D$ro2GTkaPPPL6(L6x3H5!?_^=EM3w_=(FGA#?P4^0a)A zJW#pII^;mEW9%JrXTW+&DqakB#}Ayq?{m{?qnL1_51Ll8&qDVMQWEOH*AyP8>Lnr_r+ z99q-%Q|6;o1Ceg5qa2s3S<&~yK-+#21hducvUlZzKvEa-yO>>9ers(D%G?CL+nv(Y zI#LU1sx(oGK0xZsD2T8A@eng*OnkL3ks2fG%u079^OLRGyFFPAGKpfIP=CVH3$l9R z3nQ!b7idgYxDimc_+uqY00_H_OF-5sTu3tWqLJG0fog18j(x#rtaWd)LQv51syzMU3_7kNEU4q`PDP z{?&#~JC^c^XdeRT(dWwwj{T|3%lSRMW7xb@=B15M5KRJdjfKYB9kY|z=vUnm?Gmkw zg(z>P9s}mwUC#j#gyyMF2-_b@h=#pI2dHHld0j8r)k^b-4!*uGzEbN`vPn*u?xSqA z?iG@0IhlEr1COo`p3Rb{gMf^yd&`;!^KaMYUuaNw2Nt*f&m3`Th?{H7yp8f|BN>An zcfo*gTY|LwH7dRAS(;Y>$XwezmV5mMiy6Zl(x#vp`O_9 zu5g}f8@EgNd^?}rHsR&!Lf(1#hAVkmSgnU22V3Tt36iXc;)|89K!i%sBe%y@k zZj$y^y)d&klC*wNZ(>(c_OoZmMUTpW$at}&u(g~aI>$EE2#gwu4c`-wmFNHUj;^_Go zrt{+b4$3OQ=kD79;Igs?tUv@qqBo1ycBVh>PN@!SnM1xHQA%{)E>y$9ba9>oznfR( z-hCQ~vJw-X3Dmk10ISh!QyPT9dWA6^#&4=^C5{9WZ5_SJ-Jqg`8?4rBS>>`I$n7X- z#>u{#`BLXb!W4??nC>1xKzgx|yoZ=WZr~KEa9Q*#p3{21oac}|x&#Eo&)3s!|2!|5 zc_~t4OaF=igk?l!v};M`D@&fIQIN`yQ{{UXDi>3TPQ~aM*RL?X;WF$EDN9zmVt2c$|k_)!Y0YS|kdqo3pzo*w@6TeukD4{k1TC zVjK!>F%xxzR1>g>)x@N0OgkEitlI>B66veeB+xd+gLf%yN0)D$dX7?$Lomi1qR{4Z zw({Oon@doy!VpE1sNS=udnB7YC)@5+)z|C5HP+j+&IXZ8&&QyiP)t(?I*#wZ@FihB zZt0IyGS*&uMIU=PS)rA2fJ&LFSm z#MKh9&l1yhQubwE%EFc%{$FKkZY*^YX#b z-R9MKs*v_)G8f!iTEOr!T`w<%7cDB>ND(YtLo%SCr}z1ajh{sNHrjLhFQDaXL(Ar- z{E}~nXOy4lAXCEDrBGonxgFhTH{LzHTxzn=w8pna8ohiWTVAfN-&vksuAuDzy=wZb zfJXEl=~eMTAbk``x?Q28=tlulkBg|#_j^Yj|6hCW0^UY(^^K1%>r0$C20}zMdXx?>vavuO}36_+njsy}@w8 zk%%~g>-^oma92>!3i%eSjl@>R!6mrB+us(6`MX;~y}DCD>t2wcOKKz<499|93;gkz zf5Eaypsy!b;*Unbs>2r#`PFzL24NO1RF|p#NEqMx{0T?6ucyZmjwBphBB>TffD7zM z^hbmIqA%?5`TfD3V9bZQaMxEH3nJ0P0`Lm>7W9NV7DW3K-H~uC=`-ILIJLZMvn6k;v+J6C{mwq7%5B z^+L?Wnv^U<`6X%{Q$yiU!lX?an(Tp5-DtKr7b>80^iE$m!PcPdk=O|xkx0);ywgJA 
zcrcbweSPc3U+WC@^q^iM$m(rh6lM`%3mq*HN2=Q+F?4ZfC>#vT$|8sA<3ZWIY)*Wk zLsg@ZxTAP&cgWxE2*n+9&hDG*ILVRd4#nr66c3#pES{Tnm9dp!Uq?@ntQs|9Oe7qB zAIvOe?%0>Xk7J0$%dN%{={O4wb`&2i1ao*#Th!E2?{ca2p0?ViW>3B9S+QdI3U!&M zwY7G!r`3Tr_#?g15Go3V(VULHE=O-95Oj3%)+KzQp7>m;DXza*5JE)!nwlWN%CIo8 zU?5%HqSM+ut!izn+S<0Fsbw)**9lI2u^_BoPv5_&KM|~r#eDs0ZM>BatqfXfpiU3I zctUF_x+&oc`-6^1rx;<3)KGWaapK!CCm-`~*8Nhk;%s;Z`)}ODcVVF!i3ty!U^E$V0^@feLap~EEb6^bVU0*Lg9eR zwQyk|NOrYwA-|w@1{3hRzHk5vLBo468Wt{$1>+dEG%U#IVOl;%JV;K`kqGq$OB~Br zG%apwscm*}dq!=H)%p6mx)bE*`ogQjk+oq*7)($Uel8eIgd*4(Kb}T+5j;*jE}5)2 zHkllqn@rN1=`O5XYaRLsFf&(8i5FchOEC=oafyCDX^f_2MIY{u0X!ySX7(Z? zFyfu6QNS4?xIhtcbxoPt<@c)yr13-maa;ATTj%Qtxty*FRTcf9l}g(oMU6BI6zYsc zdR4zK8b|bp3DN-e)Qi%Lf?Jw;A>3zQ549-ai1#7p#p9hw2Kv#Zp+txqGuL$j-YOAv zw6(T6RtNiM5wE&bLGDrMvS^z0;+zO{y}s2!q#eElvX5vC`AIC%uToM`j8P`+M=~If zK^H{HeU_8%n4CV6B9u{A1bu=2;<>X#eOeyTi=pa=Gl?5kmbywc^7;(R?V~cSoCsVh za;)I~+NPIRvsE;MS8v1yRq(^O-$lB`^&3<%3KQx6)(T`RKV_L}M_;H1emV|YM9_q> zz{XOgs3+3jgk-7jn)CDZAfN{NDTy%&bPRnYL!At8F=Uy2!G&pjw=eGKMb;OJ!m-4H zoxxZTkrDaEzc~%Su<_C=!+WJt&Ye3UN9q4z~cLQljO2*xs4Iog`j!$PMbEl$vg zBW1^%vY{1dk7lxmVgi9t&U?mP|kD2iUvmFWxQC}>s4P!?<)D`w6kTp4qU30VI z>3-g&fpOOoSh!GR!;T(E7>vO$Ay)SK)~O-^m0}@A@^TR=KB7t}&XbXF zBs^bhiI!;|#n@4fwV^~e0(ANsQZj_6IE5{SR+}uQ2&j5i-9-Ph-F=xe41dS zCZ7W?s5=r6B6LI`hwzk00|My*B8cLYc?(C)EjSoB-tq4lA-zadkUaBjndw{hh1!`K z-?A;zRd}?t#Sx4n79kU$utZZ1n&Ih#easg|!iV&MJAQ)!M!rG}QxPz@Dh)KgHt1WO zb|5kUX$~zjUh2jpAAlJ?NGmcQGiewP{87L`u02i_YZq1NtOH_3K}OIP3yFkB;>d9O zXgaPj(NsmEHPp7%HZOFfh7rxnh%W5!clf)5{?#%#5Q$hR@b?6LF*S-visb@CG@g-Z zX%f?TJG5vT38y%OgKN{)L`@Ax99lcF8_69KS~{o1?R3UVG7?WMd6KNqp zMwEY3gy0q+?H`UuTNjDMm^OwPbuZQ~FxQ0oXE__3VW5K!Z3fM{2@h_VjC}Uv_bWq5lo!da<7$Iu|nv= z$_$)`464BZCQtI1Frgl+Tm6wfifia_nh3NXE!Te>CurukAfD(c={~9bWXJND4ojNu z39iGHc@xsKT6g%U$UHrHcPtW)a3rQuf}RkPsrczaeF-DiZgG+8gEOLrv3_H_U>|?i5pF3vw5KY3bhq86I!4O1`b$= zblt8>nSjz#o|xi7Ufp%4A1fv*mn_|o^mVs0l`LYmGoDhkJx%Yb6php+tzheN%N&|t zWDM=A*0(Q!6;FhKrUO$NwA@;%dRl!=YdiBq 
z&T?mkv(j1Rtj7G%<#M}9U1hFvSB0z6RpqL7JKZj~+g<7|bC0k;Cy0Os94mfWE*b`{_>JnG>X9!pjM1^}-DJokaIBs~Ih8(<<2Pmeq}mRvQ7 zu^B%E9`Js^;zGtsA0A77h94s72K*U3@^--8fO(IMCD&mCZW-XYc*Y^`QP3aB*j~Vk zQ13N9>8sY1AyBBuL9f& zxD#+UApMTlUckM8dA~+Ikh2PKHsC72Dv}=%zqHGC1I~k7(?-UUKLT6@_!i(^z#_=G z4I9v|B6+d(?e(HV~bb!V0q2B?w zVRynFz^cDO9_+2idw(q14!CL$>I3Zl0Q~^C9dH`9N919vQ~+?@hhxcMz}^4Ab^89v zSkjAU*YZAvod8xHl1%Od+>Y&-4E?_D&}6b0aMcVvqYO9=8~pbI?mar0tim&PTisYY z1YA{`Ozr^O3-}peV_7okz;5PlzT4geM4`5YQGWk3DUY$%99SS~xHGstn zlgTJx;|a;+Fks$E*mnas?c`)~8th;fU_D@QE%@W-UUmaE1MXdvOl}0+UWamk>*`UC zzBfSrY0%e_WO5zg?xo4(cEIA6WHJvMNw#{E$yI=jt&j_F)j6nV2I~DzGT9Bd{k&xI zCBSJHK)%CqeG}G10UIyDySxFneHU`f1m9be$<2U`*n_kKFc15dr(hReF`yH0_YWW+ zVB>HyxeJgEG@l=qvsW<7x+2Tr6La$h;Q(d>3lk5cJhpc z$Cl2f&%34HRf;{&SU2jMQlX9{o*j=?imfH&gf z08Yy*2sez+OyDqV_`zkwndNM6GjBoe4WBPz>~O6eP&04KUW{R?&mJK~7Z$Y?=J7hn z*?op>WX~4}POmkt;CTkn0 z8FS(j#kG03)<)MF3X3*d>kFrCvMnxjL?#qYt1T>ARG8-}Xw*(w8OYwkryIU=6QKQv zsDYQOrE;}}jt%zu!glM5355=D5)5h!c;C@0PiyXiPoQ$wqFilZ`v&_a8@)q`_@034 zL*RR^cHMw)qXFLp$|W?u=VbEb?Fau+TqnCgokg{UdEkut@RhdW-Ge%PfOb{1Ws7xD z;k3=vmQ83&@dl;7aKK{y$%Mk<`a%bqxkze-`g#w_?F8?`sC*sDgLH$k$-ddP#Y%OL zpza-ak0p=6In}+%>f^dE#@F|7Z3r>vnbb9CY=doSp~JdZC?Aq8E?~rG8e(7lLy(hb zA;4y<^;^VJLD62ldl$;Vd;7B=7pdsC6i-&7{W>lsg9f`v%VKPorF6R5Mv&KagSK zb7M)eZMK|rGR}6K)=x53Qv3AJpZ{MVZ^p-o^6#O13;N4`CHO;g${ncxGvpa(Bi}KsKiw9n-1c)b zZZ^NwP=Y$tL$+89{tv-^U&J}pBlQlQZL&*Stg-sCY;kD;s{#4#ka5>bW67gQ=SO4I zz!voVCd&_6Y)*(*+@P~&j7H(BwxRx?;W3~;Q@?Xt;$s)~Z2LM2$$`^QaBk|nw!i^W z2g=cji<0^gVnlZLSbHQ zK?D9T5pDG%Z`%yIZq*~`G zNO=oBF7i+&fZsW|3-#JRMKt0%M#z?amFsvhEMOj~Snn`u3&^MC!MDAH4Wx*U{HVeI znZS(#cPzC9W9>2;CtGNoY^FZnM7F&l$ChK^3{wMx54T^Thy$M?$o2edW62oK?dL!) 
zp|^TyYM$}UUzpVGCTY;N`P1>L0y6A7~r6|Mgf%LK+*FVJd_44`_tB2YO zd)Q>Ru7aAxAf;i#{rPUt*VArtqDM^LU|q|7Gx>Tu5{z}Yc8a{V$qHtmC!1*p{1M{S z@$eh=r;v1l@8uM;FSDh`czTR2E*!SoR@n@P5=Mj2$wS~d`;M{XHB|Taz!QZw-He%lMHql%Nnza8+#e6g~9Wxq5%r8P)z6IV_Q@L}9cPixb_Be5U zCgx9XPPqz&xf|{?c7f+xpnnA4?W&Md+8<=tjF<`gYa;uzsxWAbjkNs{ z52|w}>ZE7v&n3C$3LcxSEh4@}t-DdH2yYm>H1255G>ivAF2p(6=_cyu4HOHl0nh@2 z{Gh1-NRiZ#UmQRk6EXjK2IuxSF=^3!lg!PmeJHt@2aLr9-`4yTQb^FBi~5F8b}@}F z-Ve*o?5e)dYrWdgs$k*GD4qvix8q?X`U!KpD&)}jYdWs+ScJG{eZjC-kw4Iwm^Kmq z1oO!{oQpBLo{txjyA0R1;@ZC|H}M((uN(2q+gYRoZ>BBs_?aGCl;*x+sDq{gu1}h? z?!cV)U*$#PeieAVg~voMq;YZxyj0dWX-LI(K2DYt?y%Tq7>^TJml*dwSW9rcgI|us zcVQ=s3u~-rOeidZyMg6Xy(6f1Gic_b6YMWz@|;!gxbeBeW?MDMuqXLgB408a^>$m5 z$v(1;jiN1^$zMtvMOX;y(|ouRoG{qMk{#OY2wbu1j&3lWX4#)JcDH32)agF zGWi1OYPsNV5Z`Og8W(x2mt>)k`6AcAXBX9tbtZ}<_WQw;$3etAnk&^31kMh;mm^IU_sbxoJPzpm*!d zJa3})nh#NSIhExx+l%nKw7`%5)JYDY&jo*nBbme#o6LS5d=95u4vPAM26&5kzz>0D zCTJMho=@|SseXh{lRg-coosak^zVWG4x%s5pyzRKap91~`fa_nHWg4klt=`QN+u7X za+5O3k+0h>bc3`V^*BLa1^PCuAH9Hc`}_E^jPfil7ySJOJDE6ne3XEdi+IulGBz7) zI8^2j_yV2k?R+JQh_s`Y%XA}1yr92~=${t!vJT!BXeyPrlHENIl3~zKA-lUx(CcF! 
zw>$EJf6p>INJzSyh-MFdJ(;B6C$kS`)W_RIMIY7Y42a;QyLyzHhP`%0RPMTrax~U% zf|z1z2Tw+0vkakhFX*4Zx$t8=PBuc%h{du$rB7}L?Pl!f`&VN{H>!FMa(a;`CdkkH z1BSwLF+R>=z~kmy^@W&G{D6EiVhC~<BR!lzi$2)1nASrbjmss-7knu_K*!WK-Jrb+v=g9Xp)+o4+zzb0DMM&1u%1^q z4R!^)BkvgBcer<+aV4#2u+G%Cv%}GQ&1jj!1N9;B#OK}m($vboa3kG z_Y%K9 zBG5tK`^?plTTC$)5ZM}R3!gkpp02|BH7(TGf2i@~V}q}2WaUzbUN+m@DJe1aW22>j zPpRdo(Ki5z0mV1G%!XC@lF9q1zfZ>4$dW%@7tb|mspq^@Ogbd;CGq{ zzjVI6xNy*Beb7+Riwj7X=YrqB8jLfTfZb=pFD)NDh}AY^H)PmQV0$+QS1_tEy7e|9 z=VsfG54Yj|2j5l<`;c)IzAf!M7ZtQ=(T9(-nJ6=WG7nI^_=bsm`t}Z3YXk2dw!5&=OPW6@Uw>}=9&M~eJ(Np+}QSdzTGPIj`o@m08 z>sZd~(1uRp30&5Eov($E9h9LB-MDA8zzv?28c*&Yp?Mic^?KQ%SZ_AzHquEw?oI95 zI+i>R=OXT$&UHdxlSvCp-Z0&gH}8?KQ`Ua+gE9B&u+uXJ8ywOnaYuCUs6TPEQ= z&A*59pKqDT6vZ-CS*Ey^TP>DGd>ODPPg>Y^%UnHMc7nz?X_1dX?(!adS1Te6034azU@Vu z@?^f{W4m%kzGaJ2{6N0t>Kx^{e9KRAmFw~?Be@DFFUV7XeKd%;&aSF#vSrY$wqtEfW^C3|iO&7NMybC4grr z!&bJWWC}wE+@B;Qw zj`Cm$dp<|GtAyQ_*Zbjoc3Zyk?tJ!-d<8UDPEdgPX@T}{<@sXv{v_pz`Rvz) z%Kh`%p9_^;#q6n~Q^54TgPyaXwwDi4ez}1C>QJir?WxLLbJ+ujIdFdCH07qbY{T?# zs|H-kW`!1#P>(_#m)*wwT2G+i^mPQ%(bnDYskMPK$y>XjCe9 za#J;97g!Yhs?fu{H*OfVSbS~D9hT{rT9lV8?AL_dVx6_a%ATMypIK*JW{1jW?Zt4h z1M#eV*3$~RNtyLXKKo3W^==OPZO-JOe0F>8tf4&iN$#wj`RtXv!fW%{Kl2WMDxW=& zk3th!!TJg7^VjFC&t0FhURiHnZ(DC&Z&@!r73Nj*5YO>hhffZlA=)?8*(3 z*hPx+-AU|vr2+WADayqY*=0Ej7Rk2eD3?!UJ93Erw>iou1?=xR%5U@8=Q+x+^4T4^ zgnuBHIAg%z{PkSruLbP7yc6CoU~lCqpG{yN9R^Qwaz)UaYW} ztjc#4w#}vtDC`p(VLr0c|JM{jdW$WJlM5{hb~apXQJy)B{mP>J=rH!NW#+~i>~X7d z`(f-c+viq%`9+R${d9JIuJYq)?BU!>UhGpa6Blghwn}4`=Tc zD1Sec-8+#`PffhXa^DoTsZhCfCVQcf>iAQkvU4W;jGx~ynTWPeJ_P^Ynyg%X82f}{ z28)!(rn38rl%GyzKjY`G6)E8Jk0RyWDeUti<&RU?@IlH&Q`xT$Qm&ZF-s0zb4pR2c zWS1SRoOcAf{a`vDK3Ku-kzX9FTs(!1^7Bs*rdACeqC9#CyYmn_fAA2Z**FENcy@~N z;S~1z6q0|#RAT?hROM!nA4X!{HcfeU8hdOS=^!~xxqmvldb)DwboSD8y86y^<=Gi5 zIbC^T1{<28yf}kBGK27M&LF|xpK%bBb>-m-^z+c+%C5uNFAgWgZJMdvJ(JxylhpIt zOuG8#nbh>Z&m{5Do!V#OLl)%?E4$FD;Kzo6FQi>ZRmx`;b{QXLh>O!^CNunY(zmY#UH+wUa`&+Szxl%H{Y^QSlOT7@$|K&0UTY2u3A%SJnUK$bjeo$|hw{n&EMppCt6Q9iY@C#?$3pSRXuX=j&7Du){-N32co 
zA-0*nu(SVVQ+{G+SK1Y%irZ*_q3#cDmUnDhECbdT8YzyvSqmjKaqvqx&_Bjk{3A_79 z<(Vbyny)FZ*0U=d3xT=zs8&$^*a~Zp?O!*!#mCEQ+_Hw0y3bs@!!?kQzHC1!P!VMN&xZ;HQAo|6L%16uDA5S_O z6}){i728mwJg|seS3~D_)Eor-lQmTAxf)W`%QYm{o*HW3MYUAH&9x+MvX1cE>d_T% zdXz^R*ro;rTEC$|d8&au&_FUg-lV*+gpD*QP}c6IgTV6QCCcy;cFPh4J@eKQ1?szC zDOGhn{x4<2OG$)hmMVW+%3kB=SGFjxE@yYOkjQeFk@4lvK{||)BE}tZtXXp~fB&}a zPP^rNt1_gpKUrsRX)gJx7n3Xa@a4(f^Slk-8t+DMY$%Iy}-}^ ztPuILIheV>nZqyUg6{Y94=rlj1dsBgBK9|{g5~cQZL{!y#IEd^%zm%T#{c(nlt(AC zk8)?@|HgdfS4C{ogxUCiZGrN`B6jD*+4%qHB;~ou?B|7b_z&OC`Ce*S08xKqQFhH_ zH-q;P>>aCe=}fl8rkp<$(dSW+T|HsV)*}&0lubvnCnin7{|%Fs&yQe_79EEF*B-3A zkFtkcW&zkUn`WiG^lo52n2Ypmpm{$N#pj4aTuJPX%kKZ~)S<|q&6vC$mm%{=xIKff@S z$dkFpVy!oiE?!Le$365998W)H9b~%4jaT0q!RkNyXQ2{u=e{mOL!LzDjJ#Up=H)1{F7^ zBiGsQvHa=G-1GAOYI#e|eR>j%KlklINoRWE}H)O()^E$Y`^+u{=mVb zMD*a4I*B2=FwzAiq ze&DYuYkA^yc5OM)JX)@dp27ZBuH62uNq?+Re)6qJ1C`1jzA@Hpe$**5@q2+sR?1?&R;?6o{?-}f`b;|2! zvW@l1OJ}l?`eV@0n>@;UXXNf~I0iKLG%B~9k^4-ea{n2*ukrJTo0OYD{#u|LuGPnP4~ z<9VUO~JETb1ii$-S{vd46T?$F0iX zs@#X$luJ*^J#Urr0rlt^!~p&o)NiQHf4>(cgy;<>+L4^K@&d1CiopD`0V<$dd=W~ zl+P>m5S37)cgXKrD<>T(u4vz9WqmKp`d**)y(R1WH?zL4mfx|8&5u6${RsKJl{P2f zm?!`?gozWRar_v(TS$f_aefR<5cYJq{9Ys>mbm#bEUzOM>Y_lJe1dKOMB5PKPAJ25`YpvSr0c!RWv(M1Ej8OI3(e)ggYc0k#JPPJrc4GS-*r137rzwNa&TYUBak@0}>8OI3(e)ggYc0 zk#JPPJrXj%tY1QhgiZ--B=kzyE@4!{0SN~s9FlNY!W|NhNH{9t9trtF1Q4MpAmngJ z=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&p#t`z3Tp=#;QVLa&7F5=JE)kZ@4K zAqj^i+#%tJgrgGfk&t!D`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&t!C z`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&tg5fe1z2LJo(7P6=xy^h($+ zVN}8a2?r${l5kkU9TJX6I4a>D30X+CUqXk3P6=xy^h($+VN}8a2?r${l5kkU9TJX6 zI4a>D2^kh?=qQrVA)!;k8VS7;wo4e5a6rOA35O&cmT-rJBNC2ExJN>^TGlV2Lqey7 zH4=IyY?m-9;edpL5(;kV|D%Um(_akEECx6u)IT1`q(pwekAbQC=5NGHkIQ*b@O5k! 
zSiAlr&L_(2MO(y$ysYa-Yu5+Fh3xVRXJ!iC-Y`dnEl1 ziN9Imhc^m@oK~{m+XSB{q?~U^`Z0+gy))>|6q=w_*&l3WkCGk6k2+J6M z&JXZzi67m>(djTp&+E^oanem;%_`# zeAlMKO0MKDJN#{Nnaz=S_c;P_wZt!xcy0W?A^Aro{#i-C5&c2O4HDlh<^PpHvqNEr zykEX65VQxFj3Z=U`&Ibymc(o82FFSK81NQ$ID6$Vp?Fy>o0Ko~qxrK@Nk1KU zBm18XJjvtDT95d;#1CZQt0g{r{bD)rB+sradCuVYLzy<;l;W`SB_B4RVFbL4(@$aA z{8hVZCjY&h{$O_IpEM0ljBR?E-NdBoz^Jm(85?d3vexiRAbo6Wn z9rSw}ydUK{fi`d7DDj$~c1W>qk@!8i0>Sr@6Q2U&Gnr}gcg`ID4hEj|xjRcvY<1@N z(JVYYgGKU;GztE+FO-hEIQuy6iaw*UF>Fw zzgOZJ--wQN2r0ihS;|B2Ov6EY1L?R%;+@0ddrScKB=FSl+Io=I@2^PuXjZ?|^Zz6d zqaP2zLHil$Xp?yFBjWor0bvC;>h(9NzUk< z*i`R`^bcNn;U0+}{HZ`lx6bkoCjN)v`7hxgG(HCbPkh?X67tZVQaWlSe)K-^9TM;( zF7aM)qvHkspWy~1@t?uAW%c939ACt=^}eGd{cDm>c0VR1pJ*1Jxlj~u@9zYI$0eT* z;Hlo>Ec){${qQG(@W+z=`<(u8wkM1J50c(_uOOu7jp?Yuf)v&3l=i<)pxLPs?~ntR z_IT5Ap~P$Jh+xZ)+kq!JFUqR-DM?>r6&2DmGIZo4aP#(Njo%81FOuuM%LO^}NxVZ2 zEZLoG7w}Z?U{<{!N_yuqQ7=7{K*zy{iF%o2pb9iQR^lCx3WVmbwgFH4v-9zXCH>%! z1idB?{ay^U%ln$Z)3bYYL?oWwFTNiu@efLT_Bee-<8!T`*Yy06#1|C^2AZFliWGsy zwYFXkGJebjp5$ql`uv*2w{ZF*HuN*Wr&QuEFrmL&;TucKEI;r~iH~j< zbo6{H9f6sGUR#GhNub$3fhRp{@l>Lj?Fd1?<2r%izaN3q`y@X4slZ<)3t(@7uq&DP zXmXx-q`(hP6N1n)9CXAaUb{a4PsfiBHN31>x(PN73M74M^A4YESCz!K(<>Hn@E_g) zd;xfB*Y+&?{6M46iX*cflK<1hg)w@bl#V`!=-)g!@2KKf>MfP+dVQ9_ zYxhYI*7@;Ii7$Fje5Yp}>6m$xppUi)K5xo++M?m59nKNtY&Gy?Z+oRYPKm!v(mQ3` z*X;A`qh-4!pV^XrNa9BdMZNSa5*-c@P`gH@e)#V)0N(&S>2n}UpK(b)&Tngcq&;BR z@?$z0LVVU`@u}nZBBtGU`L?9rF6nn<(T{NaR5qHGC%g$f$)D|?^S&aUl-r$*xCAn`S_UG!`R9g8G>NZMhWK(o^%p8Z%L z==pIv{wnd>eaSmyy;J9kdUu>I5cDh!9reI7b`YC}Et&e`RN$$99WwvX^nA9&SIK-S zPZlaXmgr$ua(rq1zCq%PWuBvryIX)KJq%>o`41(&eT9&Ro*|;6;5cFbTeIx>7>RF} zc-dX-REh7-qR&BKB00775lznXK~M73OcnL|WWU@i@!5Irz&t6xj8mFDUjsbpO}lTZ z>FqH|KO*DcgQ8jNV~HP

bmr1^q>yIsgb&)U+_3w)6{&G^3oc%*9Mj)zU~IVB8h zC`D{k#?vxc@+9Dm__qRo6xpp5yiw8*n$X_}ypcR_nb3c1LO*eVu{<+P@biH;s&|PA zUNymAWrBYQcq9Eh13c+xcvvV#hIw|7(^#H5;Enj4Zh~KJ!sk0C^p~38Z#Tg|YJz{- z1pfyU{JSRj&w;-cEv|W181i${e#%|O`neGJB1xkMHv)f<9wE;kGvV{534Mv%xZX=m z@b8%5?MOt8?Cmh%jpUzeLf;C!QM(c*^iP`5|K5cDQ{au{IS2#a2tNmSqjpsRZzN~P zgwKEp{Y}90@$!J^KpCFdyTBXOn^$g(|GEkO8^9aMIRre_JMf^W_Y`SAubc3(Rv7E& z5a2nV#{`2q$)^T*Bl)i|!9Q%m|K}$7*MO&bN2UBS-Dh(vjpeK|!8ZeMBxlHkzR!gI z*CzNsneh401aH9v!APD{P4MTL;I{&AWY2e);2-9Eu&#BX=)gl|KR#>1e@3-2|6_nR zl7BJq+)jQd7(XKUUt>c5kO}^I;Eme#I}`eu3ytYrz#GZA1bCzI(qqEsToe37Cj6f= zp&v29{~mZu&Bh&{nb1!@-dO%}6a1;bQ#@*yd8`(XE(hL7KQ9Asr2jWe=ubR>F~>OR z89NntBmVus8}Yxy1i#G$zr%$8izf7cF~NUqf}eUK<&9IBH!II?0p3XdZ<^ryP4JhP z;BPg-KLosy-Tu~uemC$&`akF-WBk#;8}V;5p-%vBq=!u=^gl4cKWW0}1rz-1z#G}y z$0qzIoNV0Qc_#QK;El#(0C=PJMu9id+Z87Cw*YU{FFQ=|@0jpmHOBHBY=S?=1Yc=_ zUuuFs6L=%}*O=h%Ho-qSK4B&_8B^|BVUW)g<_jJSO;S_rILe&CJlXV3(HhY6p*n$Ul0LjSep#`RX3;Ol`m(%W}U@b{VU`MC-H zMc|F>fvKmvobkP`^}O;o9P>pZ>j5pVNq z#M`zL^ioolzdNuAZ>?%+RnbbifY(|Ff=#W7vZyZ{qZ9ZvlBcwANLGSsiVIhfbs@wR%@l2AP68k!hW@xr!HBHrYu3j7|Auhp}>wM@?=^`0}ATmN25x0AnaS9>v= z8pFG}V*$?^UF}O(q{>!6n}MESWgzGWWoIT?Ya-pwYP?#p8!tu+1T$z3ydijx+cNOM zYYt<5{zQ{^DH^w?O>M1GTN7?4-lvC`vWLCE5&q?d?x#ULNotS_8s>S*o&-3q*^Ev-K&5AWUG=h(^6FR5EvI!0t_oWLE7 ze`8$QRD?Dwzo<5yI&I>-rM22jo7LvpZgt2Hw)&cwJz|b#6r{E2q zcn`Sh>sv=XQ>re)dx$l2?nw8ALFqD-?hN(No12Nn_~N~poFDC&LRRmPdo(6~QhH0K8}0nSD}`8<;H!M`{d7LCnj zwSOv&Vftl{+KVkqh4WU!LA+Zxkbcu7BATlr8`&J9@YJzlX`8xYDU1)nG+jcCCStDA z)T=7#t6tIBhT(Q2x~1RyjdyZJ`|y_IpeNHIuSgFNZwpkXa}nfk%VCPT-u%B_iB#m` z=H-iOoACznz7DmCyf%!s1Q8S++nQ)b?huMl=Tkpd%C}Q9e++Lc@9XR=@iV+ZRlf9C zP4ue%o(L3;7u*ISYFAIB10Fbluob7_kGJj8i(b(+G`x%`FxtZ*)fbET`c=I5IM&ZP zWAs+@KwocfKZ@wSaf8etRn>+SwaYy4nDuxKdi|*-h~Idnao4(aY7}pIrFX!F68-8L zO0>q?j`>Sb>s;g@s1S_gD}7yPz0

^_oQ~A(G~DGfj=`31Qg-SHBGLlqX-(jW#Xe zS3GOvhT zC>X=(i%g&;q`7PN$c&?+c4IvQ{$F2l=;qAsnV9E*`uq<2L~LFM`B2a zorsXySMh$`P%vJF1WolN;0Qdty?!6w+|B!^?Ub~nhVPv{Fng8GgxFGhk<^8Om!@n$gT5)o%tSCTVnoEOQRza*_uR!2zBf zY=R$CQxP*WZqYavcAe=>JS%&skO;3Hk5gT~4Ky|~+ zR`#rDX?3fv($vHY?x6<`Ni9u(DBcB!L+_+O&9x!0B$V;q&h(ss?uIh?{<>zvfp!$Vp(y%Ls+NgwA1n#H-yi_ zpiYl$(LNORRrhdp>hL`P)a=RZLRYEwf^xMJYZM-y zjU8wz@5!)=GI~Wg9QgsSDtJUzC{n_d!85h~w3gasO?4TSWSfAioJU!teQJ*nFO5$K zH*RV&E-W(ayM)0=uNxQbTxI0HJ9{E)#oN=fdc*9+>G~01P)~ML)pf7Yo80x>_CKB~ zV@S{=F^ZlUf!kFP3S$U`0?6yS`g#$;G8FEr>I(I!aX+S0pzwdYFBA)MeHbPs6*M4I z>4+MQA|r9R^@t4SsqyFU3Hrh^mk)U0#17nuG8mU4Z$JvE^Pd@)HEoq4+od(GaGWL% z>6r);sSJAk=*sRSWDG8`;v(K}k6uaVIV6XF&6jCTmIRun43jaLyQGZXsL#@iYt*vz zoG5K$TBO-$7Mlv{6on2fm(8>~cOcU13x!j|2O?!=4_V8g|1YhoRrBcsEysC!S1j#i zDOWAlgtc|u%#PgWVi@%)%_NGYn@m^#vn@N*BDJ-bmh3#EW@rf&#?I= z;|v?@Ofzh;dW_Ca%{@3H!?{Nl<{lb746Lmwb5=<VY#lm~ja$F24 zxBk`^7TVgD)nN$JI$B07d1aL&Rx;m zsI;_6V}e^m4SivM0t?J5_)WTcDIU$fyi2_qLiW-W)4Rr_O(!v_NW~_sQu^q|4z1Qi zL_GLv6)cB)+9zl#C?&0@@0F6?^(HT8hhMF*7=mmtWx+ z*GE3w9goMz5ryI^HeUp>lB##x?r6VE_ObVXI@nba3v&IZRF58#8rF3a-GbhKH|?Wy z(ixR5L#>8i7c(`{2Zqx}x!WZ(@lCCz8O}Tqioa0ak0r=5T8|U=MD%H&Q`{Zyj754g zZV&67PC6od$#og5B=Gfsui8O~Ab~a62v+GcmTl5aGKdyrv~8_lwXk}P!yk3{uw2xG zn^7dpDVak)b5I&BFpU)kK6kLCtK&(PT(Qj1T4^P%wu&(cf6RSv{i814D?e9=q~M5JA-r|0s8@BF>IjH za$em;dSCOejHYV6XEJ|JnF-ewH9Sjw=%Q3knr#w%IgRIrUpn;SzNL#Nm((Fac_*_B zK`Vya6xdTo*V3*UF_o6uGgO(*WDExLzQSV}Et|&Z4s*hXSe$l%m`p5gS*dy&<=y$l z`W1ZS=&aIsL0R9gnJMH~elx3_6i4^5>F)AZo19an`rFI8Na#r$7nGhr|H8bC%+>9J zv%y&8k-d}RCq?udf z*l7_(RKpf;*e8vkre^(J2e;ZB@vj!!MfyE%EZlkf>qGv8idBrJM6kD&yl_{RbmjWZ z&K-T7=KGz+T+`bRc3Lh&b1hLm6v4)57=PAIX-_%#%Cv9Ntu{sC^r6A1Zy=;x)Sn2} z@;z6zaVaGh0|I?fL8rP)+$H6pqfP5-1JYYM!FRD$Oszw(O$6O=^!{}|wF8^M!U*~Z zxi!#X?V={tRZ@y(2e4Yxt3#DklP=0(9{%p~KH3}aLU^V81-DPn?nfxWhQsmu9fbnZ zTYN|_rtOgy)S8I)80iPMBCy#SYBaGH+^%-2w5tKDTG);a8>KCA?lM(PbjNVNRln;* z?d(m!#?RKvQsQ<(P0KsZ!lr4gfG29P9wim*?I+pPrg&>#2Spdw8|-Pi)@jg3VnA`jtBXXjX%01YQ##a2J+H0|BNg(m4h9xu 
zL`%9>>la*Tyz3Y7v7Aq79rohK>3IS298G6=JZrrEvoQUm#r{wicIZfeaxPnD+f6}fsFHJj>wfwk@kLo(Iv+A*!k>_Yam%-hO0UnBHlykYE}8pC$T zm0_CM)4T>)F__duJg)k=<74oZs3Jo_JZa0wY-l*37vj3bU!AnzE4A6^i|f1u@zGCU zv&8?`{TkNP%T;o2(tCIH(ZeRbjz}z_7Y{>PrztmSs;#M3Qx$iLNaIA0#rrY$=oLdQ zq!+lv#Y1VvXp`|P0v#ea;(yTBYwoFyM_kI&eDR^H?E)P)Tuq`$TuntrsDetVi^Q`~ zA`yTK)1G!n@97cw+m{%$;^vU?I7R%yZUk5fk`;vQI5I1Y=5(OtfjS+~rE9Mf$(jnM zFP}N!!|dM?6}biRC`0kA8ILaXMN`8>#GE>s8j?%a*$|Dk#wM?3LSnR~2oF4b@LO83 zV6+Eekw0qE+uJ)nk>eQ(H&dO;Ag!jK7ObTpuYvj2>5{CJfqlV8(=wmv@)*cLSww2U z==~EqT|ZS7kZAzRno7QF#ljfNjeIjkOfG=^n>-cnmyxT=uq5(Ebt~(2-iYiB z>-GHU18MHup||zn=>r-Al#YKnrO?i`U~Em$gYqe#4UUv=c@r>lZed8Y)`c=drmFUY zI`C8@Gy@r6f>2yAkO~^5ZW=ROJ|uTsKJ;-U(xXj+N=xx%XzHOEo*c`^=u$}J8EANQ z>N`z9bkPcG0)m1XjEQ#NNnxXXv7ovp6ieV~zRXdfE3AzM9#O|VN~0?V|0CmWX7ZJ3 z)n6d9>^06TznmQ-@P@j=bep@3=9$eQq(gk{Q0T%#u_^^1gdSSI!kVQ~1j5aF7?qfA zNop*mVSYec@ZkhY)gFBtX?D}pMvq=}p|W5MH`M9Lm^Pk8zzY8bWNbyKqf6NK z2{#h-t;T#_g)}RIoj|0;UfI@Am6F+m+k!Nv7m532De1Cy#p?S8%%E`f6kkuDbeY@? zs;8Tvi&hBG}@mUwyrX6?fY58(hmz;qf=@bAcQZ1Jsi%k6W=x!9E0@d7!6G)uNxaC}$v)F-LKzm?84G4kMaP*e@5C@| zA)zU|$aS*f`zr5BExrrCh`9)V1R^-;CL{mL%y$ohKZJqa>x0!PSc0$HIUK3B!itPFbaH z^oUc?2x2~Sz=TSA-$1Yz1HUJzvk_q-`Jir{fcX7@sNeT}J5 z))d#);J9IvbIjh7ouM~8Ue_B5)8r9%ddRg>?$iL|NwqiSv!oeoc9WXetM= z((O80uSk*Us7OI(`3Ao~3SCn&DoqN*Ez1Jx++Xr$l)mj^@gUQYJJGS)h#)5}x?HG$ z3|5XJZOUmfP5yEV2B=P2-HKq8)_}Ztjv2wMUke}&{836cmF7@5*wWYA5scB?)!!Y< z7&F3{Tlm9(SQtu;3vf&=0ItOCq^@uN*ho1XKKW0(*{qV>&A8NM+;JeNe}T1Jomcka zfmpoyVY!*ld&`>8UE=~6@8|=aGgbJaNQMHm zQl#%H8e}>ryh8XpVM|aA&4a}+UqGw_trE-P+D`s*6*nemV|w)$-MEaW-%y(x$b926zht~|8SD%|I+|D`lfOb|ffY&H85W}RrFhqdV; z9G)R^1JCIFacd<={M+>X@UJiKWMs*1k!pxrFe2fg>#nkTo`R6Kr*ui%NQXPy89$Uj z#x0!`3|daH;N8?#zF;NtBd$U5+;WgYiQX1!!s^rVaouSzN){)6VwC&(tKMg6& zkW8QTple^?SqnTqz@LHC_FRn%=;M5lw?7r_7Nu{JG|}%Ur8-&{Jm`ULjKeh;1S{h~ zmoA8$l9AifqD?%spOsu5C2QkaY{Xnl%fVqmx{LY?GCxM*WSsHHw6hhr@nz1LIo8wR z5+1%;OhbF%Dl}AbRniV+yUqt4x zIBjRja$@i@G(+eKW}Ba|Tc6G<{^~JI~T-{Ya!NW zMi^IBx>Dq`P$Sl;>6UML(SdIf2nW}yV#S)=2v6@ZrJ!(nDt3Qps|++BZs7aesXxAi 
zFA(=Y{FyTHy%?B8AouRYQbHn_*|phgMA|AC=3!!GmF{z+Z)p>k{yEu4+R(}pu~`sz<$9@K&X^op?z4`&7JW@ z#WhSAd{jW5MSBvggl0pm1g9lk5qT01`dJD6XcQ|!NDhHI67VH_tR&bie;ubg0B+hh zRO81|7T-C5pN^wjn4GKl;T*w3J10IQ23T^DY3QMRBta4Y!>j3t#~FS%sFy#SEq?6s zU-*AG%I4xDe>RhClP4Nd&57|Tk4GvEg-xUD*saUQ2G=XNVfK)^)@0}qoXt3*Rl28KqbF(Sr z892j>#8dKb$9og#(BNCLfF{3Ij|RIzJEki?gx~$ALxWCPp5!MvHTj8#j=`Plh+n;D#mIW#fb)s<&Sbw`RVv%m%kJ^5-|rKeza#ovM>Fi z$)Vvk;Y45F(JroNa6rnh(QChzs=+I=$`7p;lp56N%*tP%Reo5O*Py09v+}oPmDk=U zuE9vD>_EvNTb{eJ%4_cf*5D+qVhOYLcfTyJ^*u6ctA)y8y#R-+r%4_dy z)8JcVbh;zE{%3GaU!FZL>zA-yjwg+UW}ch+_x7&(Ocbn<9Yf=v4z0Y_uB(C5*T1z;6xcc@%6ALE%j(N(d?w?%zI^)q z!-CI-iz;=AJ`n6$|51b~upcMZ{7l`tI$L|{J Y53ODeNAA_A{EOca<>zG;$ok6u59JP$EdT%j literal 0 HcmV?d00001 diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so new file mode 100644 index 0000000000000000000000000000000000000000..dbfd3478e7e06650efbe50b2bd6de36f52cd3986 GIT binary patch literal 569736 zcmeF4349b)w*Mqs-sipdpVp@P z_pMXs?n~Xe)ww2n!q_e`F*jzZHuk3T_rH9z@z;)qk6Z5Z0;t>nl2365jaTE`4Z&FUo!uUhXHMB0NuzpQ#;q61*JZ-$*}E}eY6P<8%Kh}uJCri_oLt)|5;pIOgG1{xW#c@3LfurJT!=I z&pG~_8Mip*=%t>FK1R2tgO@ChS=GIpQ4n)h7h`bGSg=1f)-m)$&*qpr?;3oR5xX*W zNZm;_N5pvg+|b-@X+}(!Zds1FE~D!*3hEmM_cSt%yY?S(eN9hCW_8k~#!}-uSu_KChzh3*qw``d&ky*TUx#`Yz8fjHRTP!RK}G zz8>DoiQNF7H^O@*X*}m<_+H2KD)|08c;621JK((<-uU`GeBK4`HSqo;vAgNB9zNI7 z_j~B`&-97s-V5LFgLeb52jFuZydNRm2%nF^`*G46=o8Ovg6~hkdn3G?;k^mo_<9aL zH`Dj$;qwLhE|1E8TY$X;@0a2ID!jMC8(*)(XDht7!TT*@@4)BZ>HBv2d>20dLErJb z9rS%CeD0#}AHwHH@ctCuyNRJ~7{0fY{tQ09V8`${zV>MBOVE4ay-(v`X?j2C@8JDC zynlfAzv2BOybr)zylBko3ZLERJM!-I{RsFxlD_wV&z|t^MY=b99)14K*SCJ`J7#Xl zmf6AX4fj01{`RIhG2hIeQ(v09Hs`$L(aHZRjG6rE8}n}X{Da5##BF|gtZT%Xmk%6W zdBmyTWo2Yt@Z>4)uJ$`ij^CR(^xY5E?o6AW-S*kO@?(}YH7s9#!Q|(|@9bK&=E|c= zPaM_!b=H^@JeOYcd)HN`j;_9aXODlx4;gvmslJKLH}t&p?^Unw^u`X~vS!1QX?bf- zxu?hGn9UC@Jn9c$7u}ir_;ruUJ&qBN{r&#NlKNYG>-O!9+x*=% z>ubvj2993zmpcX|?EWMp|I#-cPWT4EXG) 
zmv+Tpo^bN}XKssIzOHR$;vZJTUO)fCfn(0zU48Z6J{>#lKuYeHkNm55!#y!w2Hkhu zl*Fl1CUu;X_x0&(HZA&c-P

optWg^|cRN^2Nhb?wGjwp`wAK)7$%;9Zb3GoULCU zbwhaQ^$VXoI$`#`4?nZv(W!qM@!%=%eEjU`uMWQMyk5@?d-c7_(7Cb350Bll=!~nD zJp99Vcb0!O_{y_p-S}+#3E$t}_mqkEzxl-tf1G|_)hnMKn3Hw?vD>G&&bZ+7W0R{N z?zt*8;mtwrkI&wE;%7h2xpZ910>`lrFYJHntoOffdUb2B>*s&_{ry|t`SZY2%2s4H zPYYdhan5m`i?X{fJm#v}^|gQNwdI=R3(uG_C$QjywFhQx{pPrY+0)*6ZtU(ypV{!m zsqcLJ!;ROz{`NI(^)F@gdgh(n{Ego=4j5E0<^DJSQul28rtFt*UDxs8zA1^1{PB!4 zW~Ak=c>m0+pa1Hn-j2_HikT5O^~>!sb$=al?8C2YK66n2+b%yY+Wtix%9X znuls{Kkdn9Hr#a4hOMVAyyccrtFC?h>({={==N6E`1qUWZd&_MTJ1Nd_nffe(VfH3 z*)U+x)m_^@e{=o^gTr?X+HuB%o4S5Guy@UuUp=^Q(6UK)jXdJ9NrR@OO?oaj@q!=P z(ms5-b5_XUG>D^yRRPbO~uf=?|!oN-+y0o$Ckd|x5SUB>UwVRvoi*Nv-|yz z?wPgZ!CB28&z>;nqtm@#+|zLKj+@qQUbA)0IXyNnoImB6rN=J3?b_EjopRYbsc(Lr zF>&FBbI%B#_tdWCMH`;Ja^V;0XZ4)$?2&DspZ(o~`;t%3d#+$d%enm_eAn1Z-YG1- z^{1}+i5IM2*!HJc#g!-in0{OB4}V?r^80_-()T{k8<{g=;_sMy!AkPTIX|RQ2s|{^R`-^?lj~+*ew8;+>DC zjeUGquXDFu5cANODS2xu@A~t4+2M5l_k=C;)&Oh1JKmZvyU?2NKh>JwJH?ux zeY!QDHO`u^pJB~s*~D}AKx_N?)2w-FqBZ{r^2{pVhNM{Y@qMlN72~aWZ-+JS2@~pm z7UgvUj44*}xhv0__rBbk|6#JV|5Y#-d8_!JVdMXG8(v^jUdKTG zTls%%l65?%Lp-hQ@43X@qXS7Xycr?@6PrXgL_9a>Sf6yl1#^hN0cbsI+H{0a%yEggo zb{}g$Z8rV?%V$~JzhYBQ`oOifRsQdW_*><3o!8p_`!lThl{Wq11Ls@Y-)A$9Y_Vyl zi)`9y!v)sDKnibFF!nO+R^@O?z2tQ@)FB#-lkl z{wtvUTBY~oTx&jTv^B4{>6d!i*!QrRZ}f$6&@w*I|6B3dd!6seVHuZDR zY1V$SPPFFzp#Qgu=Ng;#GTo-1e9&gz^`6c6cmdQCt9I(L>6doe_jic$jrQZ-f5dDj&YIspmJ_^s8HXTlZK1WNY3r)tYy)=_gm)lnbuMS=Hy$Z1U{_n|$`$q-&?m zxZH4=b-3g5t@-md^Xd|te&YihKQnFYTWr#Gp3OM^n9cmK#^(Ai&Zb@kZ0gmIW32Px zK$G)x#XPv1G-6vhlOj zCZ0#w^glP)^kc`_jK2^~!vSyob+Ju9_O?xV?XsCieUN3H-oZBQdX`N;dCCxLKe<6` z{yUp-^Pe{TfYZj$9GiS7vgrqw+wjE>>v&GKsm}qM{(ON=KQqmyUB7HI?q6Y3e=f4g z|A{vB|1q0+X-pUE^rk=~wW|Nmz&ydK{XPl#Z^a+7X%{s%<+a8p|2@zEtm1QwO+9%K z>c3Td&W7@`;`iIMyVVKS;jXjk#}?#R+dpnoPxji_A89lG{@bP;Pq3L!UNhA?KHu3~ z&z^5H&m3zre{QnzbBRqmeaj|Y-E7MDzV6oXKNBj0RXGl@@qf}3Yd<+Q{cw3-Yx@H* zj$4Jh-DX^Pc%-$TZ*1cEh)uZTZ0hH$rPlt>I?|f&vT2XqY{va>dRhA!2$^8j9(&ld zuNgM|{9kP9|8+LkBLSQK^UF!r@p;0gUZvUa56`sr6K@ml=Qic?M25AWr+QlRVi;$w 
z>eZ1p_39&Nk5=~Yr&#+>u&IYHz&voU(Z>kRgN_AWkWBHKkqmnWi~&Z%Y^dE9M9_=0LWPDzR@eE&=k0sti#%C1yndXta?S9Fn zH^bP{U-HI8>A>VYARzSLm@2us$Kn#mXXG8LWw=Ahe;DRJ$XkPwLo*bwMR5I#ydYEh z=}Gn@G9-7rDed2-fD4b4ygo_Vn|{7MR&wKP$)}M2OrPYf`(?TyTg2-)N>>5dn|_X= z{Eyf2f9PPD&&}gyL?Afv3W3DQBwq=yLh;9BFSnE&Yt6e5-+R3gc7bmiA5L=Us|_O|JAmfVk%r zS>LJ)qU+n@L6Xh5_+&o!l))HVr%PTuMMh*8#pjkQB=_`^;hObr2DCf$ z6W1W)f46jJoCoz2>;Hb*)dkf|yc*_6`?}uJ4^=zkB(iUvCFAKN`{4k2)I#;Ck1DQ~>p6tF@mapec$w7)&06Kq+e~{X> z8ULj-q@PdoqVwk}oAxyf`WXzj@pm#JFbx*3G-_Y1cSw%=o$&P!81Fm)vX@Zw;4RumAV^C3k!;<8wWQ8)MThJ|*6Gx{T-LRDb$XKk2{$4qhKq ze0Dozx(sT+XN#k7?MU&-xK_sVPU3@L{SeDB0~a*l1>J&p-9B5!v-JlV&-L;&WAH?Z zzgO}SiqBHYHzyg6AwOwU|J&}74$hXR8ONO}m?7AXGv~+B;!xpfDJG8S8;99ei->VJzuu#xVI#KkNAZ&E@WuqLT}2? z`a0>qm;5lknk>t)V~I?cSuXDsP`S`}Wah&@@}sxw`-`Q$Uf=#=ldiL1{w(GXKAAst z6n{76XC3u>Fx?Wb(n9G!?mL;@O5&fxr>JjBWCej9M7+|6NM1+%Y7gSCl}p~{km>ak zUk9I}9%}XTPeUc|AV21KmkJR>`)1sL2d@HZM_r&}N3QoLTZ<&u`;AkJ)&q+pulX8oU$O7YS9hkJvvegd73fxJn2X8f9^U{^0pe8E*e&i<7qyV zpv`AuGo*do8`8cH#q(|$M{u6j(O1UDOji#W&_%v&lks_z;<*Si1bH2`7qdTn2NHr@ z?*~@X_}f9_Z#Jdt5y}S#^`FJ$|5h50+Gsp_koebSvV4QL$#|Om$%8W~o?81&p?vUY z`7n^$my_C;^kx`;uxY(;kJDs^Yb0z7Y>zNZ`TVcA2Oy(KRd|2 zC*@ll^`9y~M^Juh?dT#nUF5UY9^1~ByyhhtpOfW>@h;~ z_*l$;1{iMby6nT#rT=ES9-K_|r*yFFZ`){GOC$f+LHCRC)Z5FIG=HmEBO^%D7h@Fp zY0<{Vi;MCz}C)L+4{C|<49Kg11`C0j|nD-6syud7)uxoMx)U&b>?*Fk1~eg=(? 
zZ98SgnEm9bmyrE-$s5W42I#nif0|!Gwu{&BB5AL;mod=LME(qve$49voIWA1yFq5~ zeDae`kXmgX(#@uQy+r+g+c`4-&2fL=Op50fP+{QZ5dXqneQGZSG_if3csh*> zjenByNuYXILG8U>YwztDGCz%XrM-C_eS#>j0Y)9ww>a{DH;rGOT4@j6j(Gi!t~VQ- zBscTt{6y(rZ@=OG2=gv+I#b@GM;|&vnf`F+qhO{*h1pJzg%*I>bWc(!?=_BAALUj9z+}C z@1zSIGhKb=%5kKQ+TBw!dBy~4FLm$AdOm>odo-RJcgpxQLj3Uc6xGkVB{KfUQu&@R zLxy|HR0+)M>%Sf)^QT^`=id*N`I({RCob=Z{9hscbSMAc=Sv>c+I2gXV;o&@*Na%f zK7S}6oM+b2b-LN_Et@6%IOzK5VX|)?B6%~-_aOP=^>0X*Xuqpu&JQA9O5<`Xjmyw{ z#H*K0Kd|Lung0e|kLQs6P1G)0w01FVFojF~6%5DX)j{>4QS0}Pv1yNekCW-Dq4g-3 zc8b@rHtlpOmE+U7(d9T15{~(hLGf{tpI#oB&k26nKbZCZ2I^0o)SsC7bDmB6{ew+= z%(iJSr%`)veL~hlb6%H6;nvgo0Js&e$yEP6H2*g_j&E4s95>4JJ}ccB<7u9$uTMT; z)4rauX)nJ!L;A0$`E3=&C$50Xg|4&A{^8 zME2{cJ~(E`2HlU+^+(|!)`#i(%Pg;xhsyZ0Y4tyw#_@U@$1}+P8fagbZ><&5|9Qk) zsJ_)FOWue0?KECzXxFn}4VLM3(0anjWPcXKU#xFXJQIj-bjWaxM(KZ|?6!=}7fBwc z&4)LhOX+$>X21>dG~>_E5HSAwx^6zrH)?3U0m%`s4@5s7XLQgy0GNr_Sv(4 z5U&@(SKi8nyBYLA}EGctSEh{V3nEZR+#4G!DjV`Oux}Nrq24F#FqakWr%k zjFfyB`M++YKVOI?YDfpUmgdB_WnY>mo% z3ugMgzDm;w(^5FcTUb2DJIhy63a*PvJ@d*egUE75+>XpC59KWg`ja4%5$a0LE%cT8 zD)30>qv>Ny13rth(nbdYrJc=PPDoy-U{XSXio)VFUsWhDyTVtL~ zkiTfGue5X~q)f(bsyu7tgp!#RzKR9ukXAZ8c~Y*|uc!=Q>4g?}=Q$1UwDS2SrK{Rh}+qp1-^@AhN>DByVZS%tCKv zN%`zjzc*A|;rA8!i!i+0kjq(F?5pq>VcF-13c(#@XJ;kKlpKboh^VYo$(XB&$+;rL zP@sHrMXm@1u?GUmb$XR&tb~VU&-}a6r0NtjD% zUTAJ1sHZOY779SM^;J}QT~2Qalzh$=+1c|vg;(Txv(c_(Hh7eo`?G_0Lhu#-V5zUr zpIuhw8DA(G#LrJhI{5JggZ}cOY;uwtN_0Zhiz@>2y`}#0*`Z>uzoH^g0f#ZJSO}>( z7x_x7{3DC}kWGOFs@z@Pyeg>QGOZKJT-g^+O7w+NWyWXcy1ejzR7E+=6R13~ETThj zT7>|C$XLq8nVp-2;T}5lU*e2n7TtKwmY`9zh5Uj%#e9rDIL4Jd395SXcxa~Z?PpiV zY^ZsGAodQvQfi#8tdh#0FH~4O-oGFl3-^j&R@JbE6hcQ3@@G#iD>KUcWre{7;0j73 zUDcDoY-lsQny@6K@D??*YLsA?;?m-PR1b%GPTkl!t%l99X{|0d`$D>yv%D^qiO^=2#F*e+Be-nU$6P ziqKdXbj#h3Be$8Ib|p_@i3*Vd_E4|c{yXFMoXCZXsLE3mXy!(S9mK$uaM|0 z^Ud*lXVI*XC}w|JW<`Z>!Bqb&C>&^&uFQ(rFzqW3WkLJR&dyC1`J%P%6ftOHj5^OZ zv50NjDz7$)NiZRSN}m^q%qz2T=D@OnW)3cANhRh6HVbhyy30?OBSt_@!r))%#_dN9 zkxelEl>{n9Uol2bL~$e_M{vaIbUZOHkN1ziDBwc`0*#;q>XY@b1pNvYGIW%39D+86 
z7f)cE>h)s!A?0u(gw@+J1yjK?DUw^5PkG)vs5-f+Ue9=#`G}UoGKY5r)JzYjG5xYG zOD0W?sE0ZFh$(4hWy$Pv971Gm%?%)>j3~>_o|+rT6V+JsPc-AUX`jD1?l2QhHfK12 zs*pD@%Uj_qpABuUECdbZN-cRmukx6ahNw<26ri3|Hj13FQ+byFx#1L3Xb4#AEySUs zQaLy~BbT_MV+Gq%$OJwW7u_Qc(2-dqUTGE0gesF$ls!{4jhwN$p_CBL=yNP4^3b)& zE?IWi+B6${!<-w&@l>xC3{?4|nKN@oVmOBv3>3HUmh6SvK>vqQeAvn5%$!h`Ec!Hc zsI(MPA?;Y|WLVJ9R5y{F1MQO9TPJRo4(3_8F9b^-X}+12a8+0(vPR^k`(?5O@b*A z&Xs&KOXj(t_x2W+`YJ2MkPFRd7VJO9xpI^hUiMX%SQ^R*xsuKK-FO&w{iQ4_mWOh! z1~iL6)ewdgf=KLg4^sQ1T|@P7dTnce?k}tTx&J-2|B%7_hP7W@K)7;Y8snQSm*ngg z??kU8X2q^_UwL^T1Z_YpD^|#Pf@srbxnVyM=|`X~{oEd3BoVqFtWa_mD1uI}Tu@f# zhlQ9zHiM9na+Oq0D~A~2oVwFNn1_ftf29}K-#Q&mE%sH8t?>IV@>P`JLMW|QvLq+N z!VawA;F=Nm?|gh@)bU(s`!Wz5QG6{ z1yptIB^z1vA&5w)Mbt>mQ0ttBGUJzQqB*3YRPLKUM^#3H7wjUtNLQUc74CqrL} z`D=ODr4F;^kQ7zjv-Oq6pEvI@M_ibDsBYr$ms*k_k!D*v#JoGD5S9ZfU||fFYIESy z0N&7x`J=lSSGruaHdBfzHS6CcOo#-zMR4f=9s3?oRqaoHU;N}9XdxiX7 ze|4c>EQnB(gE@MUcfQ{Tt1F=jng!+MU?%bdU@R8eG$>q)m1FYetST>=TP1E6(W<0Y zAF+6n%|!v(7w9?V`V~%iKJsyox>=Zuh>YZi#2AD$8CUFGiJk?Jzp#jvTN((Nv%*d% znFnYxFYYj6}54 zuJ~uK?WMp_A!;Novxx{&(M5Vsvr7KT0T{DZ#6C3yy*e((=LK@by&$o&f(=cq(NhIa zs(>}hk}_yVxX=N8+b_P@glUs2+8b|u(MM>ZaVD&umV2QY7Zww#@Kfs(EfH?&L>-+4 zlb)ysZ`OZjT6^z&%(s&iW$}j%}ZT=c`U9c;24IDr(uPl z3XcWME~LpK>M|V?Cmh%u6W&JBBwQvhGHR5<`gkdOLK$=$vZ@|*z_AQZcsg{5UGt<5=rZBFZWOU|PUdVS!_ zGw&U;YebPIvgc_P!FSFA#M-)o%j=z2X1%Z=!xNKO8BWxaf!Pja8@0IRH`^#2Iz$`Q z789b{Xy;>{+o-5-zkX~HQ>kC=M)dAM=vlq&##tr~LG-7>QhRY^nY;r42wSY=^lKJFr@Wm=j<9Xa@!*(v_AU z{HoJ&e}YV>Xb|QeC9&T~Ir9?~$I8{6LU?!R3g}a)m-c)&2%s`g77?xnB zvU4T!>tH$EN-czSd4FOWMsp%;)=Eqjvv+S*d1X})?_k2RJa(S2_UOyT)sjGkbmFOi zYo$VQS07SCHwIkk>IOh$F@$Le2wbAEIg<*>9M2k%VD~l`xliU#O zF0WR7@2q3pKBYOSP@By>hRR;GtyGDm>h;Tx|)x= z7YLJLbuUoWuyk3&peukC5brFp1S0cI4D9^=c%s)_qrs~bzKWd)yY`}LUMki+uvazD z1gho%fSO0xs^)d-lG2NOA)nkYHxK#>C+wjqs|tzfCLYP2mp!d?UN&T>%dt#!3b>*Hi+S^1INrgh*E~U1yOCKCBy76GnV85Z zaV>`@BqqTk670|s`+>!om2!PEdn#@Y(5I1;^cLny!uEu14TGSL3ZsCc%CY^O&}!AS(1j=ftmXDU)p)3U&$RR!95wJ}WD#*yE>Sme7?G 
zED=R6i&%3M1&KY^!ImYlLE}ySvye%PHz)Gxkxfl;!3@5`5@WRQe*JM6XzA^BGr)xd7s+@Y_!MF#E9>QZ4(! zLrgD!K@chxaLdN+t1K2BNlsaAC^_gWsmK`*-+qHsVb3m-1-s2)bcpPDhRq3ZH>WV< zf=xZZ;U*j`O~6Hb8K`8q=~fKiCgqOy!q(`Cfg;*VfgEQ`cmVD|!yYBEK}>A4DDvV_ z^N^6y+xSopK#jrXmMsceH=FJMrW9$K%)RK9)%Rfcz>5qgZ#wdPBO&6k1P@R$4rOGMA z&l$R4cLMBun+H!gm@|^-AXsEw>CsQuh-HlENXw-S5nI?9sBV0jSLK*4YcG@%djpDx ze$AHXs;rtl8}1r=qh_LVceR|v;hOeL+{O$m@N_Za!u^6?oS+rM<(t2>(hECxXg99y zNn(>>r<1%%w349}wJXuw`;Y@$fy@jzxRG@|bS35b=J|_EcfwT=W^C}>kQXMhur>=T z*mI0%R8=k@^fr2s$#A<8*IzQTCiv(1OJToy!azllIq`C(6joK>(q5jZZm^rssu)FN((p>YNL)`@1S?Ej zNw9DTH>|vJxzOGh+z2~#!eA6l@{X3zKFGThFow>A`#MQ*U#Cdk*MWK`Ly6?lOlg%K ztb!SicsB4iEb{2iG!iJkorNPkUOQM;515PNv0rdUC=%Ee5CS{|XX78%!Vc~I3K0u+ zx9Ko_|2o%V=yE0I`eymf{Q#mO+ve4;Ot?qZX@&~OhYgFCfveNHHIgPtD6h1G2A zpP=f%BKP4;62VMO62&RWc9X7KbM!3= zHEYv5X`2%}hsdtY&3+}i57Io#odZd@XR`#lb*~?Xi|E_!*n9u%G(`u>PXF1MqhUk2vVMPo+ zbg74f5AEaonFVc`2T}VU4y%uW4kH;Ns&YfC3(+%F$d>4Opbv`&JzG_7%c%dC#8%8i zp#G{+uEww8|J>o);i`XtzuW+rlGRT<8CD_}iA#O;u$CMM#k<*+Bcs0422 z;5<-l=C;nHB(IoM!wr^TRj3kI6(c)5wQdU6?C`iNsR}mN;7@13GaCL7?c+9Y$;nZV zR<&T|DtZ+JH4KOFk)3;?cT#RTdnAvqMar&2gch|gSe2rP3BO|pQ*ZIVa11Brd13QC z^eh-_%yPXUbjtDMO1NJ0YPZ}WwTa#=xxLxztE$eyrL$~!;Fccnr^H9@l+1%m5_zLY z^kTA?8k2|*gPKMYvdW=pu}c6r1@Z(Ny;(P@cN^pS8RvDX7)d$hP^Nfm0VnvbbXc5* zhe+V(L+}bp&cH>TfNMegMF#P+2Dp_~JU#@Q9?c&)z-kBi0DG#K4eq~EZ6)gNCe%Kv zscZ!lt~M=ZB(Ul|5dwmGB-bwC_ZlMW88Yj{>L?a=WNWrut$@oYyup&3D}GsJ0`A$x zUjX?flS4?Kx!=!ftprQO-2P*C9D2QC(*kYpg^Q068f(yJf zP-9O_k%Ml9){g9!4_%O0T62t!W^Smo5;y;AKY;_c+&evXT$l$fH3fDMM1CbkE)u(v zVQCEZW0i;G=8#nSF_B-`hhc2Q4GLHjgaR~|`mk$C#Jy_fL%LA4DK7dEzOOp!6m47@ z3BOl?tJOF<$sLN>;whh~ZhB;9`2sYGJpU7!TJWL1b4M99ZpmH|2S;-O@z=WPd_DHt zT<$>XJNIekv8%VaRbj`<%Yh88Uhe=ZUurpN`FNFCctu) zBi-!Nvc%r0arlENmQKVqMKX+1{-`Zo`rx9^HgR~tjz}BeTuu*+js7C|U5ZM};5uI@ zU|;LSscVrwcNKja%_MP~sO}A-l>s@LYr+x7x##(W38_ zr*(?CxL_&487JJ+p*3Y}5$YluetpCIxwpuAH@0gz{ep27Ht*6?*)YKl$}vfME}M2+ z+fBo0nkHw6!m&Id>gJf>a(lHE*RmyY(J1m$x=zMTz%o0q!qcxTILiZpV3v4nGutEb 
z3J=PM;c=-2pM;00EE|Bd3q`o25!Exo9M5X6RK%+s*ko0TjA<@;KO|yCvmf~oerId! z-I;MR%;@M^p|k|HBpoWosp_Fgu}+l;*?|w=dFgj*VDSbV;;$K1$X_wSc3BE;a|4hQ zE~ji^qJK1pnIsohYO|*)3PInA*DaM|w<9W<_2|fzh_e>?%pR;7h`4t0iHV~!N|GDk zX8X%w(^jE;awoFu4GKW~UQ>F+qxhkiWcG_;xo~|WAm0NQU2OP+Pn|bcMq-iZlCgj+ zDS=-Rg!_1Qv4O$hmqjSLOp|d5BdXv<%~Q|XkabdFu#ZL zf=2%wSae3fkAViOB0q8oCy08keqq85OJAk1qQ$#bSJZkau5-?mKXAnAE$a?tJ56R| z8?M?!E~g+u)RP|4QfvUmpS=rI;4cTb1C=583T;w4$Eygq8y!$nU$|Ao+pFFKKxOx` z#s^RDnK5A$^GkHb5eD;gs0}5M3je{jZo&ku(<3HX&ZJAgD1^o2V(lwtjy(j47^UbGlL_x44Tz7vrTYoY?cxQ6E;EsH4f*)wq{r zwqN?s?rf1L&y_#97a4X&UJRRaVJQJp4tov6qG5RmehUbesa&pUur-H{L%sk>Gq>@= z3YB>P*HVA0OcP5Jn6PBnB7(mS1&+-aihhmt6)*~=kHhf>FT%x&6Hdg>l#srv0sK3%C;_GS&Kgk?qq<}oWvBolLd>E*v!Ndq_T0jWS=6sH=0z4GhiKd z9uyaJAFk*#Imc)4zz5ND(i)em6}Q)DjsrIl~osbs56sck6@>M^H3+HLbS)o zjY(7xsujG7tDcS>gP_NvS+HBHJREyQPgnb zN8Jy~gkN5OmnV0E7w$CTZ#&Kl%)$L0us7EWi>hLR3p2Ug@$8u=*JxZgDlfv@eenA} z&{4{1z;D`*!|41!J;TIAyUbS(KjQ|g0X)Yb|0cslA8xvrhv7+I?4DsxCTa;hBK%)m z@bFX!Ts*3WKj7wfbit>~f|pg3e#J5BDwV1TFtKFDMCYoJZ1P1oYW%cydiXllOn3 zuwl^!9vFvqgik2sc*|hyNyDE$5DRj63c%`1zCWNXDm9Pbr?v#lJdav#Sc$&Q%8q-|=F@`YBEO zQ4o(<4}u371I^A+^Z_N+usu-RRiR~g7oXlk7vUB6jo}Z=6v6{=ID5_gA2fM|--Y|1 zGn1xY4A(Oda_oQFe7^{{IYU>W=FO?FbW~PS2u}s!AB+B1hm6ORMd(IG{)625yL=y@}BY z>y!(LXi95ehct~WlSP{zEDOyN5_{?+CFH?B!BXPGTQ4GC$3W4A{P;`2(60q5;MsOP zQC1FS`V9D+c>#H=W<+HG6)c7p7UQj8(ol@_tuj(QZ20x8N>naXg~W@7 zmzI=QRS&ODOC6q?j4(0cm4!5JLeA(h-oz0}#<($Kyh$URhDlt>>T6P>`Wk_WBT|ep z`T3cnbG)t*Nh8vvNq#;!Ngm-!j-^K_my27Dj6N29Vxp6qrZW+BR-%hbW60#G zIpcCBoei0D4t>Wz-x&k{brJu@7+vYJ8+>**jsSk7_=Yz4AM(!sb%pb~8nJLjjL`$$ z=&dJxLnhCY=N?62M0@IG^cHs9H^v36x~kaW(LSB*kec!CD~@9LGTb<_jt)(xwwJQv zq3Yk!wjaE^P%NZh{1<)Vrx`XX9uE4KT84KtX!Acw#N2P1x^MOP7;>G2ry z)s4c4&Lz_y#@Y-^=8YKx{KT@t|4a*h>_6ZClLMFo&%@ug#=nQ_fcG&aCB5-40Pg{Y zFFgJbJMKAc0sKYwKE}7|csJuIIu3uz+qiYr?eK@Z`xys_Jt}mZ(X&E^ajftkYxE~w zbwWuG18V&#pz#%B^fK1axu~nRJp$n!W1LC$#ryAozx3PJNHfnD$9o&&NH4zz{}OO7 zBagIC*dJxgBt2i)_cZ2`_KV{^jH^ipg+9_)NqT{>Kf<_+^qY^&fxp1q&A6ZRTw&kU zc${=-*e&pv?z1Vsf+X^HPYrK*EK75D93!ayM 
z_<6s^>xdhdMdyEzc&x@7iN|TYmAFIWEfk*wjXTNSsquQ^X&TQUo}uw(;%<$1yd=}> z(Rg!({5#wE8gHwVyjbG}vt_;nHD33i{DbJ#8t-^S@*0iDJs^3l#@il}yiVhB6wlQf zHy)Mt^%}2ll)ORXZI4UdsBs7JCXE+7CheOwUVp#jEgG+RLh@FPJ1HNwYrK)--==XV z1rJMZTpnGYn<@Webx!Gu(|G*`=_g*}9c1s&xIz9CG=BRk z=mz2C)c88$X&Qfqc!tJbCSIWNnsn*EPUFqQn>1cGKTr2&w zX}o~^?AQ2q@?*@<@{Rn&YTQG992z&sZ-T~K$&XXx8RW;U@dWbY(RefY$=A4(`~)@b zAwSg`ZzMl88h4oK@tomdo*4^yjbTGrJrhzw-K+?c=IG_U$60u$&xo| z-0(==qVWRaZ5nq>k@g)LuOS{=5SsnR}P;|0VsG;ZWd`(lmP6R*>FMxL~9(s zHjUR$llJ>Ho^g@nvA*c|*ARDT+;OqAH)clL$D8@0@%ppnxE!bPx@5@{H0~rnX&P_5 zRNB|-JW298jW-i-(zu)aH)}jd_AMH(p?J1wJV<`R8ZRI}9U2dkpV-3ad|OR^;xz6d zKk*tbAU{rxH=FsU@eJ~lp>YrS$=7(8{1j;1Nq&koo8dAC;4g6 zcpCX>)OZ~EY0|iZ{IqJkfc$LNxIuo}G#*EO_G`SB{1`>i<=BxZ>tU?M4f5mAcq92q z(0Cj9acaDS;_24-cJkxVcq{qI*LWNG32NM+c3!RVX7W>`@mBKFs&Qk9>>t`RUO>D< z<1M7)>AJzZ9&9Chx5n#BrT+qr8)cGLYrKGXtYD8h5xPFVJ{C#iLl` zaUC+;pvLR|C3&63n<<`+8gD&ShTEj^davXy8aEOpkEQD~v!29JJmWN8aJ#gR*Ldsi zBrn!@<8oP!H5#wKLGslauO?lu@%k2d9nqli4&sd(H(rwV%^I&G`xcEi65p=znw2u1 z9U5;Xy+;~5lglg2&7n>AiQyhYtuOq%%*8aL-Lt2J)+Uv66W!n2d$jrYayRY&VWI>1D28B5H%Tncg6?=iFSU)a}i6N1dDNh&ngd1$Az& z^Xc4Nx6^qe%YPov78`D^-{^kK^%$L->nJ)m*F|)0u5;+zT({7-Ne>wG_5@9NyVj?sAntLMBN&Fe_r-n?$pd83V= z78_o`+7S=eyzbHco7WjSH?I$L?qThb`#0xMoSUz)}X2V-q{JDKKi_ga_o~s#mGTv;%!!|sQ?gO9;?kCNL z7ufK8mJi%djSX+G;jK1&zYTZLdYB%6w+#>4@YOcF*@lO0cpU2ocs|s#{67!!+8ke6 zYet0y$x@%;T<+So|PAm zr@2mv^@{UG)=oKZVCA)wwZ~S*X^I9O4P!s!bD6z^mDfVX-HcZ;9%P(uwF>{M8TT;z zW*Z*1;c=|r<9^a?c!3SCwc+t}-vpz`{kPii{Wjb|>-M@Iw+#>4@YOcF*@lO0cpMw| zdAibUc!3SCwc!pnUUUC$8y-vROM3n|>AG9zMnGPF;lJGfYF3V!coMxD7~jEo6XW+X z-pu%ujJGm=JmYPQuYoYl{xi(@-##_xu!e=b&uTEw58I1p)@gVCTICrxCq$jh_ zXM7OjHH?45cmv~Z###^9AE+j89>_nawvi-)_U*%+DI;$HVwzjOR0c6XTxp zNd4iwfZ5;8?3^W~@_WhZCklAzI%IpU) z`|XVXiSahZkCL~PM5>HRp=c`$N+rjMX8Sle*1LI#Y z-pKfSj5jfU1WRu-IBYIfuQ2Q9Ij^O4Az{kw zoy-qyu@$~Nj3+DahEZ(84Z7clt}kZx)iykq@h&XiYHWBM<1x&>)`mM6Ka-_*wGB^T zd>gZ`x8Y94`!V|l8}4R&FSBp9;oBKcVD^oyU*f!t*(Wo5C#w&fhnamGv+rR123C&y 
z882e?PBuUJfblfOn;Fkw{C39k8DGnIknv|ZXY(JnBsKft(y@jo%1z<535PR2(tp2oPB@eIa4W!%m9y^MPpf12@p##b^P z&(?oRS-H3we}nOS#-}n~!1x%(s~LZW@p{Hxj5jd;KI2V{KgW18<0BYvVf<^xTN%HY z@$HP?!+0Cx+Zo@__)Cn(vHCWYaR=i+GM>SB57r+xF`mNgTNtOUx}r40jQ_;!3s}Fw z`F>{q0JD!}*B!lCe9{=doY~h5MxVAH{eA<7cq=I2n&;JdN>LjAt%e3Ud;GAj0YLN zgz;*|XEI*H_-w{&8Q;Ko9pk+jU(I+SF#bH_`HcUW@nXhLW<1FFw~SXazJl=@#y@Ag zmhtCV{OcHhmGRY#Z)Lom@mHAt2F4#`ypi#4j5jg<59Ys_@d9Sw!uTi5zLoL4jBjWB zQRb(O@t>G|2jg^$T^tRvcC?w<#|P#5z>&b1d&9vv8wQO8#^nx1>CDNv+&Lq88sjn~ z$uk(ou+5j7@l+Ne598*ZOFSo^ad=Qwy$TrbsVH3cF}|Pq4>Ar9w5nG%fWxS7~us?zEzKpMCoX;WZ8SlsJ8yG*D@kYjXvvf5vek`+ZX1qV+EsP(>cq`+d zG5^~cuV%cB@o|iY8INcFI~YHn@%@aSz_>9tQg2UUJeKi+jK?v4GUM@#?_u$DFy6#? z0^<(m$I1A8%s!3r!OT8`@ga=686V2Hhw;;x|9r;(#drbZr!hapjGxYUknwAnpK8X3 zG5Z?EH!=HK#?NH-b&Q|I_-e+_X1tzpC*uu_yBKd|JdyDx#y?{5Y-T)#*|#vB%6Kc| zBN^Y$_>rumY-2o)*@qcFhw%=^&t-f+FnZ`~v1Djqx$eK7;Wr#@&o(Gwxx0EaUl%k7K-m@o6kR#f*Q?c#!cN=BJwR z(agSv@e7%KE#u=EuVZ`(^Rt@q3CzBp@t&++H84(F`ovj{j89~Kni!wNcr)XZ8E;|S z!+0y>QyAaQ_*BN*7|&%q%y=H-9gI(7d_UtCF>X{w>OWtnjb%Ka*~c+{3FGmMU&_kG z!8x-}VEi)1os3`3cpBr=8P8yRGYi+v_zY(6VZ4Cxe8yKXKLw2Yn0+zhGZ_yuUdVVg z<3)_uFz#o(mhoAP*D*eu@zsnMGhWYl3F8fncV+2qWPA>@Z(@8nvu|enT*g}%znSq? 
z#!H$1?TnW(-o|*4@i60a8Sh{mex5|V_A^dDSt5^xBK1F{tl>T~<5i5uF+Pv+c*f^5 z?qGZY;|Yvk#kiC4g^Z^$zKHP*#=Eifx*5NQ*?SnD$I2z2@y8i2V7!L;DQ5g6W*=mH zF|)5`dtcX-_H0d#@iUbh4C=sEi67AjQ@_=?`Qm0#*L~-{lAUzSjKN>JdW|zjK?!R ziN(jk_}h#pFy6|zlkq#5|1`#b&v*vo-Ff|I`~~LU!}t)!^BI4d@dC#0V&N7u{sglR zGX7V_s~P_z^Han4-Hg{VzLxPi#?zVq)r|LI`B2aJI%eO%_&v;jBjbN&yovF@Fy74g zeT=s-p25OxWqdK?+Zn%(@ixXAnEx>2_cPwX_=Ak^XZ#_?jd_v!zmD-(#vf)pj`2qr zk7vA*aR=k4uyiFb{wT9|GJZC*Ph^e`UOm@uwJH&G_FKuV?%&7M}*jH!}N1#-CxliScI{ zZ)Uui@fOB6G2Y7fbBu3id{}V%!T5H@KVZC#@tur^8Q;Zt2jd?yzMt`PSUipDNc~^Hcr4>D zG9JhHN6ddb;~z8bV0<^@35`X=jQ@-A)r>#P{MR%76|-+({AOWuqlLPlFv9EoT(p`mKuXHz|*DBpz=v7MNRBd1TVx@7=+Sfi;=~$tCO7{?Y ziqg1&VPAWu(nksHQo5JWgOu(qbU&s02pywzU!nK@WTrPx=v_+p6MCD{M+?14>0^Xm zuk^7(uT{Fg(5sX_PUyu-4-k5;((yw3ls;bQDN3Ipbf(fL3hh$*B%uc>Jy7U=N}nur zjMAqFz4u2m{SKjbDLqK&ZAuRodXv(p3cX(GAwsWJdZ^H=ls--9#Y&$p^jxJAg!U;t zOz0^}pCNRn(q{_oQu-{R2Pr*V=zdC%5IRQbvxVOKZ!`T)p?4|m5_+4`i9&BuI!WmD zN+%1wR_PR>S1Fw;^kSt)3O!fpG@*S;rwctr>2rk6RQg<@T}q!P^dO~23EfZW^M#I4 z`U0W%{$Qp*L+D*fX9~Sd>Cr-OQhJQg>y^$DdacsgLa$PKtk8><9w+o%rQJgNl+F=) ziqaPfovHMApFfQhKG({gl2*=oqDM7JBbDX8P-d-lg;^p|>f0i_n{t{+-b4mA+Nz zwMyS6^eUxq7kaVMcL+UK>D5B}l)h8wDN6rd=uD;mAhb*AyM!L3^ctc2Dg8&GW0byI z=)GT?>8}@hm(pv6-lp`Qgx;j|JwmTn`p-hIRr+3`S1J7$p%*KCpU`uaZV=k1^!-9l zQThR)GnIZ&XqVCt2|Y;Zbwc-3`eC7Clzv3$yZxni$((8rZru3siZ&La(q1P+@ zxX^2r-XQcUrJoRbvC>ZpJy+=_p?ymKRp=>7KP7ag(ti`$rS#K64^n!g(EXHtM(7x& zpA~xVzs&SE3%yI}O+s%|`Z=LDDZN?f^-4c4^jf922)#<_7ldA{^ov5zRk}rJpVBW0 zJw@r4h0av^6`@^9zbf<~rMC**PwCf$j#2t`q4(}H)88ueE~U2#y-n#igx;j|n?kQw z`YoZ?D*d+5tCW66=*3F^UFf+=Zx`C9^t(b&QTjchGnM{_&@QFl7kZG=JB041^an!6 zD7{nYy?f2{w+X#V>0Lr^Q~E=pH!1y*(Cd}{Sm?D%e{+H0} zmHtZTwMu_2^eUyl5qh!G-wHif>HR|cl>ScWDN27Ybf(fj2<=k(-$D;k`bVMrDgBes zF-jj0dheHJ`tivJxc*lf7kuIRUuj(Eh3kK%apT#(_Vr5R10nm`*D8%Sl=ii+Qu+v? 
z7b|_F&~ufJ722mX-hkfMK1J!CLT4&{l+Z4v@kZ*t_CZSHgIoLB`zhT==oqE@3cdHA zX8Q5LX-L1)_#il>U+JTT-lR0%V1x84eXP)HmF_R}Dy8uODp>zedVtV#m5vwMr}Xhc zPf_{=p)-{}QD~RaxRDFeuQWcu1nE~AA4J;M9-}mFXxP`jcaNEVhtRu}9whWOr3VYW zN$FFCUT4BQ=uj$^J?yl({wfy*6(|a`iiKcgG`fW{b)$|rkZ`AY#P3z^Qm**PI{x(go z)buh@wE4>|uIV2y)XHDedo=xtrgv!iZB1|0^cGET)bs{TuhaCs znqH&n+cdpW)5|oyNYhoCF4MGM)6+FQP16%KovrCnnoiO5a7_=@^gvDb*K}`9ch~ff zIa>K^dXJ_*(ew^Yzpd%5n%<)6jhfz|>2;dESJP`WeVe9NYI>Qb7iqdm(`B0WYkIn- zr)hekrn5CYO4BKt9{+jNs>F%2T(XExgruS(26HV{X^xK-=s_8A7-l*vf znqH^rdo{gA)3<4QrKXo@dXc89G+m}?zow^adYYyuYC2ofqcokO>EW6ls_B86?yu?I zn(nUYAIE9sujxIS{zTI|H2t=ww`zKerZ;MOgQnMM`d&@1(e!PaUa9G2nqH*oDovMZ z+OO&9nx3ZViJH#V^e9cIXnMG&hiZDDru%EUx2C&m`p28+aH zqUnvA-k|Aqn!Z=lYczeErdMiunWh(Mx=Pb!n)YjYx~8XTdZMPYH9bnxDViRx>7kk) zsOkQi?yc$Wn*K3cD}PPz(ex*p-l6HYHN92STQt2<(;GCsPSf{ldX1)U)AUMBFVplQ zO;>5UOw)c%PuKJ`O;6Nxwx&mEIz`jNH9b_*12x@W)4es_UDH2iY2~l!J(~VR(>pZ% zwx+jgdW)tvYI=jF*J=7*O|Q}PZJJ)G>1CQ;r0FV6mucE>>Iuug3~z@UmlKxn4!6Rm z8+0u!EugDsb;a(Yx2?t?;3wfM7{Y%Lmg69Lr^F4Up@-N-XcZ9QH zFNic<5gOuf5Btfz>_Dh{cm#TY5KiME{5=vvA2|N`ixZZA8}4T9U=OT`p@UCgtzAWt zIoJxvrGw3|8^a<750QgE+c>yUIKUwG!ChEP)s<)o`a{xJkbZ;orJym!Q!&7m3x}kI z_rv{GJR^gSRKtx?)J~yyhZB_LYh-yG9hpRyM}USiFeiV2T@~myKfIQV+hLs@-QoRM zjNrRcqw#CXI7G%z!$BCZjvZ z_-yW^CpnqQ;n#2r8iN}Juuc^et4l+^fOuHcrCz`>3f<7#k}u+)>IxApaTl;rTC&i#6Qot&0Y`pMtGt_&UYCe7Ad0tNXxnZda>&SgU(^w;|!> zSHX!;iDQf0XG%9v6Ng|uSn_=QX7`FkHQ`xkvV3RwCRoC+S=3~NdI)?KxI{c3Fn&G@ zJJ~UEvxWXL{0QlbK_fo^H{!7>oKD3M7v2K|^C5*S%HSR;7Ka}`kq;!kh0fdqdq}Xf z8nLog)I{HqcGtZfQ-FldYtq5;}Pmot%T(jO!OmW$Kg3sFaL`aHAK=s4p3fBclUQN$W80 zj-wv>pIB>GoHxY1G7Ed*7iBLTCwk#G!f*_1UlulQ__k@*2LxSN4 z@O;P)+yoF_L!tpi9f^7rOGvCnF^@zIiV`6f4yg`%;TuNvVj}tBiS(@?JVqQl5I$FY zej82_pI?U0L<>l^BWL;h;geBdeu{&;!+*qHDAIXgv10eJP^e!{&XyAk&r_i?XWR4+=xz5FNGd%PmM z4shS4mDpRYTvQV|)LsAx%kl__57g&p3ZOR)-~XvB@U@>q>`I0BHhk+RARd8+BsMrN z-yAL!EpKP|DzL()@YrWCvi=SGuD`;#(|(cgygU35iUlAbZw`e2IG%-*N#UepINd0m 
zvoM@-=zGP+49uQ6IONGBh?KGDn>ky&Fp{(9K?rcF$k`x{5j+1ku1IePQti9ij7ZZao?p^$h>6P+}%i0c<$3f@Mf}m<4wy-HSX&~nfUZ@q(6-(~m07Yy!P zQIntxfxhf)@f{;P6u#BSS%DF%!eFq4x>xo;3I6`|3rodx!RY(s0`&2Si`gzs?a2of%b-!yuBM$5RYH?<>E+TyJyuby1<89?}K@^1@p>9-c0amzzb6c5rv;Usi2wKL9B@(AvHi;&I@O_RmCgtVg?#cVUXZ48IDU3ugUW z(7)J?j7?-)Pqqzc3&o7vg2Hp=VqQNzM2tH#P(TZIhX0It2)GVy=0oN{Anuh9$x&=1 zjnmhNt{vv)*qAENJDeK~uO1^?dxnS;R41(GP;cw-(`>OqU==?BVuzVpPgafGN(i+2 zstw~K!J&)pb?BUPp5%-}khWx)41 zNM`s3G4LMU*`fW7u*a-50-#@$nTkX#94cIF!*|;B=f%9we4XzkxLJf1T zT-FFYXZeet-|Akm42OR)Hea*@s{XZb%~+31CcVWp0-MD7!0}2FBIt(m#F092#t=9% zVfnwny-2`H=?e$7e00iufB!z@rb5MVV182bSLd^!t@=O3c~+1QE)4phRFss-s5JZCI0I7dtyL(rDtyJhLu!T<}$ zMCsoMC*t(A+xfEeUxl#Xm?(WG`M{t)7BjS+;Ty4bSd@MdmZVwwFGQDq{y(fsKLfaR z>7NEi4zKk8{T?Kom;PnEbY6%m{d(-*#LU#P^w)!{Lzn(|to|yY?Y#6e(7)TZ^rr~_ zR;52u9Fe8}1JvHA((j4&+bsPZqHfF5kEt+A|3y4Umj0rqzE4&)!wgi+77v-tb z7mz}=>2sP~uz3PBR->PWO8pn1cRwR$c6YujjaLbHAe@eI0OQ5zWF@Y}%y!8EJwO&0 zV;x;FChT5**N}SXp(ZSUs2hI5L!l21?GAFcdr5Pgd%5iS!qp;C3x{NcPbNpFp=%tF z@g$fTCHz0$-UU9&;(8xXB#}jlZ&1*9L81mtR5VmkBBEI#@J2T^0ztf>pny<8C~h>0 zU~qReuj|@qu~M(`(u&pgi^?S;mvBo!MGi@ng@T-UI6O zqIYC;*!NB((3#ctQ~654J&(-FnN=x4M^AJ`mGwnc?i4li-Q-EmS}D&Hn=2a!ntfNA zc`J*pw}b!J9Gc`yEuRT{Kpg$_7ZC^bg$vKLdvp^94|8YlpZaDe14mH7dyXd^9p!Y{_eIfR?1Fje+Fej6ot?~U@iC=qL)DX z5?Hzd(jiU0d=G#zLbg&-?1y&Cbh(8n$HiaS1k+vAB|w8w?V%QCtuTXd&g z0hoGu%xcM1n%<@T^+{7_mMFIBNv(&4YP;>XQPG4lG9tX;U{|APAT~KCKM)IIyEK+j ze100VL8(tL#fm!s-F_M_KbW!fF9t9xdOj*N&1b=;1{1yD?3W_D z8;sE~Pq81{3iM+A29g|B13CP6LAb|jibqR(DjvPZfo<(tUU&#PiD_yCV4Jal|M*pU zid`#0hx0Ft2*t;>YMTQjGO>ALQyyeuee0f1pgQ)t>@_-+ZC`wEr8`nRE?yeRR@C{;)4Y)z?m2PRQ!JqYKa)K=yyN*zLyT19j1Nim_+sfdG8 zeUm7a(PjUX8ZE+<{>d>vN2xS)8JL=Etts`qXpgu|%?D{blzN4^ic*a#IM=@LKA}_!N`g|ok|@>GdH<9;ivuc}zToJe zqtxHu5=#AENZPL}%9B!OgU}sP>N!+tEmOle@F7!CR6GDmP38pDno@U)GL2K}LD0iP zDNA&VjJ`!GIM?nG6-s@8I4E^&5~bE3ynjk{kwKMyZ`RLIYW@bH)FdHkzt4_RlsW{2 z?vPS5QKdDd@~Ch?sj;H+sJtG2me_Np5Mu~NV3Ba|{G33+4yE5SWxhLu7$E!P_OlJ0 
z-C43R&G5Jj_dQ{rmNs%->2g=z1y}6dlk((UVY=f_kjvht`O)EC)!{{Ba=-SCA@@rXiKB6FNQ?OYaNlS5ADbcyz{e}hh-FhpQE7F8)UGcLfoEPxuB zfWC99K#v@Uth5}8rJV;bs4V9fyX;3%-(UNCR*Cid`b4xG%UCRxGVZWDq2O*HcQQ~_ z;L}9X?Jv*q!nm9`#QNodNPk@DJZYUHShL-dIP&R)I4ojDT z?Q!fn;4CGa!vF_XGX^He!;wDPf3<(K{~D(pAmjJ_L|!^UDnwp3zpv%x0a$F5mrp(N z@;F5;I&isgRLe^-^6eY|q@W0&cm3vOpmoA_qDA5X<+OTDar^duF4s#>jdT4=%qYpE z7W#bhCApok8jxIe!F7c2o?VUZoe5A0&Oary63Mg}`vWL5W@QqY9sn?f56BZEd3K%M z0eMCO)Py*FNQ5<~03^ioG1#Oaz_KG4{dzQ7`3)>ur#C7jmG7FX%1>71V~O(TvHZWG zsguf&qj?8`2*|S^l9zOAm(wWM2AA)()-lVU$lt z5aQxo%qsCVMdtwa3Lom(_k-)pi5g7ztEAyouz_n1WhV|I9Os96Z|w=nG60%qNaS z3=8>Rwr#It8we%l?Vp*>S$b65U9J2Ouwsl1{nhr3V2CzXoXxwF$$?c1v-~3{A}b1k zOWAOU$+*+wbJAD@$$5y%bqM0;M)ea^KQr-TS5RuBJqi$B(Fi4OTIszJHe=l%`T_H@ z8TxFG89Of1E*0tSBr78`oh2E3m;Zs#y}Q98#X@O4{;i_C#`^?Q#*?%FZJNs`0 z(b=EWPiIs{EE7L=l%kr#GuLRJ24thNweqvto<`}5b^j|ntvVSU`0-)Hz28GD0y?mN zX6n1>9(qzO^*Z5d3c#_#maBvo;2d%vL@tW2M#Hxvh#!`BURduM0u9cY;HL~wtHJ4m zFy0RYo%+(UkENf=pMpKX9+A>ry)<*-cxyGAE5E+b zT)C%{nObkY^HZ=p0HkXGzLrL%C*AsNzH;FEsGCxLTA3Ldco9m1C|?G( zCG4zmph|56o%&GAHGZqcbsVMcP zc*7VI6{U8x67vy;1BXb$qf>@RX25FnU%lEkSF?GgV^#Y_#-jBKhpZ$VG6jeJXr({f z?q6yL?h^3X`vLE)qI4#~@1TG7*LYHRKaidzq_+CZIi(^MlY{V;_!BiBx=;sY<8!_O z%tRfb_YNd^$~)&SOp0^+q|CH_G-vPJ2vPA+AS~{qed~&?FKCOefM$-S<$P;Qg(k!Q zft*}3IwGeK?J9a3-M|AbD<-m6nbde3V33c&*my!xf1}%dIaQJp7MBV$rcHYshSV?} zyI!RU^;xhgIMcM|s4O&v)yEv|V>BYid^5wjh{X?I2CiEXoXcG}IzuPvVp zO=b=4UO)86@hl81fZ4nH%+=}QgG5(jY9p<wYu-`k-k^hfJoP#j z1!PwaoY^RmBrpCYH{IYR5C}R62zfa0YgHxD?-AwGCIc?$BVrS$Pj6{GQSPFtQ&}#l zeZC`_-2Q7*;Zy^LvM7)qe%q9b(Ehc;p}0i8j%ixS3!%m33WbMEo;o%mS5LEC+#V*M zUb`ttzEGohf`%^gH4Qbq^0llO2Z!}T4+qT76RfEs$DM>PN2EA|q_F0H*3|OrCY!Gg znmT3L-WA(5Z#6B21U`j?MY{NdOqeS%Y` zqr8&fIDP)YI-)Q9E2iRTJuyH=jKAq-l)p0pe^wHFfc*8;n?$jA<>U$!5_rfjx~Y6h zYd#hr-*0X9ue5tbwis2n!zc>vyWY5)pD~@iZwDx5+`R!|zqQpbt3z}7C0E?t|AVzt z&JI{#&E996l9F<3zO$6ZRGeR-y@2orOwHMQ6@)p2wwGm%yB|Q(+&Dq~Si7zMJx;DTaxE(ATM^H5nUt$1QJZ;qFFP zzp-eY*$F2^*nWN++Pujp>pdV_Jn8*jk*3S6&4Awf#=Yb6_E_7jcL!VFq}C0N6{Z@C 
z(heTN{{~m3b@RtY68J#~&Tf$oEuY;Yh~F~|od~VYuR7Xa%R$|DQ{V5fRDNtN_a|EH zIhcyg*wwswRSHjntFg+6`pklNj4;1Oz@sZ9n8*pBNYwPz7iLN-JXs-+g%tl3vljZ6 ze;1ksoSMUTvMQ*MzXSn<`n~oO7FiZn6veR-IXJKAp2JqyB3sHvsbZ@u(oEzDDNO7n zfg;@b(~U4gwT@#8&r#MOG|qwmwaWRX76@yL?eh z62^p7Hw#KEa_r2f?E5h4GNOB{m2wIPQ3Ipvs@>Jf`&P?k%BxU5tBXK(|P9 ze&Eqo4wMXJwQ0&3E>Hj;R#Bl^+c^=WVLd$6%=8=GKplr3f~rRp=}`Saj)mqB=;ZL& z*q%?4#0IWi4Wh4k&0WT=apqxtNRtCcS56pVkHsPDZGRaTLwz@Cs{>}-X0WdvGfoaW zmDFIC$wnbCIhcb@9hguQAlK8 zVW|y6Z31kxx--3kq*inT5S(o>>6&X zom(2*@LFe%Cp-yZ4|OqrlJ5S?HGRdhqoq82e>5J)1Leu+dxFLor|Ps&2%Dfaw(nBx zEz^hclJew{xEf5)7iki;nqEJI&KJ_vw&3b1eX@X^*jd-@AJo(@y z|AVi#PyQtf;N(a0i`>iM_IDFtLzD6W>w7X|m;L2(n$ElJ7?w`$aNrieP&88AOmsai zR@@FzVs-Qq!a#pTgk+~jr~a7c9orma(SuEkN6DLm>CK@+AEv4=F?=wH>J$=3=X#lDHz-0V?^TPq)Kk(20d;AML_@DLj`2S_!0pK6QH{kx)^uO)p z1pPx8pWYQ|wYwVq7iRM@IED}hpL*&qdd4Nq5qt#_`$+eq6qX*pQXUQ1ky>d+HW}gT z#di8}1$Qt_>s`OKf%~I@*aCG0VjQ&#^fxAO_|o{5Phf1|<6rUAdsL<}D6IZGT5+up zP7Mg7Rgg%+@;N)?N1hOnPH zA6s@6$hY5;8I|&TpYn%;rGLKy+CG+X zOkJmxC47k#lDiyXSVl)-i`TyRC74Q)|4XRVCt)49D)v!N_$h?dc|_m!+}(=xn2Hq# zpfv-U{W;cD=xpEhJmdMpUdb4PqjX2siShLW0sBXw(n*=_-~lW(3TE%*Ers^R&=3@s zF%*bBD^eue!yHR;sBkVU9y5l);@a7_?7%RI4g4MPv+}(|$f|&D-th~@%)5bBW9H>S z9gUeWd)khf1%GwN%pXyQXKtHF30^xiRZC0X)67&FHR-2IK29c{-9Y%NIA|9@j9 z2HU0YdO2*vfw>+sv3`SUA)oCJn;W5>x#{?OZKNg-oAWSiC`6ol-ErX=FVA!^UcNxO zJ^RIk+;6~GcE`Z0o^S)g9)niT$B8{k5o8sEGn8&o~Zy~cPiL9 zlqI2usDoH;3=p^hn-*ZZnhvnloc_tb(x~q$Q6`mhfEd0N3>B&=;r+k3;{6)?x$^-z_nn#-ZU^|zh@WO_JJ z;-gB{jDXny*AIvXX9YIHKj75JU(N<>jlU_ZhhOS%`vokMDB(EL+(8MiK_RHW$qOmr ziRq540N=u*A^?=da6cVt8H>q4Y-x_o_!A(uh!{$|EG5o|_J(l0{~27qEGRn=Eo1X` zq{XFyn-*#oi%WzXb%o-dVX9N?9RFCMj=!<=$wFqe7KC8*P=e8jOc)c24EE;xSXA|! zQ-kVizXukeix!1SUob{@_n;#GfIq1O|JT0(S1gFupdZOSyAD(Kn{(Koc3vH(to zw*7ms5)khAlm;9IFlMEGZ^Mr`!Ki6qREOjCJuF?@jN5Ca3$Sc}{~(2brNSQ;hyCdT z@HVbM+&)U$#=zrVl4opQ3%#M!>QD}2EN@j3o;C$ z$-mr|mSSgtoXN7f2t1eJmxVI7!8<;|)lok`{OpnBt+FaX2$9M<{>IY(Ary@dzx#K! 
zBcxqg3F)hge_lv?qem3d5)MCX^>8Li7@tARqMWQ;;}a)e?tZ+mtr5tHof(Cl7q>rO z6rCY<=`r5)r8>PL(|O`YPx+8qWJ zczCUIt0z3u6P~TZtb4B~{eDmSgP!z1c*0L2EcWi|#CEvEF`%2F7Q~BlL{76OTiF9Y zmtSn{F|D^u>qCFbn9Kxhu~(FRTx@;ABW~-hfVFC{wZ6Txe1FR@3|8z`B-lo`757Yj z!81HVQT|Rm&(6liscbpppp{JrEX+x(2Fp>mkvAg8{>I1*&Gug4JHD5G1~oKk=e|qC3{=@?;{|$8Rf#7o&`g0)V zAB2`akn-0ZVEI*0!3P5W!2>Km{m%ykAB&dgwJ;>6jxh_Gf(5R=H$yWsQ-a4~u4pOC z!acIggijs)_vj06hSp}IPWgMqQM!*r{t?V9#!B?Ws$GxMZ$^4yEd(H^}WU?bf+&R2&FsKCpcZFaVkl|$%x~m z;3wnNHwmxz+QBQ{zOQM1D}3H`@$p{{OXjur(^9y3T5SCY?%t9CG9k|IGyRpnoylg_Y>k|)^vY5p5SZx1L+R> z24hW5oZfNr$vA68-=4|zJq%F~eZT97zDjzdmx{z4CfhKs%pGAy&ol@0`2dT{AkK%T zK;Sa;2vhn9491;C5u%&~Zq1?7Qp@qIs^|m^d!MS)(ukMl{t;Z>^kuw1ME zr>H2sIs1I;Yr#h*_sgIb1 zv#EEG?!}6JP|=~8y5iAzj|^z2bFD#{Qq_piCl!~@F<-Rk=iOMs=Uj_)sXxe3S@WW+Mgh{a8 z^|V)*@FnSGp3q>1>hM5fgPGS5e6l$-6)!=|jSj_25I2Y%Wh?IRC5X4sZzEC#VD3W@ zlNIHJdjrykNPZ>q)l(dY{Q-_4#iXdKpW;3&$?Y29Oq7zgjYv9z?YIAHfxjD6bld+3 zgUFSh0qOOP#P56LtM>1I2<)^{|Br)rG!u3KvVVeJ@xbB_49T8tx{!1zSNgT43u22MY^h= z>#2{KP0MwzuK$v%pDXpBO|<{Vc>Vp4i`A;W8W)Qa^)wsMa zQU7wczN-)LU|%bJU_`Ev^3(}NxJ1|?s>C zVq%(g&lO=!myg9RbyJm^r+Bc8!-ibCs|<#p-hzj*!v#pPN11Qph7lsU%qx_gflut| zFqTvzd{Oe9f#9)y2566SxKp0Q0pYKZk7>SXBd0LaTHDlx0c^}PL4mdH#LW+&Dmr;1 zR1|;Dn#<}t?5hxOrT?+}xpqrhikcPq4iF}*J5Aio9mVwp{F)G;8(cO)M$>=06_Y~@ zsv*R(ovw1`Vdd-UKS3^o6IVavBT zT7=|sNzDorG~OxO*8PeQ^Xf3}u zdaDmDaW8}|a?b*DJ|1-QgG6kxcykK6IC~D1k|HxEWnoiCpa32J(Kg z)1}$bpv0Eo(TWb+%;>R#mu=q<8br2~H^D%|gfug${~`lmeynVw*-#|?$FKBXiWhtc z33M+lKS4;*(V)SW;4#Ynw-W7CdfI_OG&9B1@$RY}2yPkN3X7!I_*K1z;cgauXoRnZ z7`Sbjv)4-RPb1I)aJw}kd{Rdw1UBX2#nDCVKjf5)^mLRej?NNEFq(QL4_(u*YCTHe<@+$SH|@M+#^O*d56G z&h8{=DGBHy*l6y5bzi*KMd)=B_h$*&W}??c=(XZLes!<$-e+9;p9K`GUWM~X;G-V! 
z=xh?xdnxdE<+~^!V0Bj&)3JyG(;@B;ahj3tDu$m2fPsR^xON$N7yb{TKT5#yBZ{L7 zawt4hYLnm!~F-sn1s@vXuQWz@7%NTa{68&6wffmmeY&D`su*xuK zCxwuU4d!O;FPpq^f_=E5V*=3|MsZrebkS&ppM%URy!9TO5&kFTBI?op|3L4nMepDG zJ#o3!{WcjZm(===MGoxeoizf366TQk|n@o{|*yCU9BwWpm{ zdcLL^$K$Q0Ndl4d&4^K+T2 z+i9)mPgm{iPtSYilV1bUa*91IBtIg!MFDFsTxUd{p`j4MFii>Oabvc>?5~Y}SMT?1 z1M!mgd+w*jfeP5Cl?u61;dHmclWi)H{v5R&qFD1n72P0a2x7&dHPggI`s+RM(a5z% zNp$=u?$B@x2v_TNVCdQRq6jP+6M2ORf&Q}ybRim0d&>Kj=jXoLuRQ0Xg^uKTDTwyV z?3d5({yBNBV+lQOTFG;nG_!wsmS@~O^88zX6T4+U@?3&49m;b)GdqyyHK55am*?#f zX{G~tUdP-Hmx zOPbL#wVAm$m5=VZ2O8YWH0IP{=h6sYj8f1>;fwGaFX3~}LR8H=KZC+~m~~wgW^W)0 zxk%9{^hEIIQP_=+5`|s(^`Ov*sO;CcXv;SE8IXbGJ$UsfzAls{*FfV?xTkD{n;<30 zbt0g7yik^Ni#Mw3%xhEUWiNVJ&NJSq>|IYzZsRguK2Bv=XPy+AQ2WD#S=9!(R!k9f~#ELa^b9x#Cq<0QwC^oXmqHYgk3HtVcz{` z5B-D5=S_dtG{hED{2a_}Zm6$m5#`;D7j8WVdd-%zQV{hn&b9BRa&F59Azg?~q0L;arfd_U1itx%RQDbS*eg(R1LY_0oaU&;ht@{p;^=#M;J* z2c38h>~jWFr&P;%uQQWjv4KCIF@i=ofW(Vw1Ee=6be=Y6He$W2p@uQF8^7s~l1hgIIf=3=7 zmJ%Y5&mc&7oP*!EJpK-mBze3AwEC~f`TgHcc!9|k7RMbLFKR$u ztjD99_|q>3CGAVWcXr|^bmD&_k6UN{5_vpjmcrvn{8AoIfm}9u)+K4=gbT}UpBdE%xZE7`sll5ht=u2t@Aomiq0rllg7`VV>FEyR`%g1nC zwC^Ul^3H8-xk5=r?y6n;Vc|VC8n~^WU7vC`uT$b~6*f_LDHhh-Md`4Ta&AM7?6K3Q zk7ZRn%_F|Oy}5I5+yaa46L@$Gpo{$7oJZ6Hm5*W25Q{P5BX9BUslHc!7t3J37YTIH z4xk9`l?P%2^F+g;q!F8b zRd5F&6zo<9185!wXs_E@3wOlh2L{=a-dU%^UQN2;x$8ER*MpJYuF#W1h6HK zwPXWc(c^u7UZ;8k71||uoyl?RHfyQM8s9&y`X7w% zkKR$py#p=g8Q)(a`4^7w#WGs{C*yl5=B;+cE!^SwKAQc3O4#4{E`s|6#`mv5PmJ$# zVY_jB@47?D!(y?W+REI8FcsSy->2yG)-q=>y~FX{cjfy9l4_eFK38o&< z_->+&)t*BAuZ-`RpolxH_CLPg4Yp6~*K~SoT0g_|R^$6l(jUWaH-0(3?}WbnCF6Vl z_GI4M@o{}Tf^6Uqv(&gAf;k8C_6{uO!*ya$ev2T-^%wY!+mqW6NgCHDuh8SVw_av& z7u~zQ>Bq_J7p`w60XmUNRY-|F2@GyWGFC5FN$&Rer|A{S#W6`W=G-Oi6!)N-x%DhUB)Dhnr;{9e6r0fjsnpF;02dK)nfA z4`HV4SRO84`rns_>!Fv>GUVY?ESl}^K;S=;hkd{OCGt=-UG;Mrekl({%^azoyh;@U90sJ% zeU%%1aycJp!y%AQa1dQs-+WLun{Vf2@sd3_y#@55Hp*rfU3a<cD*W5^QpAo2>}VXHDFwfs?)1?u%!&CW$? 
zktGxq-F6R_EOfGPrlBFFFVJ7-I>e|4vr#z3If!}H(pAm1Y<2?wxqRCQ(B<%R1K*6u zkD|AjPiL)N5@SJf@In^st6cC7MceYCpYqzuZdxISFs5~WPRk~?h7+pubDXo$I()XC zBOI)qm<4?65VeaT0(fTNQqnn=@!AYLGjN|G{j~_g{pTX&*q@_s6o$Cx4+FTZw8Ivs zGB9X1WLbY93zLifg5%6Dy&P!usc0Zekd&^%qy=n$k*-cgcq=Fu+SA#H@b#}iv~yrU z`gw~fU0)!h1QJ8PaNZjr5!0vQQz!vQ8P10rWQj+@i`W%MQ(8)MxsAMqN$>;DUUH1; zr38D+9 z9wM@lyV0iN*67JCEx4(L0!>|He+dy_4o|W8BD#D$CgQ@;`u+pn=5bC0>)ib>1zqtv zS3#1Gn!5Sm;2+VUB}Yj&`}I~F+=m}~CHmO1QMVqohan2KJ5nMefS)DDDT++IP1wGT zm3BhB+OYjN5-`^uvkyc=D9awn5JX@Wghb^H)+v*bl33oT`%WG4yNeOt0JKAUaQ0si z=l8`6$?x}&k<4$f6l?lyFuXx{9%nedbVo+NE)*c6pH|&^paT7Q>_##g8Uv5G1jjdr zegj5dBMH%Aq6J(=^RvrT1p^j=(UTOT!Cndh7>zF!A5}Ft$3P(}g}(vE@)?T7xBmv* z!YQuNbTH<3V-eVk&4{>yD#tky*{j#XIWgA?s2AQ7!kVi{MEuXv1xO7TmC zF)}$=Z3kUy*`!MU-kyON*!sL;YX;f+tnUAg_<#8wN(ONy;`G6c3hNBt%& zpQ2M9LP~spl%rxm`qQt#;?Q$cm-zm8@;n6}bqp5eQLBB_kJJ+s{2x1C#Zkj)s4(R! zE&oK4_t%=0iz!Y7aLeReG?k<;;8BsvNd@~XN6WRQt1$wh2r~WhSd*M@Zw_Y3xDH~u z5987-*y?)tpTq&^wT5FPa!7XjT|ZJ5@o8OvhoA@_9{JWb-XICxh>7+!@(c$GEy0(h zhrXug*$>up9eOdr@O?ksf23=dV(0|Oq+}5>>icUXNgU>fBWV9=QrxN_L++*;2Sa@J z9SBhgr-7ktPnZgRK#e8Y3uskiPeSX4ccGt2&WvzmJQKb~MtH4=S$h3bFdRcO^I<&} zr37|-Eo(EFXh@|%!VLi{65fK@dTf9v+$ybMhNn;V-_5cyu`@x7HIdH zk9aSghGI?6c@__=wEYk1Hu=;B-<$AXUX158M;2Qj$Sc%5hRcGh5H8p>nt2-o1)mvV z3uG*|))ue)F%4{r{LH4`lA4(C`~8A=YqQHp1VM zIk9IC+Yu4Eq|m$VHC(~E{UCEha5&l%?RPX@ll5k@eLoKrEI(OIN#f-XrGuF$p*{!WB9_*l%AS80NT~5A?Y#;V z_}4jU@aG0_)exS{0m}~LHR1vMG_znGoPr>gkTaV15IbzYe`^{=>)`4K*Z9@mm)Lur#QaRi&2_|A%SrB4z0?x%spoG;$}RdgHh95{t4VJGE3k~3B;yl zI^RMd%;l;^OK0G38bj;xt=KetnXzdZX7meh!R-7YRG1#mGXLQ~eviI%+24%bh9?7} zYPG|=3~=!e-3f^llPPqkjfwEqW+O6yY_m3)pf+I|}qLcVWl{4`~h! 
z;<`+xvNDb#Gd8`^z7d8SziS2ZmHT*1LcX&SjG^+(;JFP$_+k_2QIE;?8n6LwtQc!t zfu-pcn0#wiTr=s9fZ+>yfiHs#m1js;?o)6C z!#Ofoqs;L?dZ%FASrfdyQ?L*L(wMn!G+Sco-=$-KBD%gV{slq+Ud)NMkEc%&T@!^j7n=Q z_R8!-F_EjO{i*K|LP{Pw5+4sD59-}j&JC@WO=i^Z3sq1U2GiyP`=b~o)?QK{Z0h;b z^W2jehh;dp-B8#JWbkn69{$L3oLt69RyO&ar5~us<~y1v2oH?#kH9~```*PkE$fQ4 z*G)bs;T5+NXnAp=3@PE2 zv^)tW>^G^8yYa2+AEoQVyAJicBKHGwdu4K#6Vd1b|}@)bgmF zShEN2m7$X>Lx;W^96I38RwK-fPIsOvk~uxH7D*7?Ky17$Z7>0AiRA5DerH;Hc|DGD zl-h>`VO_@OC@Zs~*?#WYI3Dazwx*}_Cp*!fY~3F|wNHCCTj(uR*&s2f+L=*#y|S&|d6p%z(6p_F`w_g>-_f zs51u)y!Ovr>i&~M7gKtd=Fm_oXk@`(gllpO-%4#LszDM_sYda(*P7Xel~V*19uTL2kT}%f*bd@Z)X_Tnez5 zqw0}lzY7k+ewDt^Qbb?M*TW;VCJ^oS@SmV&3U(Xex7eJgD>G}*mGxMM>#odA?g~gm zF}yr2xi_09qP-(@Z?FLpso)Ut^y)b@6D^r1gMvPgpnpvXj?Sr|aNE`hZwE=0b?S_P zhyibGgA~?9OG(f9M8Ybo1u5o(6!RMjxx1GVMM^`VWRkVqH>xk1mlCR&2c7Y6a<5`8 z=GxL4Bf<-_p^7TRrd1n}N#q|Mlo(u9UW-qCf*>WeXf8&q@v>k=B|c5F7TqYB?{C4A zK9w`)fgGzxU1>z9M+4Dv$j(R@pSZv`z0%$SVQwgrQ*z#zG4bK(;sLinRepoQW&=OC zK#tDmgxoM>4+H_wpf$;4YZ|mo(K^r6TtJ4z#&5pL^%QjXnJC?np~O5|HXr@O8){`W z4jlxw{TeN*!6JVd+|o$}BRm3GYEp<4lO4#{Y5{XI`K+{D5od2|*&Hc^3@S;bGDRJ9 zMLPJ=x5YUUiXnM^JR7`HemL>>d7#T6_cN=Y5AFPaL%o&M3frSV9i*Ed7|`!%;E20& zfUDFY7CA$dKIAT{KRU82Xj>e+Msy;)dk^MZpnP88{cVJKL_{%MiCLg;baX*-{%zCybK4!=g&(JQPl}UukzvwC~5o?vmK8a%Q7A7PqRopP`_&W-cO z(SY3xMW7iMNv({d@#AX7e&eK_7YXQ9^fy_KuZLti)6oMpx2<;Pw$)9?Kx^09M{Or& zGi&U_7{I4;&$&v?Xg^_QWBg>q+tfWt>TdZ`>dv({sX&drURU}jq(X|+BB$w#r26!K z=_Z+vvKpFL&C>Xd>dEvK#N8_MbbzYL2y?rU*isMJKQy6h=&&n~EUBhh%(@%&Ll=c+ z^tW8Js09Mez5adcqC@t`0TLfSl>?-kq07uDRM&v?kxz06L+x`h$%#--kZ23S`k?m! 
z{*xmlxD4&G=RyL+F7eI@*dmM{Aw7H@&johbyT-F1EIoAJ^Y#OugE$+*jD~4+h%H#1 z?P9f6rspAzz^d*EVuhX8WUMfMwujY0?O_%1V3n$|x*~y9eJiZw-E6^Xb7&8YhQD#5 z;c>_m!D`k<7!9WiRwJmoJu@w`;OG2X&a~XJ=0Z-?m6+Pp&0;sql9|?X7&2un1b7~F zJL#G0rDqcx04mOzz#*}J(cya&{|9}EfsENV@+^Y#-UQCs@QhM+LMMJg+#}}$3(V9w zXfq-eD68^$&^1!>1ISln{rKUKwlWln>z@wWr|qlTU(#Xw*9iO#;HcX^y~+~pfABc* z!&f|X`%er*`!{;qhr9%$6PhVAkvB2SqeC_aqPH}XTajTX2&Fh=V`$bobnP+_PQ=y> zeT}yVK9z^YceM=9#FUG9oy+e!dnd*dX^#zwo~4iB>)!4fNiUkPe%FhoV6cOQ5{^X^ z3#FalI^lU2N+T)3$qOY|5&%mHaUQ37kI%Qz_phK^aeB+(lXd(z_WXaKzxd{zOuz>d zXnZjeghr^vKlRiVHy9%@#(&XbqyXhO{r1H#W(x=~wq1St5X^&Tlz75_bXJ|+8Dini zhoqf%eF8!%O9biK6xk^}b{Blg4PI=KcahVr#FHofxBFSEbQudGNee&FIqj_gM{!?@4+NQ(ZfcU3zfqNqCQTvFi zIOXrMU-*I@!h0EzX{PVnO3#ob7+o@sLi~;(uPtsR2B|5n6nT59iF234@!xiI;Js zrF-zzht24VtfoK2yX9(!>OTE2chZ&=mtAk82#$F(Q}A?()We=(_*^6=Q@J-%N_eN- zADU%D+4R89GJ=~CJO;sHYKfO%y~zWHIAhI@4y~0L^*vOS8TFkWTtE!P0wR6$A0X1< z+X)iZ$03*+#KU0dd`zibe?wD3J)A9G2KuWKWDk-z;y2-Iu=sGK#+FK|yEH%aO32C( z5R}jMvPUvPOz%`X7cC2-*pwV&p9<@WO24oes*PI;)%H8wW(E;=K{SV=+rz`u010N7^LN%X4Cy}r&u`40s;DwLc{%qhCh`y-u z2J|UIMX%sMhS(tgjPP|}6>ex^C$0$RMe50YGY>obYfYYsh4vI0k@xUzP`SZ41)uA- z7mbvbO5Iv49omy=+}#hkW~d?4o})^PNiLCX-2FXH+)*Oio{kdCH5S=ySb@kl?q046 zihcE){k(o$A7W>vjqEZ&5}hV@;(>axmupVkH}qwrx} z!Hu|J2a^x%dV1(l1TeClA=REdOscH|HNcwos?{MYFrd$2N!9jbkZvPzF1DnGc zCA1`0MV^;PmWsS65g#LF?A}cL8j*RDwg>7rH4u9+Mbg4Rg{}vg(Cs5~rqMOOG!VNh z8<}S8LBQ5e%BG?0?l5`f7>I)N}9!CJQnMn|1G%##?=#?hM>-L)B#MQ=5GIB>&F_hhb>8q-2OpT zhWUpe3-eE)6!XQQoree%_O&gP8m4L}4D%0ASeSnaC^so6bCK8{e!s9ml@2i~jm;8JFU%XD6&w3dF8H_riMS#Pt-8|KZM zsq;3fymP#H-EQUUm$n zsJSEf_1NjRAj*BRXen0f`a?9WPMSPZzy{b$oZeGq7M$iBDYXZ47roA53m{-I@N{Py z$PdZGfoFv0!=#gGMXOLJhy-oFLmgZ@A3*Uwjaodl*R?2vH9gi+h&4Z@RAc}oo7^*hf@A3P2*xU0a z`~+h7fDQ=8PcP;3*V+a3i-{B8CF2=*9h9sDnql7b0P@h1jreC{x`Kx;m!dd9H}b~e z(?TeYtjRl#Je)e&V=#ZCj)M;H_Lh9L1uW?ortYz zd>It+xr~Fc75L>J_WW!U(>jqA!zYls18>6Fe#i1ZSyH35J>sP>rtaK>P~ z)qs{VkxETKgxO*iOR*r`>pw$)&)c(Cuut(Cu@4Cab;do@5a3s;Mj=N&k85o!&Rb{S z2J|>Z^1GP)5G_Pex)Q1@H1d{eGHWgkz+Mo~wpG_~o{?(*#jmPEVs^V6CAI9nSny!r 
z+q_~~NOD>(&laTRqX64Xk^%ckU{Z|l0LL;mlwlFoh4{w$jRncD)Vdu>b`0dva=$=f zi7m+r*rCpOkb3A9$o9&p$o7Ny`Ngum8S+;U6WMMC8TLSkzxBvAE^|?~ku~`zBM&sN z!xWQfdWO{5Rp*t@xD2`ie()N_!6!y{<{L+a0vvwCj~Ip?~~z5z`un8QlF)@?Te6|tFpfs zq2(Xh%hM>T5c&ob_<(2wICU9Zq9MaYp>Lpg0n z=oem~s<#pP^YvgVl#9h8^vO{T0ZwBjk@dlx;^?&O0NzlAxR=Tm-ay`4 zc3_}3%g4~Qu2dco$GTe95%!sGW~uYRHyr+jpxTRKi3hBD8u->%RGxK|Gt37hoTvo+ z+J}Dxt??L+2(|x_`iz{a2QQ@7^<2RA-#2re2Wwl_15S5=kN}TH-4^jt70#__EMEEX zO5{p(yG8RrI)~}_tsKnss|-YzY^e_7CUl{$r4A1ZVU(I)Ms+ zad$UZ|(mF%9o_4NnRX-lPP)Yc#xA#${$= zUv?|PP*|@NLt))EK!jS`(^j2RCB~|X!!z1_AK&Y4B**JC2Pj6{FqZK_5k6V-9)h05 z!&*euKBHcL#3i)@f4gf?fs2rl$#8}qD4a$uX`Auu!Dl_92ZYac8qSr;a4w0%!IRNW zi7ZnTy-cVrvBqHMR9^^9updoF@Vxe^(<9IYCGqn zjM7)|@+k5-Cm|kM*V4>b*QQ$R+IJaB$<3HNB(ZB>HCTkX1xa>S*p+m3UM?H#v4JB3 z$g@`Ut+O|sqjR_{f$X0bUDU_3AVkE4LlTyfN~#F>>cO2+yAQ_UToqf zI*y>7jh=8fG{bb$TJ7*QwhFIx&WeXuJM;x{%besX9|In4`g5M}{hsjq@w#i9&UFiO zb>cQeDRk3!A#7qVKMVQBp}IuVWbqHc@|1>Ep&7lJW~@5^#aW- z`zf&km&z<{kJunvg7$Gbu?{WS1|t{<=aMN{(Ncffd)F{=mwh4=F)Uy&Jw z%5};?{RnY~eGCw6dN^MFHC6{KY2a@f)&~XqhqdTz1?#Ors$4S=p#hu1#^jA^3{skA zlEBvKH|m_mRNy}pOD3M(TqwW0>~pzoyW)rF!oMPp z0CtSvfG2QdtrcHmDOv}t&)cPSKLBw+&18xh;WIVfajLy}778BSw%~9~rtDz6;4>^Z z5UB}Um39VYL-|+6-?-(IL?#Jk#nz#`=TJzyST+<@f&eKz3@WRVgP>U~F0RTYm96}e zc?fh1vWxnD5iP00Hk=VA`D&4raHQsa;Xrg5o6~-mjrOVw7{S}@mZZa%(~RA=5lGEP zDf?OE`uTQwgq84^Fg!oj$%)XZfGj6MhaO3!X4S#RqW%sn=E8;CX!H75)WO8wcPSOd z%qp`XG7V$|B8bw53$wHMSX|r#DY2QA?g7vtg-|<1peFcPWqQ0APsn6@aeSOV15`C! 
z87l(=1{MhdEkS#)x#47+onx!j*Cs_b}l~8L$@0 zmVc<8&XaEsuKLqNbUP4RNRkuBmW50xv6_KwihffDjb z=Qps-UvK1zAQ+L`eI&cch#9>c2W)VsxOd(Vp`SHrU|Nxl%tHSf zB6cFNQ6*M4MC7DIm85Nol5 zb!mMgZhzs@7v9=8qt!Or$F0-C+qrYNR~EpPdn7G=jik+zgU9l@P(rE+lu5A*FC-Eo zra+!JB>Dy9YJeEm5JaaTlBGj})Gch|mm&4SZA7Yf5>jUhQY*W0b@ta%wi_$2QB4pr zOcXL6ftzGnf(9^}XGS05@N9_Wq7sP?hCD=uG(`IA6f-&U@W zVjCi(+?W-~M-V`3jc#T0P`Xe>@%>3S`dZSmoUeR^QQqcEMzR%Q2$3Pvn(mXMlsr_3 zO0%ucKvWw%0@1^|K~fYP;;Gf91_AqcGN{_Xk>QmDhbamUM`2RGENoOnWyH3DBV#uS4pS5y4)G*79|;_)gEnwBGBF7b zQxqIlaN$H_l%}?4OWByrN|A%-ql4XR5fF=3Y5Z_so=U|iKJ8XqNWtdyoJ2+C{v0lp zij&CA>$vJ9_JZ)QnSd#DU zVtF%0JG4=q*I93yOhve{;c?)Ls>>EJ$F$^( z=s(&3#%cf2r3jj_p|4V*g8fIXA!K|1GVX8#r0 z>U8gFj3V(Mjr$o2Ewn`hG8^HkAPY8Xa9kW2hoGLZ^~MfbUN#NgaQAhjw%*n`M@p+~ z?&_Q@aoHt07D08cBI6SPT7Mtz>mTF&{T#s#`+J&n=w(Q+?(agD7IM&I_u1*}?;2#Q z{!Tsx{e2KcbbsMb+JGG}Y^u2XA2KDvl;hCC?yPHyS&?cXyG+}3mDQGm6`8BjsdNId zWgN{AjT)6m2|;Y0ijm)l)u|YVB4YFLWB+>jsx-Jex7Pycd%o+;A&^PC>*9#{|-?Ug&z`zX--|z4o3eFX|9x6v>iTaAvj=^*p&pIlBDJ#ktmH) zZCzU;9WE#px_s00QF4>5#BhK35aon0d&IXRe1xwWpWJrvIqy4)QTzB%av>TTA7Ymf zB9FWr5I!;}H9m7!w588=a*QJQNb_pJrRYPX)YRhQ+o! 
z5j{ACnirS+UB8q0yVw+pa4vX8B6La@1&GOu+WNc6tbOwE{|5-JWDx6J z5|R2}h(L=O;i+F4uV1O_U#HrylwOx5+P^s7zDqy0Px{F|c~O)>($ay*K1vuHGa6Bi&a zMU3Bb9)kOYsEoa*@b9n(?G|valt;eC$Z`{XPNN=YYu#$X;^ z|4jWMXo&SMNNPS`nwR>QeJrFCThF(datixzg!{|=bny~n+PRZxs3nQfFFZj_;3FM<*rxdT!q9VubEZY zO;oz40Jk`ly&A(_s=7ydLNC!r8Lys}IF~Fs9lNiwfyd(WLH4J|BVPpgMxg5Ol@Yln zq98u~6~=|y3T9hk);R@`={AXH12bF^ArIhfW}$UGHER%8$a$1kn>L!1YSXs zT?YhtVC78NsRQ35pwCnicRr_qHg--Phv_m-Q+AdLb9%A6sxYTLJ4J^O+BSZHH*Y# z{a}Z6@fTSiaDp1{W1i}{8V+pLjc7YDQ4$kvxH!W>MZK`o-w{_W=oOFs+DqWoFceAv zOF=9Fj9rem;Lf^i!QnkFU5k6v?_jAxRKEus;_D0M`$xd#=H{8;iS+}7#kswurHt-g z^?o@;K<0NdPz5P(A$%{wOE4qCd3GlcpgiXn%In*VrQZTzd~dGQv_56MjYzvo!@w~^ z`XV%eP3Bej_XhJDk+unGd|rsBpxI1(oBzRQXctPoY2?8{k6Gem2|VbXX-wY0q(b}T zV_RBQQ~(*fE2d(8iaAP5fgrTL~+nr+7P zsjp>9v1z>YHwsXB!)8p1KzS1})Xt_KVWkS^6J!NH+(2JT!FTEcHWZ&#J*l7L@-hhQRPr+E;kNQ}TS8uzAiM-yB9s?CXOko^S=?udr4Iox{XSt_ zUOpoX$_S)@GV|ac70gDL3P?Lcm3f0nHE2Rf4CLz>%93&eV_E~m`sF{V`gCGkgSx+} zV43^GX@q6#bbYL3`)$8fm8Bo#S{C9tCJSxUg>Z47Vp-3$=0rusc4WXo3m+H$9k%t9 zSe8>wXtBgryohDC2vieln@TLRU9rrfSfY$8mLqVusbvx$$A!l+>Zr#(1w^fYpiE*c zpdf@9U#MoRe?lT5=Zg0fRLW!}Y9VDDzrJkdLf#5pIlK!wkPt10V?+-7BaKZKf(+uH zCT5Z=8B$TX7$lf`SJ#t&iUR>XxMK~uL-6m!_heM4~{@~|{;gIjZBeKQ* z_^2e|D*{$4#1c)8k2v{I~}4_5mU9Hvm>codW*ahs6VG4y;_Ml#-~Q!9X3|&$kyI>2mhb;}e_(@rARc!j|ue zf#Phb%UP4GL7NHAlKNUHfwLy+kj$k@+q@+p6lYmbm^v|8)wntfdX8+RJ|IIu*vFKz zAZIzsYmnoz_+^A!vv?xX*!DD$^h%b1bXV)&8vNSJsa?Tq5;4K+^T2D;xs{%Q`WHT@ zfv;Zv2cZ+mZ>}HhojsEHJsJ&=-(^Q9@f(cx*JAq>oa^!WEhKox@5}kqZ|8$*q>7q& z2iaSYPS$+_=JRl#;ny+z&q8{VAZXR|hJfoa=P817b^3PJYJ`|f2`Xqd!j};;bO-aA z5l?+c;+TS<`H{8bJYk$XrN@99M14I!-NS?>$2CFnPRYfQKX(c!+nw z?XvD3`kQ>!@GU15%NeczsrW-XGd=W#KMTBh4!hwXtUwV?2*)8fT*hSrf!czmCnVMi zu78BHW7I+Tx^E@06n_c6XWKsfzg@py_)oSEzhiiBUA|v@|89suw7!Mzm(t&^>=zq)%aDBZ9mrf5- zdG^&;0t2`Q><5cHV&U%3DE+L6lO3=E6ovSuXUtq~qY6A*JK9);tI>mvMS+}S@Y18t z?6RkL<);vt9mQQ%+HayOqpoD@!2Yx}c|oz9G6mowQ!jMmERtA%;#(1c*bSMuc#W4~ zliG%W2CP%~3}^UpNE*HlAcFmGI}05!(Rca};8Rz(UV{Hq%ljFN%2KNE`h+#e=dVxW zor)yEz8py4tETdVW1@W%4l5g)Uq7gEG46Tj 
zAW@bXXJGktecB^t>c$J?`vp*$U+|sp_h9_V{wF#$Do$A=as!Tn(6Nc2uIE;qr+=}2 zd$^XHgZfRYCR797=ZuFLe_7&hFdk(5X^H=p@o|hlDDgRrk6`>RiQmomV8rDuq4>UD zzZtWUemc`H$4rq(Uw{{8?CwmzFe&{U!ubKKTKUppqWn=z|AOg8Qu%MOPPX|biL)EP z66N1!`U0lYD=3lvGSeSpI?uim>Az=snCX1eC6Rs}u6o#$nf`E6dR{N2k6`+pN$H0% z{d}h1kd%H+Po(EC{nDiLd-9RqmFedurB7%2zwn}h(<>=G?M$S9%=FGl=|>a(8m8|8 zumt|A*e`zn-T5#n{ZOXQVfu=s^t-Xtx9?#3b4lr&m_Cl_zeBp;+U#Fx_lj&WswyC( zp?%jIcc&vTd*2SYm>PG#fw14&s-D;~mtS(l-TgmUJLPPQl-c`?Q&LiHJ;ULPNbdLw z?M3}JFmcV^tLo1oti3F0-2DK;rnS+u-tmX()6Cu5&6QsrYR>-t$-7fiZe1IQ-I0?y zD0X)Kl0HxegRQ}M=s3okb9i7n^fgXl@#w03H3s8gjFGPRQu8{qQvhapw4X6DEv3mP z_9##-p7egNNYiDwx3%)C-Z$YqTzEF$(9u$EYqn9IB5qN$SrXXJY z#Jcu&)Z@(sB&hG~6(Zo{SM-?LpC+mPvR?cw_d>G(ir)y|$*Q12zI+%NLwR2*ShXJ7 zh|fM}7F*qO*a}-@OSBJ)Ek4?7B2P#G(-rWe0O2;m4Apw_xAsDZP{)DdX2C98irvQ; z3>6P&-iuQK)&`n5@9EdXu&GgAg{P>z<{2KOuB)8o!`pxC`1c}ebls4&m&Iv4Xs z$U7g=X%NeCU|q>E5-qPp%^Ld|5;uV|IzhBy#}w6IC$*BT+bbT1++iW;*}wL9@wMCj zeW=p3#O~Kcv54O2ha8s(8wh*Y<$lit@2ifWL6XE0o3yct8cmoC>2gmS!c&9=p))+SNd{}Vi!tHUK`Z#GR9|X@9Kh^Erz{tg#>b7sWCd7Npuvsa8 zCNy;xW873qVIbd4?k^+J?u@|%>&jk%MO`2|4p-Iign199{D^hU@(L7Zq#3@!RDYTF zA5ak5s1ytcxUj+7$<-)nsr41L)hMHzeiR0Yvg?9G*&Z3CF!=w-d-M3Hs{4;S z0Rpi?Cn#vFThv&CYf?c;5lw`^9i3=YP^=qup}3CbZ_*El=Ur1Wy-(AxHcj)ZKd*eQe_k~ERct1@F-;$M*ZfW0x}Ep3C16Q= z)~D-ZvfpL7v)UE9L!;LsizouJv2cJVQh~#YFkKZj_IQ#o99BblM zo|*O^PPMf}`b@R`vA3(3eY*xcupo6bb9ad;FTNl(g6A#x;YX&AQ`tQ9!|3DF?cMnQ z@)v%y=;Ixx0%`hK+Bu+)E6E6||KCC%H-kjs_`B#MO&W4)IkMmg^MC{b3JH^k8Od~4 zd{NlL0@M`nyCqp1(2gt+-RN2CZW48+>Ea6Hs_B9AC!mFM5zoQ%Z+Z6hXZXCVZ=Nos z3g&Lf-S4er#^p9jgeExdAcC@H#xa+i2#$v=rJG(Br3|%@vSXo$z#j49BM_y|HI5ww z2f;$YYj^vK|9L8@aLNkR$)5@=)o(Iyd(KnQ>vH${&_KGytN58t2m`rh(Yea%&SNQC z0ti)?ZORs!>YrY~p4xt|V&qd!n{=qhv5*;yAcdRFUl@diKrlQr(3%K6o~oeLWbZ>- zEn+f5!}^jn7!0R_(apS#wzfXH=X+$0?yDoVGP*lPIccNr?@#wNKELPTaR#F@4|a2- zSw`QpM{_A!6W3j#8%^2z|7kRDUhhUDgy=7XtF~j@>k1_&^J=y>n$t#SMsvX)+aAqZ zNH*c7^A@AITUbl>xcn%KoBGxN(`eqluEWv%TT!9l)*sE^2W$m5cb%UZ&Hc>dwuPG> z{%B;sWaH*#VJ+F?xg+gp&SMGM%G8Q{U`aQ=AEiAtzQfTd&N(!iE?XPTdlzIz^FHp6 
zZH?wADR6M}8!(YQnmjj}pS#fvWtsVZ7|p~p-DrdW{e>t^anO3TQX!G|^C-uD`_e$k zk`5;KBJw1A|7@fHn|BUBRyN4f`;7iG+MVw+5)R+zHUHWbB_5I;0Z&ExaA(R1$Wp8G znQfU`2Iz)7xcF@DD-&LmJ_p4=O87o@?vA=0F3M zq|A73ZFCD_=Yl%og{|qn+Ux{##n`HQyKN1#hV#tK2}nahY^}%uQ)#+BCi{KxGY6vw zc-5a8DpKpyc%e7ShQ9>etu2Nz026}KXM!PcLNeI198eg{Z+l#yQgZLJkcR%6Hw(nV z(Ofq(-Zsnz%U0eB%(??JbYovUehzRK4gp12wuJ%UB7JI^J*T0U_jedYO>6CuWkjH9 zhJF)(?I247t1Z|toXtCuL0Bdw5Qgdy08OEk28l%$dVp-fE{>PzUj}OJ0QN<$1K6%0 zCIql2tqs&;xY#w>`#3|*YUzI?tea2SI*<-$X$K(f766I+Qy{VPt1MGl>ayz~0g&_p zLE!^22eF9l2x>=lG@gOXJV}7|klE~mD-D^a3%3oKH;A_oDdhtD5M=J@L<7kl=ls$_ z=GL_VWJnu}bO`9E#?sLo%X{L(ZIjZ29Y879vrsAypp;iKSfm87GNg2_crReYyr)Hc zA|-b^l}}GKX8pzhYVUycFw~NgBkiHK*au}AYE!LK%33)c($l?dn=RCy1xyZVzR70J zSDVS+CzcCvJ>F_>(_qiYkF^b(rdQmN(h6}qe=4w4zsbB^hPYQNo0EAr^rIcV8omOk zR(^8L@X;Af^XuN*)-`U)%aBoaCh!FhGc>xX|RpX6_8S2wVUZk%uQwIW!eC5`df$B;!Np$1Gq z7f2GK7_`ogIDvBzIOYqS-i~wL08>s5zI6J$)Pw2g1*r$p&+}6MOg}G3Jsf)eZ|M24^z*#b2guo=pa1gH)wh}HJPT4! zhSC>=(wBtNUkatamVTa>dNcjJAoWHl|9heI4@2pzLg}rc^o=|R{&Mx#`Uw*gio8XG ziXbJsXS;LTN%&L-73iJ2@zU#Q%HP5IDIO%YaEmrr;|nj%1|amHU59LGSJD|cOJ zk_tfFGH1ysI5$s=ocpt++Rb0{w2Q7v+(|2+z3s~9&RY2#tPG`D>z|ceGUk>)JAm;+ zUv3V0FN63j`loKiUvLYt*{(l-;n%bNH60UDnqC4XpE||{L0o{g107S@-@gb$AfQ(tKR|(n zo2@pmjcQM#TIedM?{kOm4v<~hb^@y);C<_i(9PDBrZ+fD-g6^iP-ClxaWQ~;@VfoNr zevlt-AnORnZ(ed}k)0>ZbMH?TB3V+maGO&2{knGVm$!RAyWRVV?cR@W_x_7^?|Zj< z-=W=mr*`jaS7-O{gUox8`u|eq-HvB&yZ8U3->rTyFMYPTEgE>`6{6+>G&-QiZ_GVE zF#pk52tTC$9lm#d8!LkT;fCGD&uY0#cK4mIO?8p+3E{h&p%2u9RW}tFSyfn-IulO$ z%EK2P*Dd!ydWj0ohE)-TUO0U>6R)Vd^rPh=@l*P}d&~TQzbc1m{RD>9vQ7Ui#Zpq% zY!_}mR?t~!MP#L(g{GV~D~O=y=Gq?aRs>CowAldpRFaWT8chFSC1(`4l%&%-6WO*> znmM>>{@l0GlE-79w{mZ$7w2Bx!@ML0d-)5Totc-fR5#5xkG&eT4r$lK0WPkF@WLDwqza{$oZ*9`lIBuDEYxIA!@DnmPUFWQ<^$P>CQeN2FZ%s^{mvsiW7UfHGk7#P|U(fu;$Z9 zul&_3uJPu4+(k=Id3evYypAFf{su88)(U*@$%l2xVNJPL@x))|)KQyOf}IK zt*PM9D?2@D=G$k?VYC)8P2cL9?Od5)Z|75D+A>cw7ARzqf9S5Qg5#w6(N&6H4`8%{ zH^v>LMNorzg1IeUj<8=`u)UYiQEVJ98@FN6CmJSHy`ZqXo&KUBU^k9;KGpf%^~}`u!f}EP;`|0QFC@{~e4~e{C%7>IG`zq3t*`rzn5PH7dw(bDppV+Gu 
z&aoD*nDt%7tNy&i2nW{0H9!<^jOMBNUM_RTZsrk8Ldx``XHYQ>E6QSwS|pmqdbQ-_ z>rbOhN)^1~qrjh`vaD))GJT+mXMQ1H9T%B>s`;=cDC(&{wFOQfJs=ppSY%FAF1p&p zul~%wbf-FTF$Y3(zTyT_M4v?v5yY!`eSEHweiiBvJ8K-pl$wKW8kmTGWp-dY4c$41 z?r?F1P#n-SV+guWhM$okfcx?c33DP_K|3NI^QoQzk)1h_8#Zz1>`%uKP2Cw>S&&BjpwC%Wfb*pXW(nA6+whWwf zvfO86t~06SSazS0Ehnf^G4e?LH^z9!D3KVjee}hE22X|^$q*neEe0xdmbNbjrYWDa#Q7iE8fRJXq{V<| zKDS_lt6={M5f0o{#7mb7s=t-o0c)v6bB;xoBc_m5APFc)-m(BJ zIr=nmP=9+yXS<}FqL%K~zqEjG%a+=+WZTEeSBzU7+iOFt%*6Vb+7H=Uo}XLycETeV ztERgSI@(EblZhzz+V=R_LEOc9R%`wKxplL=M1hw$k&^{_Pd$*-v+Wr$_+HI9_$%CHLT0Ic$3lHdc!ULfp za@=Nf)L!P*54@X45&SLu*4D_x2AcZ?srfCA5HF0vP2Q0yWf9uv_?qt!XacN zu$cE>L`NhKTB-GVMLXKNQJ-8PA|AMa?y)BEPpl%Oh#3X{tBHfH&hJ1S{QEsJ)w^?= z(ZN*1GnWR7(i=tdd~Y)c$Ox8&kr$_7^EPlyJ;4Xx{E8D_9)mr>p5#2$* zc1iaugn#PJ;6KCtMOoah-8nPzu4ZxE$NnWYf@Y0540$T-Z-$-7ak$MuXe!SBDssYy+k`DJ51>DcHyMZLdn}_*-vH1UStwn>J zyJ1q{pACGVFz*?XVefy~ub+gl5AVVK`&cbI@NIemq8DHViP6UikknNrt?u)(Cip%t z6V%#f9tWN&b{WC46**?tx6}BfCDWaPDY1MOKNGawW_nUCz^9!oS_{;*!li$b7PWci zG+~P#yv(mqdzv_Pb+*6!yYY1gE6_f^Zqi~jm0gde8p78pHiyNRmaB!YVf?1?bv!Rw z_}b~`KO(-~&_XYr`W&VT;cF$y;OklYbxV9*fvAHq52PpHt2ef>@b$dJHFe!40o?(< zwgW>MeEmb1wfuKAEt%6P*a5zN{nq!v*FVUm9_&La!q-2N^*jn+wfv^>btx}d_!?FGBjRi0;jUAau2TVj{e(urSBw3+CBEiK>m&wV zOHaVpdDb{vZFW~G1N>M5wF7*e`eqnk16&F8wmD zuff|7!`B;liF|y$Acql^{T#6o!q*>d4vVjQc@(~G5_*!kW^-74&Erw{dW7FJ zzW&8a7QVXf^&{e|oZ2vrFZi%mRq<0bnccUlZD-U~B_?qoYBiWlB z;cMe-VSKf~2M%9vVC*4 zoFRO@jw^}fe%Tuz@+f@0!fzU1FY=Owuao!u5%F~@wX0KQu2X?Lyntlzb*%lmCBFVE zI41^PL{GriaO_LrtA~TWc~w%b1OAG>8phWEC%=$~8?jl;R0?*0ue)FQKKN=Omqv3A zt%$!`z}pYQS2_Lg@pZ7ANmRBIVk3mF^=NA!U!BPze0@n$8eeO8$->v)cK;Fabv3oC zQQTRH;mqzsjTr@^BYg zGOK`02l)E@rSF5U0o1P1{Fh(hYXJQA!|-(~t|A{_$H{9%Wup)qA$;v@b6Eb`hezS7 z7r$wIMR>`=*H^p#i1@lgo-#VM4BiOg>+dolk_RR1*DdjN5_Ai`y6GTBtsw#H)`qr053nUjiz*qUwFuqQ67zSUb(vrE6f*s)NcQ1Y)e7!|3jpig; z5x(96Z$AuQcc6=XeBC5B5|wpDY=rQ2ip^p1bq3u@S=8 zDK>}WFCHy_@teli@w{Z=>#*KGBEH^`i-Jz|be#(1;YyOlU-s*k`1(Ea27JZn3Ha)V 
znJE7H%$G)wN?LY+uRhO)@%5}Q>--zEWUi%P2l$%w%=f|9v*glf2GNS}^(^F_~hkdpLrTO4tV~A63o%T;f14WJN?FWuFppg=yK}U3~Wm zOMkj&FTSRd{ptnkI(?xZZkdbt7uB_42{1XuI`CLuQ7wvRp(nmSjN32S2SP z1M`Vxrru}m0CE!jM(c;Q74D1e#gkod>Sp%#3HwjfE;I+OafoU4ym zK>Ftmq@%62IOTxTkdEv|M-t;%OFn|r4%%wBYpov&jU{+XbmL&BVnhS3SZ@(e{I%5- zL0=Uf7@am;O=k^hsD|W6%~|}JW8vfU0##)bYi`d8?(A$?@@l7?Svv}h$$rCMV?iQ0 z@POAlv3^b0&6kYC;y%70N@PsFIaH0&=mGW@djw+k*J~68SdoP{mgy4R8k_4*;6UJ^ z1Xmy2{M+$7_BeTCC*lDPSv;0Vn?v$ycpE!vtrt14Iqgr@(x8^zdfAh-B8__}V4{A0 zML>_b*=_PqZ4^cC(?6lp=XANw9UkTA)i%g~#}}&ANa9+p2ioxt9P%-PTLn1mdtzvT zJ@1KMi2LfvE$>)(5!F#Yp&%!6TXX6%@Q6R36L&mat4zGuJOa#abO7s4WlB3@fmU*9 z>4r|Uq%mH;!HqF8xs|LM=V($QlRx0ij#U9+B`z%F-III_rr6cu`bB#zs@nF77H8U< z`jKmIl?>9ysY-v%f>xnd{!An}9~Zy(IF;ycPc=cWAq{V@XT173xn9{bUY}>Q3R`5( zPd9pVW?djW&O#(cFwSDX-C`Pu%xDhveeZNNo~@?Ab_pVS_e0>^fT9qbflZGo>mf9t zlN0$zJA-U4xwG(rnP{OL(Ryr>iG7@Y$#277j>ou|i$-(6Y(}pu$3hZ0b0Y16aoW zJR}G#5VJ)Ap(m!#iQzl>x6NLcttg@%im9Yz63`>`7n`wM%dD83x8Eg0~_o&`q zQLnyU2ge8cp(B2-4A`s3)OA8!2_ZkjdOu*F4_-F*Y@zf9`*z6ED{H~Puw`K2DpYi^ zUS|1s2Tsz$sCe>&E&11T{1|EaFFdYj9$04oW)a5z)I0Ve$-D5fJR%@VK9F#c=l5Yq4?Olt!v}mBs>0baTA03M$lRt;ts6Ov^Bu6HH$XkZ) zgeC-&?n49w?Rbha=MQnKQ#7mu@e;bW>7#hm&QE>JfBw2= z7;Yn=^FR`u`<{c_xxfQACY03 z-_6Ldzw=D(yQubs^xpiM{{s2-qX@E*v@Z$JG&c}r$N4E&l9I9iwjsar>dx-Se#yJ* zO&tyuGGE*ZJeDGNtJWSZgW}ED7IH1R6}ff=;wf#%tR>nUOJ=d)+2eeUJ% z9nS#m+2;V-@16o^2lAf}G-tAUmkF2~q+p=HpRqxQbcJ@kOZL~b@;2oypDReEM+&5cwo+iqzJ6u0C5z`n_OBGcZ1etW0W9_90%TjT3+ zOTbQ%+b-;y?!uR_thJW{c``C2zutb!_5VO?rORq*Y|hv*h(c?{{9D2z+lo;mTQOq( z%60}f@^O?gRTeBgo}!fiVi%~zFV4X;5MjU|*US>r^vDCoFdK zQwETd5wB@_s>!mCJ>Ndg=+oHhj6NMQ(aVOK}8q@89!_+I$1G zNf_zDZ(6mvd!tP~mq)6&46o|&QV*c*->uYKI<)8|%Y;J@5H zOP`K`i8RO=Hsz4CJ{@5hdf))0oB0#kCWCPCFUGyK-$z1q0X4q8+^2tR>p%&Oq}Gr= zP5Jt?lH|zbwr0?tr=XBhukv3=zXovJ5E@q(e_UOFZAia1`qd#U4_SQW-tb`rfSXIO zFakL@@LP(S(@Al1>ySK;LStXC<3&sMXA~F_{Lk4kj}EZ|=q@54F%`8vlRoZH-xiVu z?eLG+Ga}&>p6SXRYWNB|Y<|jrfjBtMi39&NEfq>Sm>HD%unvPcLN6s14o8qmD)cws zR=JdYNeRsd*mw1<93HX6Z?<%JY|KqNJmg{8rE(&ZxtvHNCrad4w}-6tR~}Up<2>`V 
z+DcTlb9?mX3`yVZF&wu_mXF|4X;-@W)JQ5Ma5vRNU4c9anQB|}gB-+oTJA-!Me$EW z*@I9}$pMK)&J}VNoEfF90_zL;1+}J7vTf#p8JYS8UPjM}klOyOy}q11d9t-Idfit&UC(t)|rSkTgt?n=>p8UKP%;tUS~Ci(>x0ZHv0` zS;v;<(%I(VC{Y^CO`XBmr)1!%FyHLSWL2|yXAtnbNq(hmoGbbW8B>y_=2yT~P-Ob1 zan=&`zuwGY=E;z@10>U}?@2T=-KTE5ZEK5-+;ZS$yn6V_!bHLhT_Uc2et|rdp$*h3 zm3I3bDMz9O zZ&d}S-BCqeEA_<^=Z$dQzbZCIeN=n6X^ z|K04)L*>9+&ck}$vWPvH)t$J?wx}|8VOE7>Vyg_tlqRYQqlrq6Q3mn=E?&37OH|YA zcLDiwCCOOUZC>v|GaS-ttYjRw+`OlU?>Mb>4P*65jlyPnP)_G(i@=VDu`{YKB;V|g z|JP+lmt@&Jh@xC+$aG(W7-)OOkEDrQ$nJF4qit()J!HA)Pbxj&+zvfxP7tDL1dDIY#vA+bTtiVi&5DuEjraE%!bD} zI;WJ6QvMkC^Av0>e|e5ciUU};DdF1!wOJHtSSE(m7CzYm%GzI9wKb}U?Q93E?u>oM z2TJwNSyYwJAwOiVvb#8m-^aX&^sFj~G<={!J*$gUu+-!H^oEa>Gm*D(W9J-u7?Br0 z=$-E9wncew@}s|iv&}lXtd1t~PLk^`F%AWFA(SvkD*(+@;Kg@;P-X2xKxbr1q@j#{ zCx+(ZW#jA>7D+^7q8EI4q_R|t<<^)X1rv)9j%zQar z1A=B1@?YU4v0MS>iA!aGF#EB4J`lL}o-jT{^?~dK=GevJEo~1EBO5s798p=K2xQ#`KDM%LH`>Z{qI^osC%x3<`;ZU zGee3+JMOU7H&BuKJ8$NLe+IVFBJ*?GID%ixTQ%l4*dUg%;`4%S$Maf_GX*kDEU~$8 z@)E+yyE7(1wLxP;rgXRBw1Y9OdB`nY-ES)zSCXS6Fvh8x1#|`!?;*+%u~@R8KcGI( zjL=6*uC;9chiqN)??`^ivBes5D^;h?0(XAD?F2e_J}s1ft^eHeNk+b<_osBk{|mRv z@c%PBZ^8fTe?4Y=F@;_kxe*BNX$nZb(wH5w< zt|R`}2gm>OGyMM#>gbUFAJoU*z0`5|pRrp0hsM7b|9|Af}n zmr!rd|BT78}cFzhf|jvJq_C5q!5L!HlcH*q0T;Zi8TC`FVwH{c|4l= zf`JH1t%nX>;{E~b>^k)IJ!GS8>~iTWIBQ`+e8>hVboB0!)-Ae(UCC7AQ?oxJM18I| zdG#AQMXvw53X{M@Htj(=hHhb@jSeiAj6LP|kpnez%yc*m_Z|?TW_8nctnoKU+>Ct_ z{{Uc_Jk66Pp|7>!zKDauHBf~@h?YiMUgNSN;=kZQFM;zzZgy9v<*K^Fu(CPqkB{2w zu2U8~-dgJ~&IeZmdE|tgSn@n;1`@j9lXh^yu+{SE)cjO;*TuoIziPGb+V)3q)MRWl zu=o3E_i$={ZEY>rq0D4dUDF44?y^$yYqV@;vcSOFO&77p%)|W{M}IP=?LUWe9Dq5d z2Ayh`797%31>_6EOH)dY5 za}pZQ;n$w+YhJK2m-&IvD;1H7(E;7iwX|C;vQPrGBZQ#>G=Zt<0LIgw~!VkfwXv@QPqz(OpZ=&^>-VlWk)`;qlmV$WXB3~9aQTO zm+1Y;XPt8dG6BYvnTK7Pk&7# zQ^fd|H3&a7SExDOzlO##AhiB-PTz=cumN2_@ak-ND+o@IynUI@)K3ru#{$7ud6V?5 zUsD9wVu^j{uac%r7aL(Cs%Flb7`vf@O zbW`M#&$-&>+|LB)HRg1Fs}qOC;)u~``Mb5-bEW%`{M;C4#G)lXHAnD0jG!BTsli#S zWe}v&4nZRpQCNKywa2gQSc{#D>{9#RpcuqOqtjF#X`Ey$m_H^o@kV4>Jv0vbXnES| 
zp>q)lNq6#v>H0fr(cXTW?LFjO)7-$In>9lxiF0UFkK0L`^|UHa_B-%VS%L$vA<2xu z776%C;4|y*2EUi{Um)N51`h**hf9KoA^wACAhJr`5XI%`iIi_!^t;B~2h(+EN;dt) zePlecML2QhIf10q&NDCMW6P7Oz0A(W)|e$I2Gw~2b^7!b_(PxJ1pUe+rjhL$MM<(QkZ8bi=i*Tf(@0F+)ZQe~UEeyc9c9 zyQALxf#8f;_sHl-_PgMrPGD=jxm=?H7=;`fd?nw-(JfsfGtO+4ppDFUHqxl`;OfUO zm(*PmX?%k>OP|QwQfQ;HjXTCNw71k0_-u@D=v`~RQ{s*e)1f`|p<|P3y?I>*0}#Db zY6!@OMDCS=oUk6CQB%s=QXit$Gxm|I9J#U9tU!de{3|VgSD1eRf{grKYW@;_@3G`z z$T5ij`==$x1O5d$o=PtB<)7(NGH-7Zpxo>EJnOFxzCZbg&AOAa-ZU*kqdNo?xo_~$ z*Mi?{JdjaYs$8Qd#<}x78t;1Zo>ii2%-BO+dK)C~$}I46env`08^Cv}+ujHN@)Ke! z-3Q-i-WK@CnYx|4*E z>8f7Gl~2^Kt?o@d_&3i_bI;V^?x?a3#P}NB|GYw{>j$h5wWrtdPi52V3(#+xULn=I-4}%D(~=+b+3yrM z25J!jpN_jxB`kef0){>Z{=reW2>94<9Qj%Lv^7AV{XYKM(&zj%VREvM2#{mkeTw9o zdi0?qQY`G^KLs@T6PUk4n(Y1K(&W1!@_(brCp)6a8-1F*N;G+Eh$cN~lJmhsCxLB~ zCL`FoUZOos9@&m2t$k=2z|rN1f|Lo<0 z>1q%W&UDglF2wMxzOS$(s$&&dEZ}TKrM>9r=Q($oeT+***8VY|nB{O(Kru9g6yrj2 zD|6%zu_7!+g05vievOd)stL=li8hyqq>CZg5MCol&`t6qo#e7(@g>NSXY+vj}_KyUTK{Sy~IdjRyZSovkUUPY*=T*>0JJ~5grF!){&HtUw zuUTtNwuWN$E+eqYAPynJ7g&;400YV^NVP!}dY@W6kqijP(iFyRo9P1}C5C)CrTLfW z;TAYDL=QV|2;e0ikwnbk2o~;9PNl;8eamH_8*?x z6TxWG)b<#Q`H9NWjuK3(BqicOu+31w5jrW46rTco`_9sVBv64pDl=^#l72e$Dq_F} zZZo?9$`COmttIxUX}n#QB6=>ZuR4Ea>B0I^95o;`Hh&P@86rfz@EwHEi{e>?AReNO zWW&r)d|C*Ipq)IB3@~5(KEU}K=-Gh%vvIy2F#I2J{+zACD{``7>o|7?foQ2c&YeiW zR1nUc6v)Q;XrJ-NSbF;&IR6HVdP|)5Z;$gF+PjvkQhHuhkm?Fv!}#VLThdEYzYx9; z1dJhk-$nIe{@ec09^X3>zKK4_#`gq<)S~jl2Rf$r@r^&t%72UR z&bI{kPNBd4e}nJt(rsJt`2UXYBYu~K?}s&w1K;=E+8*EgfD~W<-yO(%A?fv5T0el- zFaKvYuP6JBSwmkZtulPivIV4%VC%i8Q6w-@vfpvLwEzCo9k;LG`Gsj#^3PL4*qM39 zQ6ofQwLZVED!tXG4z?=k^|io`d`+}cVee_vQ;k`6b3ieVq2U6Gp>?Yj##pV;Bd{l( zM7IXQXv@hNt>C68PBGkAJW+!|H)MF#R`iOJeH%gQ+NuX;Hxqdd&2Fp;VRq9*vOl|V zXM6`@;Uzx!3S+9(9MuOg(snIOJxDy!(6&Y30 z7qKC1rBy|Re~wh+P$9ks7G|T zupeTkfJVac4yVCbpH^?qhR#~qXs{t+jm_?^y7;%8B<@A3mf~5=UFyZxMsArC>$BEt zToP&g1D&-|6_4^z_Es6;gx#w9tm16$o%2Nu%rd&{d~CAIetKC9#mtfH@ETv5uphx; z6OOKK{Hkt$hRCV(@iPh{J%?)*Dtr#<6djSf{6@QXcLKXs>+smEkM;S=G?3P^U=-gu 
zd6Dh6ZDwyYCH2)#Z&|K5kta3)8~!G+YU#y$6`P~^?j_XHA}@DdiY;Jh(42Jidc-SV z>_vvo@shhOU@$o?iGjmyo}ZPy2`1K7#Oeb8D@&~2PC{9;SqYuV<$1XfieRb`P@TwmPolEaEcSD(&^CZXXTm67 zds#l!N0H=EN^QOk8skeWF`8f}-8alykq*2s^=7L!{+i`TEsAlaRo|30zz6-+Wo`Yv z0mLxXR?0P5gl8QHuuzQ8o#XN66-3M5sl7l;`IXodc9G00H{%w`zjp{sm8qcaX?s^@ zo7o@*Z|frClWFSTQ?5S_6Q5V$?X}o#T3zh*S*$8c%p--kPs-n^`-2KfU~)*g1%f=h zX|P1eD{|;n>>%?`CdA_9!)%N0jPA5}j5M;sp(`l_c_dbteA4%ed5IkpgIJvgR#YSc zVzFM6pqxd6#8f%7*x!uuF<^Hivcg>I4LIn;mxP0#GJ%dIE(Yvp;0$3Z2?k4TaX-#3 z7B^m8d+_=!w}F6BG88BSk?r1`O?j12VB~{ z8y$+}k((3p%jeXs68G1CBF0{)_M=IC%!LI0-Q}ij-!e6tjh7!?UNvfH5t(#`iS`=D?^(?JL8tx)X;)?wG&0QmnB40=H zA1)QR^(~9!E%k`7inN#Z;J8SX^QCrznzY0eg|Z9kZjnc!NJ{PuSrQ`6Sb6KXWQa6b zjTd1CYxhMW(ewt364P51mcQe+mO2;qFZOV>w+X$On!qx)lOv$Fgihw9Y?#B8`ptQf z#vR4lFfLG0!;$1j2^MwQtZJe(J9H^MQN^@CQN!r9TrlNIWJq zk}vw!9YD)lbnI_;1|93Gjy>x-R_Hobs*Vk& zW42D!cYmn9|8A|mlhgG*qWbK-FxhYM9d@AK-cNmBB8tqO*Mr^Uz_*ofD5PMOFE>Z* z96S(TjXI7mu>9Uqr4=)P+fXdfYhA3TY%Ket(;jLXL@Z3hJoM+tbbRJWyKZ(U9|N&r zW?+7c?)DJ<1rk8wfam_yos;Hz%N`z%l|oV@+Xo%9n^WjmOLvztFV)2Nbf7~~5a`fu z`+7;cPzMpk1v@(0Dl}7e1*K<*)V_P1qwBw8oX7G}#IPfhND^rP1nc*)B^o>o+eQmr zzpI7RbTZ%0aTg71ieow zKqyH6Hk8h}2if*ppwHRY!t$p*%ESw4a!`!*l&G{ZPLeXvM=ST63hsyBZZU347+Me;kk|QnI zGeD@sE^4y(&z`j_t0mNrP$XK-8ErB>6w7k*qRa|^5T{lo5CnoXh*P^x5(EzfqqtJHl<76mr8qOl%)oVW z9|ab*y|&e_WB6+BWay5nLRT~q5B{lHhKN#!&qtStwm+8yA^mF0S)9kpyIXG$-524n zxD&pRcE>CAd1YWkm(|#8@~=ojeP>qiIyNmjy{U@Y2o7vZ_d}9%^746RaS^9 zy)3F+PLR#P)}WBZ#S(dS+(hGVc=02E%gL5Hy%>U|B3Ax&q(MiNq=C>Jof*(ZTImP21z!fcu-n^CSx55b(B#RR~{RW(H z6_X{MCuAbM94mxFHEsB2{ru5c>`;-DA!?{ZM^Vf8ZKjf6@zH~DocQQ6DPMshUmB1PQ+!t3w)~d#mfkKBSQ9lH5J@HVn{p!! 
zpMf7Q;kCNk%obI&D#N0R4f?@eZ~U0=Rb`)v2_^Z>XJ|*ub5;wiFrR%cj3Cm*bZCKb z*y(FA`#aZ95Wl@f@vd$!x)s(aB6$lX zaRze~XH$H^UOiD}MomN-|3!&*s=`aufNZJlY-Fb~Jb)T2;|nWit5>OfDB9ZnzJ3{q zY;U<1O+h0ef>20D>tsyI{pqKWF zH@0f{Q&*FmcvNM`BomG@fiE)NXvpIihHdk`d{*iQ>>XD}jWm#QP|QLbjob0VqcW69Y$N{r85 z!P_Zl7&a`=wnf>qycb}ermd!6Y0+c{{aMghk19dl8z z=|Wbi}7-p+BiO6D@97tzNpS33zgFLfJ$2@;PE5~qa{ zPYDubp9asr4?W)<%KQtTk0XtmoCS=^>}s~?&g^D+@l*4FHSE`zY5c;I`YtS)Z%`d_ zn{Kd(;p~wrGnG90Hs7LX9F$o#U7m;@u+4PL`dst( zD}=mp6ZtYlU)c7`gT74ROAC72!rhFgbvx(H5C?inLk?s;%Y{VYJ5RNj*eN9Ioo6*M z1W};5gWFLA-vb3XNIf}7D(oZ*)6W!Q4$7lF(ux}eiwroZcsAw?%Dh5F0t1%4$r59U zx{Ui2S7}k&SrUu4I)90*(u}usvg(!frIVOq$$=v#!m8z8)Rm`xN`j-`7!{RdT!4`V z?H`E654K*u`i1${h>kbG=HExf0GY|E7<48!u6@1t8v+9xeOYbpYSRm!@S) z$brLeUQ5L`JZ7(4Dm-=Idwk6Qm~X~e3-dj5U#R`XK}XW=IsdBYc`c`>U1+6&M-fK{ zh}GYo?L;jA6D|LYxHEeghtT2m+3Yr3_f;1xZSCI;vb==yEz5j)lCd9AJCb5CxSU>1%cFD|S(bU?Ck;bJIzz9@Ns^c4&0pMQv9CNV0 z+FIlcO3GE9fK;gDrb>jGH7WTfEvP!(o`m>3)B*>2>Y?c#cq71Sq7k6o=mTsI1BD{{N26<_wh}E1!oapEjT@y{Q5u&d_tCB z$IfpjS^i!^lKC6#yo)6+#dPeUVevZb!tsEv3L8}vR(N*Ni;$k}`#D6mUb~vfk zi;tVq&eFDCm9tqimHxgL$y!zR71sCkSaRGHf$*WR8f>E~*3>UiW8%YS+CrVQMw;;8 z{Oajcx`H0yi81<7T{qfE+ZB{UhkRRkWzWN(%MwC0idWdal0tSgZax8MeIyJ9))) zbJzJ|LZFs7<8~R?4id3GU)y;w9$XlIv|y7QGY!rYJDT$;6L~DDs&!b<@abh=-OzZM zG9`x3EVHxI$YXbDBo#M23VXCQJy*8mhNMJ7tj|kwMpfJ}Ti(%FqEU5M#^*G>Ly0qa zj>cb(uWfpFaQur{vaTuqT6`07q3PYojJpKs6Jzl=HTfMC@5`a~uQi=s+zHvXv9fJA z>V4dFTq{K)W$v1+YnJqB8*M3hN{~2Bi7Rd5T@K^Hq;#68=SFzQxd~sX`42NvPT&Hd zo}G>kZ{nDi>24xA&r0d4=~*&|?d?7d`M9UJ@Da>Ehtx^vtT*p&R)@0YpL0}S_WZMc z({`K#PkQ|(wzl1;3UFFw3I2Vkq* zAX#>wm?YV62sL7w^tc&Wb^+v(G5IME#oO{>idXZ$M(8y)iYy%6=cYQ_S=>a8fpX1$ zxn{lll&x}oY&j9!!bTBcZfaMRn^85rsZr#{)4!VakW6L)zh{jum5Z1p{R|D zE}kZ>G(D@Lw4mC`%H@4}1O%{NQDy)bRg#KwE)Ihh|2H!CIVoTSb4vueVdb{kc8_t+m8DA86 z%raWjBJ+gR?p00n-}iM+L$C`jX39BI2Joun_>#zDRSh>JFXcm7TiI8LY}woh@QNGi zCJkaP`_NIlTT_1lvzE&c7p)VLK6%VpT0{kR8cJaM6=iQLqSRuO3YTa)7myX%@V zdf-SmfV?;ZI2(j2{-*}2wQvhlty=vxR&G2C08wdJ%Dv>SSg5M1cjl{-TrXJ6x8L>wo4M zJyv@;{l5pGn1TEk2)`?W2Yb*nL7u536l 
zhilF>2jDU0VVc=nQzfz-6UwrK#=V^;O3>=vU&$w73w^c57P{Sh#zRZ9YD`_oz?J>Y zi(EaOsK>=uH@;L`fQYQam^-|znWHv{$MceT@87L?T{lujW%RYG>LCA|ADI8{ADDkq zI{%dH{>*FlE)$UWGA}WqP_m?ce*U~Efj>w6|DpMxDguwGm(C2}<=*GEd;d?n_j}Us z(t2@SIF!#31pMpw*x!L{`TE+P+!8hMsQqhum^zTc!Gc|4$#ZOn6&|W{P+j~5=aF!< z%{SCg5Ym^pHVM}~?Of2Q74Ee6tnMcaxqup|z)U@hUT45%Oxypq;mp|)8hP;gLtadw74bjlgBCk)&Oz3HXH|cBrOo;FRkS0pyOpH{| z6VSQoXw;NSDr{u~eV+~T=31}mhCDC6?j)%PTYY15mDqd_X6Gx+&c~dAu=PmeMU2ynuZ;Ei!1a7nr`j>ZwRWyOi2rkIap119H~27w z#eeHe{jU4Q9Am4GG!CPJmN#sAV)y&VqSkwu1N_hZ_^jas1$&sD{uiR=?3b2*Q1?X? zY;bkF=wn9WR+X)~ z%d>GI5oldhugtO4gDU$ z;>9~_lW%%F9ErA4Lpp`tD>atbr)7%Sg<&wR9W<`p`4W~mzI^kM1HoAN_(!IV+46Dt zZ#s~V!CH~y1ACzNQCUz@EF`NzMRK^K6^H}(xK53QsYt`^vUw8lL0mFJK^C|=$WxW0 zV5*7HCti#_UPX++a0+Ig zp^x!#D`19JDl@9gtN*M>+$MpgSTFW6)tQ)VhO;u#k$5qfre5p6b9Hrf1%QlEa`XfIh$v9p?+<=@Q z$JFHOmJ^*kEPpT3a6E()#m5w7lm*og1%x}Ce8V6kp}SIq6_@N|3_k6E>KMWQ{UgZ} z_!^71RL8%pW~nas_~fvIa-wrS?pmF^ysbKSL5w|CiE8u!Gu(B)T>_=c7C(#s$^6^QEGqjP@FR(%n+7qQU*|8ZG>|B7r3@G~tS&Owq;C zC{8z{hebbK)AR#XC5NZw4}>a-k*3vHIZM5f`~3 zb#Kmxc@m_t+%Kdi%bPh1+va&E(vYt)qyKi3_KY+Nsv5Un&Hv2HD)bY+$ zK>L{FsXchp_@l`K_eg1nfO(1mhKwid_q0Bgqiuwh30dOKL)?|Gk2JWjrlCa&i4)lu z`>atKOO7oSXArBVKqdk{TlSz{W{%a#OA5@58)S?wEE4GvcBOdKXyVKb)rm{P( zh9x4W_=jvL(8jeQb)&iuwhj^H)$zC0MMA@`Y>m$OCNIhviGp(F!md!@TalZd<|Udu zb4AOs8U8DR|7s`50^|BtW{D>i@r;1PHXCHu2d?zu*R*N^8Q&4J*jDJ(w-ha;s`^#A zwZL;wE7!cj$II*fMfnY4C`66|bi0T*;_sSa4AhhL^^Q$vd8#P~@*wY)pDTa}T9MuhM>;X1*zpCAm97EC+MjH0Es5k}Tc}O&V7W0vz z1p-0C4y4Bb7NfMA2~hzKMA%r!iz2JPBMtjOPquPl*K*JM$HS}^&5gV8(eF3ydm(m8 zQ5VX6M2MGse(5YTA0PpM)TVTK5cvgLcy|)U+F>$`JR#F=z1I+KRzwD1uyAR7tCk}% zzs{{aUJM7Bycxq9{n*+ZqKL$v8a)Q?t~oI_9TcSqX3EkZhY}N3bX&O^>Kd{9Ub(KEp>PJ!-3w9UK3|^saW2mmaGBk|!DZA>am0jlFwS ze#>3~Us`v9#6Yxs{`hWOS&eO5CR(gcj8eC&RY9x)=2l0%A8y=lPL4`Mnp1Ksu2tj*+j zLg{@fxJ#h(so-gzHP0Y_TdFfeXX(MhT2bJMQ&xD1XYeD`7Dbxy zhtiZt!t6_q#^$RyURdErP*|b4{8&7m7}T}_koJ`ssT2iP$}?dN`Wh<;mXjWH*N#&V z85la>TP846&AFu2DC1UUC6R{1q)R|~MKsZcnJ!kAa&4#vTFfng?s`Qb`GT5Trj^Ar 
zzpzARhwQa?ER~^6#jQ2UHl~_2D_%SRz(E4cZOVwxqIHqKGK^M$m?r^e&sj2W zf8hp5g=^fH#@I1+VN8NU1!Ijg{Myp4I$FQV3BQkyA?PfvV|Uz%X7yX_L|!=5*1Pkq6es9zq{@mQOH9lv z;=Jd7uP;y9Bg#-HD2@s;SOIn-nq_b=Bw5}HWwIq3Y)11h&=-rpKWmfNG1;$1qe>3E zn7fJ0D<`^z5tt}-3{y(s!HX|Q9Vah)W_oD-Ni`#|eYfzHncqQR-}Rj(*0w2u{VZ?w zNw~r5SB(xK#%^`2|N3W<$+ua1x%cw~IdBE$pWK!3;FE6OI-ULwUL^%kkt1ptk*3B- zW7j-}#`d?g$68wo!v3~ozX#5xd>6{M%vF4Azu)WqOAc0U`Hy|SM(-c#{T};%hTdP% z`z`i;u-@nE{W|-8nBE`ZJ*Bm@K;I_&9q>5mQU&?aVCcP^V8v^->P)DCzO8pXwt7$`r%pWdn^51rSF-Q{>UiO zV@mIqmA+&c>3x;H29cJ5|K%FjZc2YWEB%SHN$;Zcr?b)zQ~IhD>ADXsQ_poue?{px zlU^BLUOC6?iP_v#0*%&xeNN>1>G0E}uRq1Uj9h;KiIwqp+!Cnw)Ul^t-~Y7*7x!hl zHtFlgemOZ~4^Ev*mcU-B-$eN zwDB+8|2@_oYEUpJ*|&7oK}e~=hzE8a$UKX;Ro1U&(#s6ebVsrchDW+rMrJJbI?JR< z^m{tVB8OX_qN~yKU){5@$7_kZ%`|{}d%QX# z5=n*c#!Ikr!a<>0rpjD<(DVn_%y6Ny*^8lvoK(eB$SG9YJVd6sHKGcT^d$f(JWPkY5yn$@Sf@Ef4~N| zA6V&lG{=0+8DjRP;8J^4Y>^gx5_t=5=$sSTiGvm*J7MQDSy^ESVthLH1o2RYBP^@UkbnNflFqjCdvcyH+WFFd>T6+jnDUfN%_M*Lf* z11FjDutf0qc?4J^&6p|QH-?|;@=t2_h{eB<9%C5w^G2H+E3hca->zd;`KYoiY(%@| z-~+QiGxFEZ?LDa>(0d6v|YJ*WvBWNI#0xLGx;1T=0sgZG`p?#3+ibY9Nn zoxAF3fyZS&c{w)sdE_SAyke#}olLorn>ty|uI!8Tv3{Y)^rx#Vs#l|lL-gBDJ@6;sxNUHjhRx#uwSX^FIyB7r64B zA~zkHE`POO9(TC`BKx0^@)XlU@>>y_vjd(?BJ&qKCAq|c4^XlwHg6DqCO z^-q^MkYxjepxY_J?46J${=B_734%j1BsV85$;lC)K547)BC}G52KAbUsL%F6V_oDb z)hw*SGNwCj8rMTmDH|Qu1Z8oQL^$tosm9})uvME=snQmdZ1JeF?5k0jwquyrmc7Jo zsW!ltlb>VC$S28DW9Jt(U*>BW&V;s6bbpN5PDAGYlj}Q6kjO0n|J&owKa?UGsh)C{ z*F_)cZG?tAMmLye_Tejkjnc2mPV>BYL-9m)KfPr?($h_3Z9^liph>qu)eXfDD7)HG zPC6DUtK5h#ciw%bO3jTYk`XA^G8xTm3yx0>(s4e~J2bm%97)L987YO4K05G7JB@W8 zlfF-(rc52m#Poyj@~fg-+&TU<)t4)`p0vFsHR$$K*L_6y7}`MkTAABF((@5J6fC#$(`dw8?!Rb|<~;vpqFzSWmuri!GRNr&(Lcmb8DWm_pGKNr@FdB-zx`cr_O`$5 zPf`vjw#XFdiJ4lyO>v?pG!A*Zhc2ssfnZ-_!Th33qGSOXuUN7MRH=|yIU(mx}8 z3T6!VRUr|p?ExiqI6_u*#+Ld+M!cmC$D9cCY1vCp!@}636cKMLd+Ju3L1V`(dK&qj zV|sRm z3-S8ffOD9O5~+u6O@l=%RXIgd}KNfoaum3DF z=#fzRM_8vp{xv*j`tMe}ZD%y@lFZTh^FF-)vHdrlDtYB5i-#}xf%5+-P)th?6wKDl-5y9UWoPATe9YQNmd 
zm*BhzTIKskZp-4`L-rh_>`_)TK>CY!h*%mP)YR5zbE2&!|cQR3S2z2w zl#E^H>uc=4HwI&y1v0?U@O|5_mRUH9L!Y}2o^|6XqX*Y@9C%KtyT{|cYFJhT7I zFLAO0i*`WM^3 z>1*Nh2Y)80ncf}!vy_!x=-&~rti}(BP>T|#Ce8XXI@m;kSYzOIMCi8Sb1X&3ru zDc6zGnf@LAW%DJUH9OG3SrpjWe1*GF^Cc=6%+~~pIH^S;@ssNSBYsf56O`mGGJL~6;Qe!G+1_$kpp)-3+>fS5fZx|Shk=Pb`1RQeVYjZgFGCgmda z`>KFX2&0_{VOi(0hkY5*%Pwzv!q?2l)eE0E=9De&CT=y?nmXNg5&bN@@sj_?_V33K z{O*sel*!KHu-V2Ibe$}0tM@@ zj^~(bl3NoSWF&@}d0$L|K9=84_vp2ps#P?j?!IP}FOSwVG+`%Aq3VQ}&u!e%sB71~ zCCd;+Vi*!ju1|jEtZR)Puv;|M2P?cjcv;HFFJXS|@&)uF4;Q`bT4j{BYfYZB!d8-p0Bje+%6D|2QNX&Tl?evg_vt=y~L4z9O_o+MPJ0hBRzlbx+ z>IEZdg6;&7X(GKPiu12A%&NxdrIb%8;ETqZNqpx9!gJGwO}=z_ZS2rvPI3#{rDd#_ z`&1`|M1*~!No1_OSn{Q4(i{9-wLP!Ww#O{@BxZQkX!i#_ zm_=Q{+ms(4oqTpI=5onZnpmua_FwMwS*nRTpcm$myvBk?y;tJ%k+sXB2|XFoTx1JD ztp*3GqxK05kI53Lf19~MQRMVl=GPFcP0g}ujK+}OdfBB0)C)kzvRDOoP&QvP$$yxy zZNQ{iiRJm#i7!URPpl@SRgXtUCtt>%DaMM^V;27{Cl%4Ehn?(=d92^b-ju9e#7Rog z;8#^k?LsHfQdGk;F#?}Q?H-N{jq^VpjS9NdyLOVoJ5ncFI~U z&EF0{1fIYTpDIq{D-g`G#U{j`j`ViXOy0)1kYjVk8XEagPH!~4irx)M75))3mT2l4j8Cs%yP~mW z{k|_1FP6)NuU)Lr92GMQb0O}*^eQz*roQ#x0YDwMVIIj^U(>A93Lt8&nC_;5<9D9u zwMy4WzqHn@(sf2p%R9Zx&er|6h1I@lb>*Q~%#3KvTgW->;=1Bb#nl&%! 
z!wc`d(6rHV6joJ~A7%t)IY}$yeF300HpoIpPkn6D$*bG2md|F33_-n>7)^|mY~Z9p z{Z$^a+Ry_ZXURE(Z2N&e5uF)s(0HhO=)QnS@bwiz=Vc9)Z@+3F~I z_fZqMM=*uE0lbcO+QeYD+D4|c^eh+>{)g3tlP8mVA~wssm!y3dp6BQt9Bv=%(R{){ zX7hVk^Pg6BIW$0TXK}FsTR)yuiT|{2`oO&SI=o86a3_J;OJN+R279q)*q8l*F!*p2 zKi=U7h|5ARjRh*|;=+AcuDW4)k@^@DlNi-EUna|7KA@}7pt**HQaMkW%}e&#wd!6muUE2ijt-sa@yLM+D_Sq_cE;PwxU~ zGYfA~soM=WlMQMCYxCNnZQrLd|GbC#6wQU*w^SsN(7qhzBrG#C5*CUd1Xw7|8UA<` zKeg12^gTJ#|WzllTmagWu`PJy--%X-{A^)ev!3xuN|BDG<>vmO>?Z}LvD-woavek1c2S*K41LL9g7;TN^r z2JExg$0TM^pnskuZP>Doj=DwK!tB<}6U;Srz?$+$D#p%I)-~)_ISeDu-y;3Yp<<+> zW-h_;7R%#Ret+0M+N#1Z1hqG`!YvKsEL1m=3g<-{K4ulzGjnM+Hhk#lA7S+bij&z2 z*+`g0LcAA`JMf;z5?M4fn^!maZY~X~^j<$uQ#D&9u*TxChXMbDG&W<;MOR9QjmjP9 zBV|fYa{Wn`?(K;ait_8sp^S zq~;c2rMm+Oc>@mC3UHg-$6IG=(qSas+PRKfQ-`a@>ND}|sG4MdmcqJRbv`nlAEbSz zjlekOSFS1k!qHT|WJu(NHS9$-}3uz_#Oy&H$xpp`HF)29%Yuz+y{r>SrHV@$vFzIPB%7(Tob&|Q0qpXv>{)&p z`pWQJ|I{F?v;R9Y@h$1ca^lxz;+Ln}2W88y%!zN#39rw|GnepJ-JdwuZp$E9Wz1AM z-Jdvxdw@IKU-;MLnQMf{QahDr&MW)c@=SkbF)Po^_@2KHEBicDgYcy-Y%{V zVNe{ix9$$(S-q$D#X@kq_Z&+JZr|%kU*cfVs~#YTAhFt{4 zOs%2pu^OzAkv%GE>i#=|$49w>$AM4%AQU`q7grYw9=(IP*qP+mP zVPlv+)0$|sx%Tc%O&xNEa-D{hq2#g2Rv6ZHLI@hZ#DVQ`gP11fbS^)9E~?_!n+UQE z#g2D?ADGxXmCeMCEBYys_X$gtkSlh;Dr2~`Q?cVtM=Z999h*GY$LHz}gWbvxmlY!Q z&CEeY?9fXa-H9Fho7OB8JLsHsdbo8oD|%>PT}6+fx7%fOhOJ$`bGhTFd4b$kjP3xbK{}1Gj4Hi|!>~|-3?7Lfz+~Fa2%u*-DYeSi^@%KnWjdjLU zrE4Ls>Hn}94*^K>#p-Vmy<$iC;#kQStG&M!`&_@Ig!hryed(bk{Y56cJe`lk8cG%A zL0I{JmlL0t6YiT6uFMHf55oR)8QnRb-2UbA!hUo%$Cx&2&zCWz-RU#*i6vWso*ww* zhft&aI!K#7x$U?kc(dRU8f98#0YO7n+0-XqWyDB!N8wf33 zlxWXb4dO+i5ADHd?kJ$glA1P z?8l$2)uEsBsu}FZ7vo?*uIVdyFh4;{-|87@SX`hcbO(~FBja>YpJS|nbQ5^shLwB= z5HW~vG+kVRQGnknu-VAWb$$iLDaAXoFrR|B3)*xky2)+*ki`X9I zgURv!2|kj`1l_>#HVHW;m!Z9m{NfHOblj5*^t?}EYf)sz`}8%+O&maXYmdl`SJn08 zn~{OV-{;^A%krh^9{%g?b?Qr`q@ZWy^rex01wHIax8CuF2XGRt|E%7;qG$Z%Psowr zpW;I(j=6F^xj!NM_0h_|U4ygpl7bWkdvNOLT77=%&*cpyLh9%t1%83iM-|9J%OD3w zFvm#0I=#rd!?@JFh^pO`N&J>S@4Ak+m)R|OC4Cq!dD?WQIMD8B=niQAm1@xY<1VD? 
z-iCSW$V|TiwjePaw)SzBzSq-RRivE$^_U#IeibtmyjhgBJK`X)PSqPwnGW~PzLAEv zfaW{VAO6lr!I(1iT#@?E1mY=7>~Y$&k#K6%Rd@D?)R(GS_wjjDvhZYOafe1u2DW-I z8m+Wk$nb7qvg1+UP3#y-J607V|8f1Lwp~lpjv+{{Np?5~U^Be*t$|J-jy*QmP(L2h2F=9iGB&L zpMjj{jhzs#Eww9L59>v?5Ux2VVkvUq`pMp^B#vhkC1e&LyClPpD-QsBQ=!k3opl=& z=w{5w`@T7^!9+>y2T#=l?!jPi68puv4#XU3s8Sb z2QvCMj-d7*uC(G4_*rmoU-4|?#iB*lX3s!$T-KZcaC50@04+W>Xp8& z1D_UnS(`#fgb(yB&!@?C>7%S~breO9da0BCcJ>?R&`x?p_Iq%Xmh=}$lRm!Ys~{1g zD3&u{2Sx9vV z2$gu@BnC2G>*p5dbFjN`r6~x89uB;ceBoInS1LIeW^?*V)s>p+q%T&0yJ_DNyYMRg z^5(I8KHXoLejfcw7=Aw|ye237dJwKYYhmvnM_T+Fv^=Q8G2Q%FkGwN^XxW^y20WoXHDdO%3hja-v{qbJOO`-p$$vdP`R7L`yPjO3gA z@a6blAM1<9k@Rz@U6K~jT(I)HAFR-SI#qNc_qTh~FP0^)U7wDo=717#)A@EBw^sE6 zJEq|Eg+4yy_=Wq_SEZ?s`_vnvoB*lr%7#XMA`O4wUvoywTq5tzkv&2%p@0+eOsm9jN2XKI-;{Qa+=HHWVoq`yOu*)+05k zMao6*6+M`PKgRynBeR!9 zD?f_N)B`1r{c-=MMC+>XqZT$e%voC0WB%v3y+o(k8N@|Xgj6phR$jq5Aqqbi1>amFgwL=p!gXtXWNfzQP9 zB7t2WCdq&2aPaEReoc<2mRz5 z5nmUYPYieH{eJR5CErN0cLe`*V)%pmJ6-V09_+FZzZ?ftbdO)2=@BmK*TpYS|2YT0 zwEmbe{$Ir}_if*ni(i)D$o0RAU-n`U{{nv5``!Ol_+`T6|1J3C?LPk^etD1rS^To% zpWFV=@ykpr{6C9d4#;#+d*w^<%hoT#vE*j`3jDJCOA2+yFCQJ1#V_t4!!Nx~07G21 zjb`2AU*kb2#4nfl$s4Lk{+^#a1e_A$ms9-YRZ1R0vbTmocE&H&XUWPVn%p8c;A`X} z1_lQIAJ=^_UMMVeNs)8t5%N4%;w;^UrLO!5BtMKn0|w@>Gxg;Txsd~48>*V?%NKx> zpi1Un0+Y``-Zr;3-sc4%rG}01^CbH;DPIdd4E3AZXU*uIJhFtN25eHJsop2K{aa-s zDqn{87(K^fgo79Gagq8VQXz0HJ0a7zZ^>_z!Oa`&hSXCvi~*@JV-TQH{)e2cc1NMKKHy$E4DIH%IgTfvHVXEcs^>vBFw- z!68y#tKL++#!)?^C}I>&{--{@kcDfn9B&WmZ(%-c&oz%nD5hqo6+}Ze1a{pXj zW`yG=e`1pNn%6{Rx+c1-KVvCw z-84^(9@;DB9)~&l`!a)1zT_i2?yw*wiUMsMj+J6i-NCB)x z9(}l65ciLh4$K&S3hm?5kWBY1k(dm(6!qCN7)rQ-d~@MI0)VvO)l! 
zqpbLcv;(TK8C!#v7?01p{G>w1xyfepC5e;vJ0Lot}fdX`cz7 zWI@|lx?nwg_{LHiZy-%7BS5!WypLQ2mm+GGUQo49G^ef~I>~h6O|B|z?}cv0TR;!< zWcYqS!IntF;Tl)vw;~Od8gybMVi_*9elDhHk-_WW&FKhLX5`K9x&_;v{1=S}$4Y&c zQ&}3hX(?Q3(f&{3&>VdcTYjRGEGVz%p|tIq4)@{mR<-F4oiIYq!M1o)Z}Yi|fx3qb zPSVag4N~udo0SGMuRwz6FZIbUdn%5ArI)A(Pk+Msc*P^oD$>iu4*;d*kUdk`Dw$70mB zmRUC^q@&=6IWN1k2zL_utN)|f_l=D6z3b~}?x%4-3 z*FT!7Es74WMc4L})-;37cpwv7J*5glqdbU_{ww8XML_^F8_60;6w_X`(!-}W0bnQ4 zC*n3=zHERP@5q#<0UQDFO40!O6hx~reQNVtFg$9x|jCF0Y|() zga`E;FJHAMF3!tG>CEm3S{8TQG^Fqm+TKetQp=Y}6V-~byeT;Z9s z(p;zG=(y3*OowH{^IB~bJ&Z8Q7T!zHnxstFPkvcn49)w_$Jz0{n1O+n z>GueCGd|v|(StSqy7BhBh#GWpqLadOpB+BVN$tjZc4AOhkqHL1VqK)(fmd=Cvv2a*{WcPh^F~^^Frsm6Mv^3ME#&=-=*g z@|)X^2frTcB!`VoE?@U{^5yH^e16+&z2keybxgx*p7$DF`~ieFzDs+e8r;0!Xo@GA zzKRiV~mY*FNB+cA?Ot6mpV9@#m}@-ZR^N7wX4wMCNx$z4e@w z-%EPDQfNaxRgc@L(pX_ma~O!xG`LC)NPl9Ei^8xuFAGNLU3@C3peR+q{Fu`e`CO|3 zDn+$b$|}y`QPrUFV(y9goc9zsJs%Fr`>m)xQHVf3nCwk6^c6&=9ME_b^uc_B8r@$N zsjuln{uE!SDMg9xyF?m>_f^C`bA~Z97(plZmOr;h{nb+c$^bjK9;5O~4o+E8p<&k2YIcp zcl1|ZIVZEut}5|68`5y%zr441;CNRA?TxFly?y9&zqh-7X>Ui+0|@INI%_YaGQ9Vh zO`W@4PcNdWO9zGE)5`4VPvD~&;~g*7XF>_=HMakpz4E3-fe!i_k7P7rT9jh{wjpnZi#kJO)YB8&?Ju)rtj)8m@xao88;bo8I+)%!r?4faUeNPsL zN6Hi1S@Qc$Qt0W=;`j6csQF9o==qainS!Nu+$VewfE@W;@F8o;;O9&iVZGOyh^78) zj55+{#2%)Pv7roB81Z5sJ9R|#UKza5KrTP@1W82df5;ozwCiid!#lP|?PigYh9LrA zgu*h$koTB`LTu-OpIZotJsPOJt`Ge=$%12fA|`$XU5mOW7R4%1^Pj?_R?8yw>omN? 
zXC;vts})+B?g{op`RpXuIM05leQon6kMn(d;h!})wr;N77)`E^6=f~x8=XR&(4UzxN2R;|%o!p>kWm@v`) z3VNG=s%0XxE@3plMe^lCrGc3Kc~cBmDZ5P!)Ife2#)=qATSizq0>wqcwwla&yRN@2 zQq%mE2|v#zAV-m^$H&jX3bm4;zs$fY-;Z-}S;dZ#NZlT}p^emN!knAczOTnG%JSkk z|A+?%^YUvoF>QE@BU|3Pm)~jGB{bC9zPq-{;oK5hXb3ZW@IIO%o;QyDv9eZA_?rQk znXB&b7rb|Th!9>dd0xvjri%{&@4`R&*zCs5`FY;&=z$Wnqd0i}J^4UR&K7xX?1fFxrt-thxu;!ut(8A0@Cg0Gwo&DGPZrn19R!!}AVTd%Whec1P z`3HGM{%0HB6tTzvOR!aKM+iGP&fMJCw`i6%I2 z`MTvssobwJ<)W$m(l5|>G<9BC`e#Ikaw0dWlFw%4DgWs~TZ1;7k1hY6N?f>dRk2wmX@6>IkHgbmhx|2x#i$Hs1zE9q@-?&Elux-A^ zo{>j=;yEcP|K9aRt|I-eC~6$lQ_W~Mb0fG^*#x_SQ^n~ovxAeP%9=+F%0pXKrKCv1 zW<#ROhe><0fEb~C;Rlfb?RnnZi4V{CnP?J95D;Y0Mj8BM4F{qg`(4{Av692G57_TP zdGx4}`aWs|(&2Y;lT-1bH|bsG3ftk*5@VX2O3no%4UY#8y-oH`gzeC%iqsb!fG%x? zP4n42*h@WWVM#fg5w`J8x&~K@PCtP|Ny*VDWO|N(5BS^s$ykBPj;qUV{Ra z5#Odif|_6b$%=|kO}qZ!CHM-LP7E!dT2V{HEY?_zV+JDT0Pdar+hc=yKFWx+Vaii4 zN1tQ(p4QT7!&z*R9iYB`7o*<92BJg#N{lYWmdkuVoMZG>eFJ`cWAgskKxuE@a!_Lm znh@31e+9kYZw2aL&fMY=dvnS%*5-L=zb$aW?ug$~&Km){lHn_dQHBL)_m66qnoIR4 zu6Y!GUO_QJ5+P;~NBp`d4C`ij{ABDep=#+Br1)^g5&MwzsYcuCwB$~T_YpwSYrNf{ z6h7CAmF%AxwBHZv6N>as6HCLU+e@72&bI!P?AA|}B)0DwY0%9Up{046oYV;zECdq; zMzaY^UCsvF8ecY2Osp$(+1Ed?GXVQiJsKMzIdC zU=y9xWfbXSjF*$YQartryw}*gkUMg)EU14)oJ_?UQ%mGGhw`zL>St#jp9g0irezed zLwEhQkkLFQCnKcM-k`C`w3LbCHM8TK>*8&Hix7j^jJT(m@w`9cho*SDouC=5FPCB>qXLRF%+mOo!f~7~NU_oB zXramwqo7}Z>EC~pq3fziLveWFV&S6~zdB$4ITe_3iFF2MB|b#%U*;@~{<>WMajLd9 zQooO}e>(^h|16{)1g!?U79Uz1E*Pfj`5L6QBlB;vJ>m4ci1!qIQu>79+>rV;*9^*Q zJR>ji=uIY+M*3Y{G%0xofPYg!TYVJWcn0(}P+jICm*g4H+M5goQLHS!w{qEzLhnS~ z2qI?;6ocx=@vMm4+z-m|*G*BmR@Pq`2bHAjVJa*eAYV@W6*VPeyVVXyt(UJPbSk0n=_ z4z9iIo8a2V#9O*UixFeMwL=d8*Pj3SHqJAE8MUFm^oUK_{oocy5#1N^61vJ2;w+wW ze5@{Je$eCRBbtVW#^AycT&2 z|8B{<<|O~dPrg{mxBAH+us~tyX@2r|m3$S+;m80~F%25v>>E1`@Wu~i$loo^Zfg$* zxSC80`}j0_!Y1qBl|*?-A?bwy7)L;h@=qqe`?di!^xifF%d{cN92X^R{#r{*|B+`Q zLy8D2c-w!ST?bf+KbHww-WMz3Lksqx!In7Py4A|OeLjGB|DN=l?$BoX!@M<6&ij~0 ze&BsL@Iu#0s5w0r<_eDgr-j~$uo>R@*6$p2?)z0MH?2#w0)3gU8H 
z^NI3~u*D46W_QE^|0GGxa;j&=c}AYR#T(Mu;ureGr>gj-S1s4j4VLQe|oGb*Sq4lX6L0lpf4eaCGoy)T`b`1E<1uma%wbP780pYr|+J z%KH&zDRF=R@Nr6bH@_U7Pb^R0$ni*c=Cn8`ygnz~2aR@^=YX8>wK?J624QTi(@$i> zAy1N?my`F&obcM5@QWxt!*>3X6aRQlxHqTLVV-4~_?7AQoU-@i^nG?VoaxIU**I=i zW$UZRsXLyNXM9fGN9W`@HYa>~PI%XxI`7X3@0Jt(O-}yZa@ty-)7D<(@8-Nn=X@Ao zm*gA>jn?Z9-hJ-^8tm$#3)u0OWFaVM0MJ@*CGY<@+^YbQVr9Lslj|9C-L|F46{C`5XDe|DpPF~6k&sjGwYIpay|?FvuQ zcY@>kKi2$Y)v^W(D@m%xpTKr?X?WqZeW9rJ-xeo;P9pT27WmP5mpkVs6)+anL`g6WaJ zK7nt(JMBc!??mdBxoS=~!G2}YqbBo?`b1~RJ$L(mdPS?^S!d~I5{lTYuA?P8!Gq-5nEVhuW-6aPIw3}myo{0#O4==3qLt6a zF*vZy8Gco_(>8erJqM^HnlY(=L>-TRALn*9!gMv0YW84m@t*x)ETS5YJG7>k@y)|Z z#s99$<$qI!-`}feUivOmpHxX_?Genh7XYtUKq2;kKu@s6QTgQ@=;2*S00?LfvV<+( zIopIxxKr?kmw&JFZt?Dd_J`1kF2|iG%L#7?#Q}6V5uo@K)nf2IkdSvX*nn6+arR{L z;JRy;<6d3IninZ={>Qlb(%56k+B)ydTGdih=eQH*Imrog&4>Q4#8V|}=8`ar-+BBt zG8}EF=9*amV@;zr2T6C*J=H~!Dky3~0H0=9IV`nwda}5n#JdYrPF61Fv@j8Bv#r|eYALIsW=31sn z%k=H_fCUFzP0@Vl$Y;{CFvY4WtVS<=U#p>4yzE20+n;^;Qk(AEB|l#G)^%^01{g_Q z{ug@e2?i{Yb47IWSepV;qbVyyDJ3X_8In3V$KJt7?T)5KRu2zRdIgtgWbZ&h**nb5 z$b(R}WWxe_1vE~!AAidWvZhS*{$?DgIfu&-m=dyi4;ksQg#v1~sw8v54X1OcZyKMV z_VJ2!_bKWHy$J6g`40Z-rcKO?f3vx2A~1^vE3&w%>NvdN$xf{Qa=aU=0yd!iQfcif zRQAz{nuex8;Q;reMF2=yjEZ_vF!>3*X=6>TU}jPH1nIX9SV$+*6RA{BW1_N_ z@WfCFhvJ~)5Wr{m$W|T;W5!sb-D^?hF&V+`X{p+3BdFY33m2s=Vawze_BGwW9K(_B zfW>=i9c$(~Zs8~Bs?_OaP=nI`T+<{>xi>(Acd9DsgYe=q&wCP-6GZ`v5rw%G{RuU^ z{FD3`m*1E^nNz0B#ajUF9cv*zSJ$1}70tGah#J2i~xD1L`S6 zIEe)TPkoORNQnhvrr~~{(3j6j!(oe+j=HBy0yejbTu6)5KSW00YZ|qnS{R%B2R?2m z&BEm#>cduOGz2|{zv|NfGF?Pzv)B)lb=hcq4KnvIVqrT(FCucSH2x-_%T4oK2n?X zlDV{jr;N{cDvj&MGF#j)cn|QFkT9F~)%YA_gWKTEuuq@n3|}N-)0xujv097dyJ&fG zt%XY{>a7vi&QW8rhZMA^C9-~qCi`=eVtd*9@LcQzBb#282G`WH=W!y4g)hhfp-#xR&^rk* zB+omRM=x6bXrS_xSsqNmI^u`YP<(ga9ZLPHFicV8_n${UYrJVB)49H}HXJBgo`=2b;z zH>YEKXYCC@-&fCyR9rnTeK9J;NwWgLO@9*bpX`LB^tRve`Bqmpaqft^Vs^r1tEBIB zi|^xOq5S=`Qx4qAc8a3BQ-kzA1*9M2r;qi^S1En4(igB<0#7%1^|AVXtNfLI{(b!X zCCZ;qei+KSkXv}e+wWQ?%q{FmsyXF^5B7b>8PA9iyc~U{oT#j)o1d%N0pho~pZ|&t 
zV?$r6{gz0gTRgpxSv@Ir8Dz7hdJ}tHIAPO*wu{4wC<<WAFA;7AViTPx%j>CTEEB;zsNx<@=c0(yU{PNSpVGFoYn9g zu^}6tkrV!Q5Qb`||C|&5R8F`p8-|me$8^Jnrc1K1>34F%c{$8X^wg+!y<8x~K z%X^s|IkC58W7B`m2@lAr?}coA>9=#jJ#z9ya>8Ha)8DyOa7u-fwdhyD-f zj|!z%$>@(Bl{Ir${ZXq$f0h2|?gxd-yU`!b^7TiveErcwW-;Bp{^-%y1odVL-&ue3 zJ0<*g^haZttLAR>M-MOWq(8bqu224T{n0OAMx;Ob)Ym1n1G4CHhV}jL>5qyIv{BCz zw*I>MqZPX%|8Pyuocsq&NH_YUIk$!!?|0E3{g9FYW%YUsC53#$ zzo0+b&u`spftq|p`EK<`Z`^9V_h#ml&uXpU)c5uF#RW;EN_Q^0L z{|)_74LTm_j~sVzY=DMKe{?;!e!J5j#dH$*W%?sFka#_*ilQhBa-pCh@7ufW7r!Trqv*mW9FG?7x`da#;^`G?k zpVAl2c^rOh*fN7=e-OsMBP2Azdz!A}OZ7z?4H!awnOLx|uP-{)guc%DqD2x={|EI& zbDm(RJJA>Q16+617mc<4>`Y(u3~28EmcHl{$+Mkz#uSuU+8L`@zpt+^3Y4nH`1;ar z^hK)+0)3IZ4kbG2i(0-?U(~XLzUban!W^N#=oe;u2g-8`m$TDDebJryzP@OEXMNFf z(--~Mxg8xpfkdJ)LQ#Yi8R(4yh0#wq{`h+Oq6ZcT%)d%sbg~f=AM`uvi*Q`!Kj`tL z`l8?S49brBq9W8OA^Vci7hS0J%;<}FK^bCIr=!D+ zDgG8JqMI=hL42DhOA)DvLbB-&-6m1g9Wj#>?}3HLx7CSDrs3GZ1oBW%)PkPKSjb4j z1tNvvp-s1nqLmCWM^SW9t9=d<=0I;W!9I7cDB3t*Yv}73{;=4>B^32;7kkcD*awBA z6h-eoW@)ALr7ngtM5Sn@lDKdGW-wU`}>t=^&tVz z73f25kf~ITKIAItJu>gvfu1fvA9BIG^c<;Kq~Az>4Br*%Kg`HwL>-1S-er#l`VT%b z`j1_F{fE*|4buC3LVBqG@XJ>zeQ=O|m(l}$$Kif@nbP}_4g-{O=jkb_&eJE=R< zciiw)sPEVX7BSR!Y+o(rY{V9{Dc&LCGTh>nk^_CmqiIXNTgkTqF*>Jki~D^_j;nm_ z#B4uDqjLOIFy$6MsN_K3aiyR9Vo@oSvCwyA!4iC`(|U)ALoR> zkrQsq2_KPTYqT*NpHt(*FJ=0e6MJ_yHvM2u_<)@H-pa)Ku-fwV9S&E%Lcb_Bx9-fjs1Av$GXG}Fv)jS$=MQdLp4nf@|5e`(4q8#; z_LZ}iiP-T<|C!sKtaRFC0>mSAdJ?p&Ay9P(%I2M#0p0SDK-Ayg>W^-oCq5{0Qior& z9nNolO~66Ya_=otyg~-+kdNnktF`M-4(0-d&=pHl}TTYp5uGyazZM~`E~Jw zU-ZbUJ+D1+NIuW3a-yC39+YsV?00|#HNefJiNBA>$ygVaNDdxhA!GgUg)HlDDv&ZAr)AYbobLbxt24AV; z()U(c(4axRIbAhw;{awlX`x*IpQy06Fd&=E*$>{tK7amR}uq#n!l^4If z2hnz<`j*G?x!EyP^X4pB{^`JTe!1+^f%4e3^nHAEt)3w~|M ztv#Fyq}BDY8s^l$IwLgpsja4Nk1ovndMB40MZvFMDbGoUSoOueXPlrF+DkL-dT-x0G z+{S9SyUvvwT)9(vxswaipF?zl@80g@Ug`Jwc3ck5@Ev2VfE>K_4`k*eeHLvjfWyjL z#O&s!|AWs(^gd6X*7MT$P<)Y67b`A}TjNgP2%Z&ufPZ&zfjyNi>$|_6x-ZQ;S$ysz zPWc$tv@8c-DK*0m8U9B7JPU2@JdGP^ytU0*> 
zid5Z!Hzi)SF3YPu0_4eFddvLWZ1U;kz|1)=er*$SS>r*kswV*7Q5T=JDg2rVmBHyp zODwIzO!h_8hBeDB`7g;2c%?+sa;m3}EKQ37`}nNpaU zgr%;elnCf0pyTXXh4`}ibm>ckx)?kDaZ?2c4rEmv9phq*n|#-YY$orqnT2u$mZaC?*P@w<=Lik*~}FxA9j+xnai`k{M* zd1_4c54y)1AU;MGwhm1`8JYj|6}u`tC~|$N5Fx>1BG(^IaPX^*>l&XKyjoG;xuPVR zT&BZO9AmscxH|DxKCgJ!G_D(+e5G-XpXaFRN)Q;+J8JW;M^7YwE|8Nods`8$V zvGPLx?=dNPT=`(_xYnap^HSk+YJRgcx->m(@ap8}N*Xe{adqPhm&asGcc1>%$*q(4Et!<)IO0mzyGx6np?~=2<-Wo~bmH0%i!k9w5c!6#FboDr zX4S8*1|`3E|$^$PJ5RP{Uz z++K~3%)*@coj>bgK~3m)euCxM%ja*TK?&beXu5-JzTf#8>bg2lV&cj{nHLqt(InA8 zTJ3twbUd4vmYP=k#TDJ|Ok0s&R5*J=I!;|iM~&EI@vRSE2K#by<>(RFL)}Hk>yF39tjJX9{ z#Km9t!CU~EE!Q{LayOLUq8+t5TDcW_9)i)wG~4`vCq8TY#w8smfF853{LBG3wTgO7MvLNUrZZRqDSkQ1Ve@o^;c$4osuIYy0_l34ng%jH)6kJt4 z1uhh_*i;%Q3k&yC0(bFesYXn{6u-)G>&xqiK*7FLK2jKNi+=%Cu{Kpdq8rq^;3JB= zdbGf-(?M6>MuW`4ODrY0QkELWGE||wlL1ufdmebPKY_yQl*vlg`&Wp^!Jx#F>4oF0 zouLo-x`WU8@oE{F&9^*IX$slk{+!nC&P7(uvT5Q!_UVLI!%_(pfSR^GVE{$W2Jb0^ zEkVqm4EO__uOa53`^^e4XheSII#bVDIRet{2Z3om_TWWd4hwr?Q;Y zj)7neZ4MosT$(|z=1RgxuivHLHG?-+H$FW&+1B`ykBz@moqSEmaQQ+~L56Lh*|+lt zZyeqDS~k;BAj2PFKSw_1Bays~+E3UQ26A~+cRoleX6ROV@ zst_L$;#$Ka@%FDtE}Nu4$)xJU)*~V_#C?qBUfhwuZ^)Op#V>O?E5y4s zc7H$5!1L0JJldctCTXPhb_as`iFka7%;#yI$o*zr>*T}ufdnVbI(u-F z1v2;xu)KwRlKFOvtI<3yyjTZk?$Fuq7(H4{l=p`}NuLLI1t@on8<)O@S)HZ6j>xP1 zPyHU3S9_LzkI$>cg+Bz(`z~yehWZ_{YO{REwo6y=k(pm_Iy)+Tc9Tb=!|24_jUW6R`>D8gl#|*6)xxuBVf(G|!(>yzQ%KHuW7s@K_#OxVRp|chacbIT`^vgAM(EO?r)&RohR9VU0me7 z^hl_*T}j-+p{xe58GYDb^A5=lwV)4gE|VeL3>WJoX$AHXrGiJ1TY(TOR~gm$=y#$z zfT7lJ4jr+0{+z-9%*=9*f;qFnuJ(t zR7tdAt1eG1-D>ZT%$Ti2Jmo}_|9}{Ct+@cU!Kv7WPoDUm0xF;18E!>MQPv7vS+y`| zev_X^-FP8L#si*qbg?N1n|RiAjL}If1J;^rUzIkQebemidmB)jzj0DDa>>fE`R|JV zdODW8u*7(;em%2;&B*`RdfIDmY?}`T+ZUP1s}1prjbDrjqtEcawY#-l7}C)I-p{HN zxPN5iVR+&EcYEYD*AUU_t1MssD)h7xMLbP%t z_}~&BAC&SHKTn82!1WM*LhybC7a@wN~XD8u<(@WA~69w_zkfEv0z2M<(r zg9rLBg3S1H_IpvAJ1G}G^o1sL!4I9*|57BwjkS@sqq6(o!GBBYYEb?Js1!GheCo1T z+6d8E_qX6!M8@?xQZACfHFz@@0l>$>D|~yijItSL;PLYHQ4|>-MRC19y1Z3^SmlQJ 
z?q^GA1aYO0r#glLy~~|{k$jZAU)c6EJACQ$t_6{wz#|v!Q}@!5M`Horibs`jSw-5- zEd1*|p({$g=Wh`)IE{f^fhv4dnRMKK@hg|My^4pWcJuo^S?@p#!n0ycoKR znpkl?6h+#{uSNkw-rnr-%b9Ty3KEk zn9zKL>tOg{vR1n~xiYf&gY=bl10)Z<@TV`q+XZ^zebyUqm&cD+!4i8b7_Y{!{g<=f zQOtGOc!eF=-RE4lUAu4Uw5~g^`SKxxV_WNl9xY?dci3(}5~$jC>w+)-eACY6^NPE2 z@dX_a+(l+A*N)?!^ir>^_9!2ksxZ{>;og7{kXSYKQbAMlFdqaHO$F)yG(m{@iI*38 z-{@R|0&qG?KV53xBqKX4p*}Vass) z1p8o8wjVvxcQN!%{kXJi{eo}p1Mh7nG@z%fubrnrNqv3gep&pRtB)L{>b^uD*<@?< z_4JYN{Mv`FfIMXAWd`2|1g12WxQ~>r_){wwNJfMGv$4cAQuo{CDLx za|DhU&wH{*UR!@(UV}yL&9TKw^QpNfH*{-SIr5$8xV1&cb=0<++f2jn_oz9?O_b`V za%EfspjQLgjHUjx)zJ7o?fj^X19hH`>?fy7Z!5d;QsQw`-8)Xoa!Ovn^X@}=%3;b# z@4a6MK<-(k>Y(t&ZG%F6Kn(RC^j$1^cl7|4e^i=|A*uB^_t=tbH}Od;W+0h z?2mK%EUR;tVp8`s@}oKrvUv{XO@7O|&YQA^^+|sd)OR2CIjP6%EH%UZIPUa`-rIJu zbvhhVM;srch09-s=i~kI`ahC6_rhNMq8-uBFI8PGm6v?a_~>h-;WV~l`X~Go(4{b* z7g{>fJpR06{5r+!ST)Gs3B}bOtf8i=O0&&%v>JT+d}3tjn7(NQ^A2w`tC53$w|VQd zxssa}gQ4>#q;LPFlIf?29)hd=?TbM}1M`m*ffwEf>U+ zZ|f+sO_N_wCBR{-hP&X>0gl26vt+&qvsD!DCFgtHagbYfSC7gniRoR?m&)Vfq z%cvq%(uP;epgtPYzKU?!7*()21OC3NO^OmgV(bdU*s(lhM)2ve$sd^{%k@(Bd&mWw z+>YEjX&$`7A|NR{TLC?(J{@@c)7qWE3C?_noQ?YMY6Yo`WLJU*QA+jrcnOT8@{Vel~F3idzyVcQGZ4|a@a zTLN_wn?H)8ZL(?MBo0nCMcai`gn)qmQ2awS%%KIlW~}X3=p9c7rTQME90R3SJ2Dsl z2j^~Q17(0*aPIakDwOJgb2pV?XeE*@hXv5}#m`kG?=Gn`=+QW+QD=|>IL&{Z#Iud zS2YTeUCe#Bjgk6K=%6gPhX5A7^55ZT{|qDCLzMd@Z7{f66lfc~!vQ#!=NgEYRKw#C zf2%<1nG?rxu$9Y$$@jd0j6#! 
zyg5jj;$#jraTZ`_ngy7<72c=xpwJYr&SWfquo)w`R7qL4)#kO`;lp_g zS5L?sjD9d@Sn6AHZ{E=2sLn~^+_P}gi;`jiEzx>g_rsBP{pick7$oO;wqB8P3(Q|N!HD{jNGGVJJkT?bUnD(YlimkalWyEUXn9JT1c$cdK)!n zu~A4L!}S)sX10=u%Og9j`{cn*8B?^oq=ZV%6kC0lS)v`vPaWr#Lzq5-n5)(ALr|i= z*`sj+*ziq@xEkHlOr2-m**DT~KBi`O@)VHlBsbg1WkICAKapnNSC22WcQifV*v(dk zA}LZ~WU8WxZH1AUv-rYVZ;H>xFuLuuu-9?d?s0Tyx+;~x2KkZNEsS060R*c{+4$LNJ4$PQH4J@%@nonyDPD6eP z?yiiJ86K~ofmo^xhB8vWg<7O3zKZFrKZ%5>`$Tywleh2(Hbq0%Gy_db>4BDF4#I$D zHD@Ufpp44itigvZ{DF_Kh1r=F$|ChY4q9ke3lsbnE>a8U`7P8R0+umDLM`Hcd~{kw z-W#Wp^QxFo7LoS|S)d=Wh?~6MgP4RU*?P_1<)-Vnh!19mqf-2KI~DI6{?W@qJf8JO z^;S4u_E6D^jltROo(T}0bbme`H2cG!nwAOvq-HQrx5}w@nf7UwV_tOJ!l5ttI~5dS zJGInHaBIfjsnw({JXN6R4t;H%?bMfu^74VUS(r#JU8%U@kNF8rB+s>|5##vro@~h7 za>8CdaSZVdUU|R|hQUq@F_^(qUiu#EbsbG>u|)fuyn;k)e)>0zE|BNk;>YOI!j=Ac z{ktoye+Q_4A2AQ;19~rJhV|4!x`XO$ZyC@`u*rRO=&2dhYxlp11pSq@^?_efW?q+upk?j- zy;z0!754tip2D*K@*VB{M=N$(#@@e}F$DJhbc&Jf#{rQ^`;0JSMj#B6e(PF>3BTJ( zu9_vc?~aSx_aS@muERds%R05z@kp}M6@c{RmZn<(0}1UoU%_#C;sZ#_REi&0z9W6{ z?f&r_TQ6VYwni~`k|RI@b~fm@%1ISI4uV$L@qXn4Yy+=XgHOP$G4~>68DGYSBRj_- zlV)LJ6oZN5kmEw;`N_!4#|`_OPR8V6%PeDRFSG-yQM$xu1aB?Iw}Bok9RfZ`okK(@ z-~t3@MS6m<{v@!E^le!E@fxp}5PymP=0Uc(@D&jBDK(j6KXBCTv|Ekv9Cgnq68@ge zch)ngx<8hNu*ltl3pnqQV3(XjE=eyir(De(n<^;R^-Nd(IoHK7TcJ0Q0NjAq0Z!3W z#aQkOam210nguRy!-9{#kj#Fdv##TNXsXxt->1rMM zSL;Hj<%<`MP4-`_JXD5OVHcm9xC2nh9P!FM~$*Tgr8!p2->D?mrYl#W6o0_iBTXT)tMKsOf8(S@x&Z=?=a{7SX z?fv1L0exinC>&5)L=gDU;>@%vveO!=|7$S#GJk%}n;T%Y7P}FeFAEtHCO&;VS+gvg z&6B;8r&)d-pc9Oy520)E5SRq94%61*nGPwq`0QhQ!hgLj zT;&VsplMfQB<5;YBlU+UyX(Kg?zs#rn!3&2PK&03cbGuG`-k`nvkOj#=m`) z3KOUOJu&L~?}` zQ`}ac<5VuQLwKz*Y7*wstM3!zXEh|w)%_o$A-rtL#7E89r zl5fViCSK7R%Wul?Ggg;NS<0mbJC)lb^)J(^ebQKB8Di1WGADU?JN|n-em%@6Yq>Yk zgfjs$A4V6(R&0spw*+t?LrV8#>Z3cmLviC5Sdg9;HIBMYqpopY41b_cOSjvk zf8M}MX}L7yoB`Cz&`m0pSOxb@r-}o+RF*LpmzefBv0`8+;fQmGOLZK18tMVRnMgmH zp@}Fi56pjS+iwIfczlRDvVd6e6g>M{?e>XB3)!T_pU z3wc_h?G_l=AN(8!_G5N)VBdO{6fHia#oQlFJi~GC*J)sQB-x3R!uSLIGvTa 
zzwrn;$y25{$=@un$WtcL)U-&ynT_^2^A`W-PUUNv*~AYWhLoHFH!@(0^dc=YR}rH> zf@}RyUdzn&7OERbWNQ%VB=47IhPHIBN*ijOpRN3`#p39Hq2JHN_h>$0AdKGk%ADb+ zmGQE6Vi~9UrvY?L;Gdo zN2l)5m}05NZG`TCQ|H}=Af6UXLn8_SiMZ-W+VTB95cb0d<2;W&@8aB$qw9}GO8 zguHE-*7;>Cw?Rk`r^#6A9-A5ev{&aizC|z$QDB}Wp`(q579MZc30EINQJu~e&m$t}kDcqlr{&l-26$!P z&u!oPJ_NYHenpvxSh=ciK@#HG1OJeX!#3NnEu%^m%V(K>&+b&Co!7=VF-)CR8XG>Q zEIx*p6XSiJ3q{@F8GMbdn0ywLxq89&MF$agYZBGULui}Y6 zEK<{lMRU2=(7w3q`m@H|j`$*gfp5u0f}Kq3g{k#VAk!Yyp12e{cI_{qy5~5&ZCwU`3*M?Zht9R%JvP$t88Z=617p~j+hP@H>Ucs)6r1DHKat3B z700E4{ca&Wi&ZSwN$DN}@;lhaF)5%q`2=T+s=}m=j8(L7t{ThV#Jo7J1094;Cs9Q2 zf{}~pAiYq>oV8q3%3tdwKaTu*X{_Qci9rp9ZKV4V%3C6+ zgt-xiTCj&a2!FF0>qw(?oFXL0K|MJ`ew%4Utx7$$qLu52C@tNbK>G)C3ju4Y}BpCYbzgmpmuJqq&vLhZY}wCW^0 z9j2>bi=Fv2>%?NGe98TnbmdFz)L!=YSd(jiEZS=9L%=`uFZX@|GvlP>6S~>BhH0gC z3*LDwJE^N+2$sgMABrx0w`Uu7Bt|&bvtF(2al@b4rOM?$ZLvU=ES7)x-`!$4Ig1t5 z!OTD5Mp>s%SgpvfmqjZ!Mdm-Jqnn1evNnU!2*pgE(wXL2g-qF0s7*Fqdp5*apxU;#EO zi7voe{55+`VPke7~ss;n9LW3T3FWMoWeMBl#aH%drI zE{_+#qeaxCe;(3InS?Jt^#Te1%(H@tiY3ogIMpDgsZhet(Q_}io}`02dHj${e}oAn zE{q?-(8Iy2P~fr**{z*CZbal?!p|B8CfTqIsedbnGjo1;z2-SLY0Qe&A1bctoY|i; z&ucDao?FlBM=yg(9@U>-eg}GaD6Q$_-{o7~sih?MEu@g_hHBCEqmZxLB>zJ0#Gt8zpfV9qaXqdosuHPUS){j>>^?WS7t;i$DGN%Vq(_y^U%H{y+bL|9CoN z6zLDPYW_?+BWXn3<)CZ3&{DAbL-%EaZbP_)TDfRk>=@A7qzg!w;Eldl80Lq*3UM1j zpLM)xiw;CgZq4fhyXMQ4!OoMO&VVtFoAk;e3G=KMcQUQxU+qu{@gtnE{DJZG#NHZ_ z%La)X#*aiVb;fY`1w0L2CVd;Wq)4ety*j`B39tGo98Ro~=JZ78kI*a0z_396JC63D zq&lVfRe$$sIobzN40hK0VT0)@VZ(KgM7l;XhZ&;l3+;`ZaonY(E18S*e12?co}Uq^ z3?BRfF(tqQoy3!FsIW9FJK<-Cz|UTC{OrP+AVT@s|NmG|{?r}U)73odzn;Df_4v5; zbPVtRr|W49K_&iuEax13ELcy>m;MI#r%#gSBqF%=gnu9s+>r$<2vHyu3QQu1SHBuc z1nXH6MxC~DahKRXENGjFZZYQ$5g9i8P%WDMEVCvO!MpM53?i3waAd9((4^Bh>y*DN zJR^d5_eR@c^o8vPWUut}&9I~8ua+niL%81fSgdG8F)Ya)G*h?kf|oh*CSLrSQAGVi zxSkA0J6Qxn&&=_aj}QPtx$<88EcFL{gRj6+RvqVZ6_sygC@r=>$^@ zH;V5w`gQ|;NPZopwBLN>GE`tMBpX!0d{o6#0l#=IUj4ea{9PpK6_Etd+W`9E#r(iu z-*OW53L>X)>y%LavP_2B`zhB~I%+YH!6RK)^NoTI)0d1uJwZMEI`&s7Ic#8H? 
z4mMT)F*XC@b#FxyC#$JspAaJXa=dy=c0amqcI;h|t@%#cgeRGfIh4|Z%w6ZVRdK8= z`pt9-P_7L9ru_k?%434RTAqC%zh6Np<9N8x>L@4Qw|Q`UR57o*F9#p?=C-pJo|Cf| zVk=a${p^L$w?E(&e`q?9X8vh6^KZ9tj>-k)m>L}Qd)puW?O@{L9QZBjH@G=@9w^vP zs&M*1Xg74t3~lR7$> zJzPy*&3wSEJ}_G{H6r)*dXY)JGWmtIWp9H$o_;>)^DdObq{w8i|Jl<;Gt6Bd8m|47 zaXj%)G?J^$Fzn}SKDNCA4`Fa5Fd51F?9=Z1RsO4@DSS2PJBE_XtGx+%^x*B~)BeH> z!~yw~3;dU8enUHk1#C;JnD{qHwHw6Gsd)|FRgT$b}^G)Ki{pf%STumAvq1 zHsjl$6^^YIw*5oWmdH1VCJjA@Xn$%A5Un{mOL&G=pg)qi^%SdHNqp^X^YeXrAkuXN z$dDnGdjY?VEeq)oSz^7My?HD&g{&{m-fS8Q;#5O<=7vzd1#pdj#O9ljzrQ<%?rUY4 zUvd+)pIn*w5-sKI&XBLr?p%UVIy)cydu|5!ckvkvsKB1Ell(!W^6v}xyB3^ACqE`0 zr{-H4#D5o$a~yR!`NYCCbM(UTw?Yz1WS2y4&-vdCuZ-8d?y{=ImI*ER>-O$u#|9O9V8{VQ(V>9nje@DFN@Kf*9D3$4i2mRyH93cw0M&U!>SG+9h;Gi3~dd`I6SOgS`5U z$fYHf{6m5kL(5oll7)&uM4`!p1iu_q1R{T8w_81h9>N?XB;t*fvbkV$F`%DX{+6MG zF>S$_hcUICzI}Lp#E#c8QlYXHnSB|I$y3JrH@iF>nF02+gXWMA(6_hw?JvU#Ryd?KR6d&g%6;8);mnIpD4v>ZDdsG?6A1Y7 z%I3BVjHOPf06pY?*}}!56QZ%+NiqwNzq2K31afr)>>=?(+NN+7<*L}feEdeOr4y=n zSsakjvni1yqw0N`U)(2$i=&OFsJplnKifVs^E*|aBXps{H7-}W!GPmx>oJ_?ByG41sRg0i|=Yo8%+VIsK@ zg~HZk!Gg5)q;6Qt_2PxQ(Q-DS{?(fL7MI7~ag}A_RC>k`Vaq(otXCfegrHp6wue=O zw?H9tuHL4{e@1Hc4_G(BepM*ABQJZ&*8<_gl1`AHQ-O8yR76&@>VN9cTdkPHa;<`{ zw*wXK5f~aqsnznr`%uS$U$vsS`X6Rymp{5G)j!Y|;aUkrXLPDPj!GZ0f211BU`>~q z2MgzHLK4rbtQ2m;bRxs{izolyoP1yR@w)rkkI!V7Hj|%NLzIXKBkgF`HtLyKW+ACH zHw2_vaqgT-o}Rufd4YuJwlNZhWj<#4{~bRQ;eb+u!rnODHO4%)|4z<1WxTM62rzE2 zP^v1h0Y&F%%5skyH*ejxuzd1|RqOL3G6>WHfGcyb6;fFXPy zNAOh@xej%m2a*PKnNFfxX7KZ(zF74GR_SjW*e7qy)UEb(K6+dCnB6~EqQ zob}uPEdk%GX-_$I@;*uVvQIzg)X9JPtBwn#d?AZy#vpMRm_=@d=C9RU-eKyli@DZ+JrTo|#7jKPRcFrS$0|%o!^(=jdg6{Da_z|D7qheQA@w zwIy%qR$BUrAKsX5|4FSH=4x3jBhN_&x@ZtAJWxkBhB(Rw$uY;QKL2>6Dv2a4OEUK& zX0oaJlK&utqSr;`^fnD)ewjYjK#@}C^9ys^=Xf!az(J{EWZolhhscd9*$m$YH_Th*o+W>p? 
z@7mcay{xApevJN5W=Qh>qhyR?+XY{tmPRAZ+44YM=vPJn>8dH)Ew0L&Jpbon&W=w>N2Y1I}T7s7A>c}3h81NaT#PNN*-Szi5?mEYAXj)xLJ`jlzWbN5l!7gLWq8mQaPf-BRo&j`vnwl%PFwWpl8wk~x%;#$)X*^&^1OeVuDFyd3ckhr!F&2H|jvb>B<@TUgs0;O~5 zV5IAdyb8s&I9+d0lm5h`S-!xypo89*^6AR9P&tyeMXL9aRs?=YBP@P|(Oi76`Vd!QFy!LN-UQ0i&rGGXO%@ScEe@H!<_KM7JxD4ciPUXvZAe5_taot+U z)tmDZ)laTwqA=eS%GIY|tJ#RtF`Fpl>fA&A=(`^d~xtzYrUjge*pJs_^W~N^Czexr0x=N(l`Aif#tz={YPqHzI`p?D>saw$< z0Lird)NJU$hfZ08bu!s|QHA-->O0`aOi7GJnufkiDUev$&kI|wSBsye#q2!cQx12` zB+zv{yT6%UWC!o=tqwb8+deR%G5*itic;8X*v3hZgFw?^q!s_i;_1bt!BG5u`n1~2 zdPVdBC?}&sDx4t+1mrpouFO1oANEZbXnB(m+?@RR3DWmQ2xTFCZ~Vzl==(?t z<>~vm&-^dxJ3xoNzY?#0rR_h^_t8ZDM!NQ<*&XQn&2%u*wI8oSvXQ6nXL$c5eLrgF z^u1Ew??B%nNtk=<&p>`m;PHnyT5iHZ<2cHh_WSa5UAEwmq#KQQguQX@AV=A`J)sQ# zJSm{$l40)eJszOKJA9zaS{WwsIu$lrEj8&w4 z@61ggSg>xy6g_+v%x?1Wd{}0a9m#v=Rho<#J=Oyb9+@q7}3gK77w4t!~}r z!~=CB$A?|NyNG+flFk}^hfs>jn#H2}Y5!tsh}UUKy5@ug9w3MnqK4XrrU?B0j%-(p zH0CKh#K0*0SSo?S+e@S!mBrV8XCyv=SJ(rG3|&J-=o8p^wxjdSqVvn3^Zooh{Dnvl zA#I1i_xRT!Q^$(29xG%qJ)<`Bgwb{jt2CNzv|YChtN>P-AN`lKec8v-_7jY@YZe@B z?>8Tr4dPQDL)*zv@swzlS!bc|*(uQX&e%@r`*nmui)5bxOZA@-k-t(;#C+vS^=EGK z^nJE;MhgFox(eyLVs(^$Lgv-j(r^iq)+9$Y$!?Mh6vkWmtw6u7$mu5z-=9*Pa6LIu zXZ6MR-4Y+u7q9MXdjZdA?1N8XOo^vwTi#>5r15RW=vH6)%OBx9eVEIByu>}0tT|GT z#~6P5{sab*&j()U!8)ISS2S($PkyqCx2?4}-@H8N^xZKJyG|3|D#L&|4G$!Qv^hNl zwdCMqs{Jj?DMx@%Tw4V=V8Pui?o_9UmXU`Yj(#Sp*uGwRR=NKs zeFU^iI{>49C^vHG>pZKX*p&Kwe^l22{p-3%b;Z~hPP5Zbh~@6FUY54_*qKGy>%HWf z?2GZ_x68AnTXy*+E#%-M>>DS8`2P$0M*C6pcn7V$k5%0JvG$Ge;Fl~8*W4swCeYgF z>IQJ_+`jP(fb(By>2EXw(CzFS4;oxI`unQ^vv2HU?e_Vr=xNTrkr&l0_6?1+0Uz;x zHV$^8IN2>H?Q>z9H&7u>!K~Riv(qTK6zscJ9zNcfwP>+u@UQ-GNPGy6K(spK0#8l zTij31mX&%VRrw!$MsUWS)2u~~G?X_XRUPH{ku1k1E0NIVyvz=`3T?kN}pH&#PrTsnatL0}TrU=9J z*%5jBC(f?g6@5g->dHA!5Hv(H-{_BFo5_RCM%lBEBz_5awb%M52wg0Bimn^MO&u@f zZ{%45-IA;lur#8|>&HKuWEv6&A&h93X_w8Nlgog2~A;4jPpKGL;8Gf$Eux>BgS7(V7kGeInY z8H;Le)oYnB5_fWe_pBHND4}hz_mIdnmPLAl)>0m?A3N9v^L?6;@#a$y%SmO$_RYbOp+NOd3;f(2#gWlA{Bjq?tr4q5xM()zNPzbh<_?0fzuqf`1M;p 
zWUZcD*+O8{m%pVWd|tOq8_yW3`_~bX=+c;+&dVjs1_PM%*xhg_0lry{ys8hTO{rd%1|0!ll{Nl)XiDx2sOk zBmb9J*t5@lWWAScUoYG3OtCQ6)qN5LGRO1C$>kbE!LICOn=aI1fG3w1(Az|%w(Q4r zcU<}nXyfu-k&+iqn!-^+c$#BT^9N6MOv~#{RCWUm{o&qjx~`EXPDW2u-o{fzqbhH4 zPQ{0C%=rz~S@hi)T>!T~m`KDiDnj03cJ*RbKRSf#A)^crwDRXR9?Ih zZEC=fkNmE)_Bv&!dAjWzlHq%stZPbx0{XJrw)wY6n@+ovpLD|YKgMfbnu;hBnfEi{ zS}I|uJyF=|q(181>OX%!bgb^__TQTY1$|^;(;!(;9z(`R-cq@kmbArqb3d%0 zIe2dFag|>*kF+uOT^XAgX@pWvfwh(wLjtkg;kjqQv8|zC{z(#5+~}q8tHl#GM>ve0Z=qBRUOTL zRh6sy5?5Hgw?Z}fLbZie8$`Nfo}p?1)k-(yOM8W-4X|AsTxt0$+88xk>PSoet0}=H zDWxbZfL^H5%Ub!zFM_~vyqVwt*=p5-ovsDi!AlPu6|9I^Fn7#5^m($pzegtCY%*E< zm*J7Qn|4j$^GBB9o83@Ey#k%85j2xtg;mpR)wJwb4_?0YBd%9PLsgMGE(ke*dIU_> zBlsr0Dy!$WuAb-){b?b%d&G%5tw-=yJ(_}~S8Mf*clFflSWgZ0M0Q$_WJ$9ftxoMBY9qJzN|j!Sc+dNf^04|3enVs(9&x^}3ip#`OEwyFR=-KvMv zc7TuhG)?zXx7e(SfMFqEdvywPsBOaO5V60Dpe@j(f*c+E>Mv3~gnh($MkxHZVXlgb z3J$VT4G0)Dmk)pjo`+Xy@~o%8o@*C72qBgvlfK39g*?8IVMkSnU(B~w z0^(203)C!s>P5F*frC%}Oq`770va9}HbGf{X@EFo+tT&#qKJ6sbpJ10YQ@~nBXxW zv5?#Jh}Zhn9~%!9@Z@PE5WqNdTxAPk|J+XzNHE|PB_}ixP^}JV(Y5j`IyFtdd{Ubu zZ6aRwvj3QySeg>L#!Cia>@EJmCXFDZYOdGwNt_fV8>~8ZH($wlK1CUungu9r!?E8R zFWh>`iQ0T<22hs2jL5D#9Ert(&Yr@cL&P+=c3FKP9MmZs{7qL>dHi(eC&531>tuh2bp868~To# zL}>V2bjQY5Rye-;5#PN;(VfIYCm!U5^X>L3GC${r{mA+eF?%n+&HU@K%$KbaY0%ZY z$Nq$YF8FpzW)?TWJk^ib#ns;sNlea{@iN0yM)hZATKhGT#54H@YBPTb+rQ7fk46#~ zxR7{}yKdm8KP{1Hcu}E?%-7$}t$)&U>^%o{;x9l1g=H1jC@g&!pib<4 z@CSC9DdP1n z@tcxY9d?Xsz5*0#@+y`8g1WzBm(0y4Z#@h1&1ee>-N6iDmu=D_il?SRnb;NILG{q` z8Pk2`qsZ*%fbYZ5` zOR>WYqg&+_`*62zEs<-fE`sbQ8-S7mToPN#mHxV=UPQAMh5G_Iz2eC>@=6rw6`ON< z#c|>qeAytS;({>(j+Bbck$G2&)k_`QUZeQ&QeniC?SEEI7HxAk6KWEq5CeJCHQun% z&k>W%?gs7)@f~WifJYL$Z{QbM_c#2oAQtljjQH+q!x$c|;Z|fzg}eFXkywH&70}Gf z>oFoB`Ci5gwX>OD{h92i2O?1VqhY;M*$#ervy~r_BQ8UVMj@5gqO9Ugk>3LbA)Kjm zq3{@$idQ3%Mr1LR9Pfu6?jSibiRF-vTQ5MM86pXZlrBh)PZoE>J74}C{qIxtRg>oa3%ty$LlY{VI-mB4CTGvl#jIylm z*S>s<=DUXe2I6A8vcbO}SwTjD{<31pg@LRX<1?cFPF9NZTTJ#hNs&D+sw!7iah~7C zZ!9Ti9-McO7FkaEr!^-R>0-J{`%BWHgve^Ny++S`qJ+J=a6;nMl+t>Q`d6xa&VQOS 
zbFl68`EMQyQRXnbJ}1v0L)m%^cR9NyP0gpqcoN@)S8~>fLTPGG=`|lCPhHFBG$naz z;TJ_Poaj&>Pn~Tu=j167&5xC*Y+KOesrZ-;N>~B`Z1oRV5(rZ>BusVAFNmjS^sQb< z_7I}hcw-2q-0Z%4^DbCI6D&KVgFvo=4i{x^79Y;TtA51ZZJ|7|2%Bk6p3nhl#_!*m zq4&rWE&11FnUUPcZd*U%3EpEb#)_AdCy27m%M*9m_btd1aHOz|mw6|w|L5*K@QYaPsO4_Lf^EAt@E2)xRHE+awW#h|HNW2fI^p4 z2@?XYJBk7OOCX%C;Z^>*Tx*l1Li%&+kV-4*U#M@Wb2@dYk7~uPfty*Hu!KfI{S;936^|=%_bbIKO}(ken#vi zH^-8Xl3IZfGnv4`2r>z51CXJ8&Hp@kB5J8j&g64kt$_YOg9s3`ZlK+ zCtt%fnEHzH-+Ei#h^0#&ReUYG8};U6In0us;DG-&mV9l3$UM$grOVBE(|(%MQ!tpH ziE62sMLSb@J|#h>IxpFbR7ub4;D<*ghpdc|hjlLNNxvQZ#?#-a2~+JgC!fol!16ac z>GJIOwo<j_vq5mIdnJ-sWz=kTLu2wG@^2_~;Z1J~u^ba>k!4=VtW>m)R=6<{)M;HK8HMj*FX-8kkl#5y!T8avFlK zAKS9oOP82o+x+QS8DWRj+b`sM+t$8APxUbqit&CzE2$*e7nnJPkiQ`E8G7n}C_Os4 zPTM4V7<7)pUl^-@toT7Pwb+hRr9Ed}2k|7Htl{L!mg45*mh5K?pNX=p@Hax_muiTz zaD1gE=9>ouYf(}_G8ey?z$Mg%Y>rz z*k-QLtj<=tjvKz9j*=H|E_5sVJEuRo)4Kgq&@posIP0aAA~QX+)<078T6p7anbDA+ z6lTN%-2k!|PtNcFlx7C<*kIfLGmyIvd5EMbycwKX%4U?m1_%Y{o!uw6FgRWh)62o} zbDt04IG4v-0_CiD6^C}bXs#FT)b3+BKxQkmRIq{4v<8QKjy8`ZHUXi|t-HDS^v%jJQ?PezL+zgM4r*P$EHtRcOs)!^GMS6@SiL>g) zrbp*fhq!sm2KaowqrjMl|B89cWy-xuvCvXe%9ZA0Z!=@sPR$noDyE7Zm~x#MwNb3{ z$4rA|jS(G-r}pEB?_ZxK(cwJ$AwP{>*=PkF*lFEQj#4<(Y>6kWJN^h7loto2W(u@r zecPB&u1U|R81t~!&F9o!dPbG?K@W!MLF%NkSpCVBwe9Pu99LViNV0tyc0Nci$mxeR zmLSdKWBCHm6kElXoIE}B+%ph`IxkeF$ugZ*ccBtZ{^4_gzw=U$)RTZ-=hyvAPa;bv z?Mt{Hkwd4JD=|VoS#o%U-2tq}f;^LrEF)V_wOsVeGwG_CR4d87TPiTFT|xbqmJ_0= zWz0-YgdPNmk%P3z{ytHm#-%l4TfymdyYgr|MA%@TpSd2WEp6pGj}VzBM~Q7TfPaJC zJ4!gufM4L*n$&*JMk{a zpAVb9MhHmK(I{&{0oVT`*e3H2h>64h!p7S_Ep(=A17!;u-^jxEuVUPeSoCDcQBvZF z->1Cvy=uUp@h1)D89xizaK#AiSeqRZe3GfuV(Ik3pX0gmul}s0hd;LnRp`2vRNZ(rfUPNL9v zpxp8Foi+M2ediQ=y2WnCTdOa3;MFx&hn;zq7~!r<7pkB( zAd+1w>|Y_$lrCLHehR$qATEITv&0OjN9%jLiYW6o9nkIa7P{86?M^+TgPQAYcW$+R zZQe)Qawxs!J2vVXN+GUA)tK~EM%SQ6I0FECcP)2-RU&&!c>Zq6GXJdbq4Rh7OrcbUS?id%BaW zWYRPd;T+gq}cng z%KO8Mz^2)g?PhESuso1{TL`2DRFVBjUe^>IQ6@MB?6gP~pF7u|8EcIK!i9nk9P)oPr^=;cM@hVb@f3Rpx zs&g&`Kb~yGS!@-ObuV|ipN{^sVW+xJX9oqOuSv+Rwj4Cy$KeL=oX?5DrFJ1RBHPy_ 
z#5ZLvP=TKIQk^&JLzny8G$44f8(QiovV{4sOQCnV&}jy++g(#}RIcLJl!r+Uc$e5K zq~RtSBY{el*V(Dhi!*-_!A3tmg%`JX2(tCJ&*dkQz|Re+o`;D8T?VrJRL4`-S>Q0r%-s!G$|-<2F`*=VGg4WLQUw>S)F!kCmi_hS^4#gwep29)0J{ls0YC zRdg=r?XmF0NP6_o1A?^<-npW|Dm;66PWfCF8P*VhJ(jvcXOtM2iQ*2Uh_;9LboKY6 zAWj{v)DcTBox)E)h{sYtZQ+^UVW@4|XP3sB`k}R2{CDkf2-rWj1_EH?msl0+yh2|V zXTOu%FO}6oxQgtjC}crL(;D@Ufj-E}6U`h3+X>6tF+W1ao)ID^+x*eD=oh0(pTGGM zu@E#@Nt>nqkHKbJDD7WWJ<|#_kH4y8orUM;;6p2P0UoEw0jf2(rV_Ux>X`#vAn&zR zwpalfY2eB1r-P{&W^~X^RB>l`V7%Ry)*G+jP3A&)+0^-gdf4X=r2m;y^vudoedqpbAs1YDh<^=tFhAp&5kKzrjkKLzfK!GZ#~8zVuz9+m>o3{L@Sx*np{I z%R=ExhzXgm!)2LZk7rs5&B-{7No@_>h8X)Eg`2WIN{6guXmh9jKZ ztv>lTvp>#yXRupt7d_~&T+aZ0c^=F{o=*|A8c+VoF8j9{)IqE3ytI||lEoPXSoke~ zEFi1lF6ydmWey|$x`)Gvz5nQ_Sbzps@|B@M8(!Q0-Q?M{7EYe9>zIms>$z>!%X9O2 zFs)~E8>NBRzRKAk0+Jd@oXLFq?+j<=PvYwTCXX2ZEU+h@PPLyLKRs@DQP)pQ9=Q`% z>aT_0g#7f5VFmnj3nVb)r+?yF)6uhMo}d1&?6E(fwL*F1b3&g_%pNOxEB06Sr(x@a>AI0`wJs0J2;G>pDco}zgT+eHlrgP#+K^H zu^33k0bUBz&*RF5X4!*e@#Neo9Sj9EISOh*Tnhm>vS&UWOD%R{gOQ9aRMla{WyF1G z2f1L0aqR8n0yE-1WL0(ZZ9sig)#8t->b#Gu3Ki}wVG+Ky^1EqTG-s$RDO!`jB)Nn&jbzP*a#(*c$zQJ~!4%W&2N zjW=xyo1N65g!-Beqg-PQ5c&c+Li%~{$%2Ks2BHDm6R!47}%zGRO- zQtJ|Xv>-G8L>UV#h(|C0lCdOQjxLn>D}SRU1XuwO&hEhOKHEyRK`09n0!K6wlyPUB zg=MO&Oohtm0Kl&T|BhP3%Gcn&{f~xwP&`^dQX9Ysm~zL42cQ&~CMxU1AtqT!eDoZw zWtF6EMz5pxOi*o61)g=FI$qgg9Y8EmCui=a4LkdCwneJhsoj^G1OU@7@BT>MixYb5loX#z&-TU^pT6r@m2c+cmirz~-3S7};Jf zHRJR4a%uk+a3_-)pt)9nE;$~PH4%uzfxL=d!Yneh5Zi49+>8aSU{&N-z^VVAT>&E` z3_IoKK3gKq(OB^VcIdbSZJGJ80oo>y1gl^T44OJ$_@5f6GR+8XZY6l&yrSAluRjii zuOlH>;?t5T)5M*TJ|;Z_axD?N2sFS{!&>o2Jp3V-oY%eUj`RM3I>fv;L>7J!&VA+u z_)lQBRo+1~1#a;#T#VeaZL{Tt+J$ElSIe2fsDE%?h-}CC*P%vc4-=F4E?;oh!h$)8 z);CuJ;ad|bZhOu4+c?x|+c-@%5@Ibn_&=7Eked0M8Ki~|KONRkE8>$LyXG`w4NG|B zkNgyjcM(tt_-l5*@Z!MWf}Npqe-(qZSW#^ZlyM_qPQPB0+UIX&aapPB?Zp-==s#EH z-u{cXdHGqD{_pHpmow{y^3Ip(^d~3pR9f#pF?q2IVccH;jyWLr`ia2>@=kAFQSfJq zUjyWy3=u;9AM960(pn+>UT1`#9QvnUjh?k@iF@J?-Dm5p@O8 z%>E~~3H*))(f?2I8$fG?@cSDx_Q}ETrqWLoex+z1eiZr`;$I2v<={6NE*Zk_JyV>p 
z|FPT(;OBCQk*<-%Ui}2^DvW;h<4eD$jmc$3>IZwtH@3}}k&1nOcb-RTG<1%wkiPtL zrPG+&dGE~>JBZVqMbA^`ls9(Fs>e{cklV<@u3LGDZ)S{gv^t}gXNhweyJDn|@4vCuzupw3ZmkKkQY zKA^2CwAV;Ts45T=HmbhPFim04e0c<#NM$O*dU1JSVJIW#B?J--v46}tnFQzk@uTG9 z=a#86{?AzJ%GyE2ij}3G(Y%>VJ*qJI0Q!!6K>wVGC;LIbV_%_`Sc*WD;s*@S;-ALm z#A9VS6qCi8MNPOs$ddX3q&pKaZq>oZMTC*+te^Q>I%}vn2aoj7?<3(aIY%zX_>lu} z{25;5k^fN12b2%ywWTFjBr9$wTwA*M_O7sDTf7$YezuJUOR zE)(I#_^tmZ5yoV9FsYbkx&d%^=3pz(P~`ucWtm+l_FATGnR}Q<0=8xS0d!$dW~Ptg z0u`1^gam^DQd=Z66j9Wfi6{XDDat%cd4Bk>U|D|WPH3&bbT zuT;e8qRO?0f7NB1d{CxfiQsz715!H{v;3e?68clqstj;0#l(iA2+=>JT^%8xPmSWnO zRv4HYc&3uKTL%4BJ-AR~@V~^|37w})X&x#?nO3Z8Yt;p|B~#M@%%E*#309p##`+?@ zNslf0CWy!F`z#gdKR^g~Kpy-ZeBu4O-*E4r4Bk_Xh12%=e{=<23JTm!0eSW{cvx|# z356~Kk5yBn5&UPDG}@5H{!1UE$1mr{zXE1rgCl>34r=n-!30fxf#YOP_ArMpJ=rPY zMb3lr^p6;}+W&*vXHn84v|w$WqNVFf` zsgzO`QQI=fy{xg9F}-BBv{CET@U`GPT#)v%O=ZHO)Sik>z$)@cKjhsCG>D8o%)BF~ z%P+ozC;HR%0=t3+SG#z&i^6=6c(T;$DC;c*hE5vh4V+X)_9wtT-AfNyWnWh>*RIbY z_j_p&V?QdhIa0B6DltWNXSx2!bKx$xNsG}?tk?+gx7bjih8W{ z3l+Cz^uGQM>l=Qy)R8(0-#`c${@_Q0IHbf=5=wlN6o_VxPGzfPw8w7IRNjxMuL*zeK0kmfU8IPu}5+CgBAvBak2 zk~aKWVt;Dr_xV0Z;088PVw#mmYDqH0&wBNsPupIu0oFq2heIP>GwfT6KnL_*`);0A za4S$M@*)#CIM+pes6u-N*3^n4d43=vF*=lWPLJ)hqq!^Y#1^h~BMx|Lpg)k{FDJ_bHxusUZBWtDDhl!5IMS zF}uiA7__EE_^f^E!jecap^fUkH84Dv(1jp`N3-}NR83^R!fcTwEx2Hs@kx)tClRgr1x3?e1# ze&Xm2i4$M)f8{D$>hO}6*_8Md zZB187kqZTXs5{HXUv{!Xf7Y|mqODTtv`0aMp zwqKO1C9I+cTgD?K8r4wBGpDduq$C11m#m&Ypu&4;7wnJIPR5hd&bwk)9%S6Dv{L*~ zTw0P&B;6%r%7kTOFWAjk)?<9({{}MS|C2$7yeQ{}vR@9R(*gDZtp2FHCqq;zb;+Yt zw|?t-+c^ie&%aNq3Phkqk$s{>{vtiG899)BB1unYsser=_&^l}6=;#%vlV`JSJ?pg-YAwP;uN>+Npp?O(%&ZVoO);@M$8j1OUF__sjBls22&%bL?|V~d(eZ0(66g!E|!Zq{B& zQ)Hn+5oSg?f-N%V*L15fvh;BRwTqL(2Rj1NUE(Dtlq)v;e$}Rp?j?i28|js4>=f`k zCZX)E5tn>*_FWep`fD`K+9~>N1*s zpA`X%3#ca`G+y94bz=&S=dqOEVUe`z7 zDcM>XNsH0$IeeMHg0J|hVq7O?RQPA`2)BJ$%mh%Y*SgDedUgMdbRNdQvg=~$zX(@Y ztYSmMZ-ihHWS@AHYQ#^(e^*8QO}_wj07v_)L4U+A7;2>JWr4>Hx62MLq&xZHwtr{m z{unx6M(ZHfIa_d&_ok+_v%9$Qu)&vRoW9@tA)QZ;op!Wz77MQVtkfx0q{e=U 
zA87{Lw!FX1-xc6H!m}z&oVzNEh9D`7g>71Z15@i8f{C>ImvjX&JR15Hd~5YVNwQBB zQY^DCdW*yJXc|xCh{Rm}T#FWB&74o1$-zi#I-_Q@zvL864Y4N}b>Lr|pa*q$UU;Sh zZqj4FNlwsw6S35Zth4FGk%gyFU^cK_7n!p^^D=kjV0{2$lP~=g9I;z;JmmnDi6u{j zN^FkId5~J?;)@@Yv+u)ll;Jzh2d3O$t1nlM=NNl-QeWKF$0}w6lzEw4UhIvAUMMBm z^Cn9)u?u<*Tf!(Ge>S>=3w)e-N6x3*J*02Ee#M=V)tuUY$zSyBI8U) zaV*Jo-Pt5Z?fSB30sbthy!e>KyDfr~3 zkNISm@DuTq^dY-j8#CMov`IbM=Zi;aSVKLQmGJgH}5t<6Na3)k2;bc4y<$_T^BIgG*|H92T z*Z)0q&3f*3vKbq+wMtpFYGTF9b4y1DG~OZtdRKVHOZLxLWG)$icsm>l=in%LOY#$` zVHKa0PrGof@u+@cmrH=4Fg?T?!LmWTgWeI*2(#gXBL9ofOpMM77->SRZ^m--l~kiv&kLM1$>2HTG`p8ukB~;>pWfn$ht)@Z^a}6d_1{MRabT4E^PK+0v-a7J<@aVwu=`iZB%?UEaGFc=qRsABv8#+6;UJIR!H3zmGkHm|QF{!q%*gy@+pCzrN#$U&RO--l?!uNI| zd@s`VTeY$5Pk(fOG0calzo|oB>hdc4xw2M2xzp`>CHQv^S5xUmJdQ56*XfcN^R+r& zAB}YFrTOxpX&086b?z+YYcX|%*11RmfeC~d|Fc6dJtrEuAyhV!P({fhl?1F!CKwL6 zX@7Nh2wvhdtCE=hOyY$>M!F6cT?DQTS|eV#M%rqyALR6-yc{gP&`%EDmSMgK&-<6; z!ZNxb+AoE1F24nm@aACEv`88&L;V*i>Tg82gQ`ToOdg(Y^6;mjr06fm&cgEBO$ArV z0?ET8L}lstbd!fK;N#2yJxc#d>y*XG;bp`T`Un0@V(v4M#BsbexgLuFBCTICUoW0) z@5>yrAgC^vnPlAAk+^y2u}JzEp8wp zd5Btv4ZQSarykt=IxVHg#*WOX2Ujo@!lxFgkEyZc{0PlAz`Q$h>G>YIX{e(~| zJyhdeVx%KIb_54u!|HFD0IUD5FK`TfPOrZx5U0n!J1Pr>HoQvJl3Df)&1EN~)s^j_VwV(MX*ujs-?;v|=h(&A2!o#DE0 z<`HzEcO!b*>HHJwY56GY+2#Jvna-tEv?ROHJuhl7qN$f_mbKdb+LyS3zb<$=U&D)G zRY3}?RoOk!pI)x?b5P-#wsvdAt3px0suQ?CXzR^+&zJ2t%IAM#D8XtFrXHw^@to z;GJsR7^|0@Qr_EO3!PRMt02pz!9P)a*)8>xjg;T%dem>V|G?gWJ{Za1IjB$n-mX5Ub z*tRdPH3;i~u!>065A_s?PbuIzSE)OlE2o7XZH=dyKcnb_0H3zx0_M#I(H`aP+avYP z8`I-eBPkutv(24nDcXAim#GPygfPQSLKr2}yxXS!G1;sjNaDbhb%2imMJkts*BH8WeAE+LlO_6Xt zNQ`O#5x_sCjksjCJPjHQEi_BTvARbiiIW(T+mNTQO`c4o>4}<_WdKEgqw*Oiy?s&q zwuM((wlE;|3QIB^{ya@+&%pYJ(+54ZCDcscthe=Ch*(@nok0qMztAF? 
zr`ryby+MWMDV>A9t78yNDxSS+h;+TfNHUXvBKA2IR>8*$AhGXO50Ty* zvz61A%pNBgTuvp~zdG;|@k~ShgO3N4+uLG`v>)X@UvdlecP*f3Jb9;n0;BE0Qrz%nXKQfVvnSDa>HXM%LGHbM#Tc%E1fq=4E`rLSL z-}UZ{2RFlwK?UEXf{`-%qe>vYTjWTWl zw6}s8_?%5b_CCK9(MhNpl_=|uwP=VW_R=b1&6s%P`5lbR;!bVf7XL#fyP`Z0wF6X@ zy5h-aWGxt3*_?u2Yq8?O@MNS*+ohQeYMDkVa^2vfhKDplRtVmh2nKa|l^3=4A}rzG zz8Gbjo}PqIfRRUO_1bn@Ob0`4sBK>n!Wx|K=3-B6+ryAXp33s+i6s86W;%POX}|TG z9W#MXLiuHbmHy%as61^-x34hCklFS;Tz~a(SF4qT@MUymdPF#ejO^lhdT!%0lKnRt zG(j^w_5vi4*mmFT1`q~gd z;`1Wu+F!9_l80A@gC8K67yz#BHu5jhlz^eUrp2v*gR;y^8bFTTr%Ge{aW2H3NwHU~3^=#U zfS`y)hI&@2^sX-qD9S$W@6N~Jy0hGD@-;ftJ?Y6F7rq)BUYxjn>y2uik;FE*-(d*K z5*_Pw?|O<`Cg|K&>5rgnb{|UX=w2UE=48~Txzbu6L@$FW{Ifo(YQG_Slfgx;=l$P0 zH-Sv2A@qEy-(oGp;zqigx_J7)>vo4*qxW{qYE|MTY-|myn_8LP_OfxLTv4-w8@C;| z3QSlvjgQ*bY3q~o6WR_Gz95!{M-n4bw%Illx>nywWEHfGVVRTuCB@p-T|wbbfmp%0 zOg;YMf2&ABXyyE?b*z-jZNz@9jbYF2`;P+@o>3QZ4)XSXP0i_B*BgT7{F=yDbHAuS z0@tz7f2nh34SO6jt0IZ-@h~vqU=dbq^bgnQd&Q`gJQSn_tMXF(hX`5sU|ZbNJ|++# zD|K@&GEF(bo}G$(P&(~U$qYj;=V!?fGp~$Y_aze}T6yGO3F7A0Yv(yp*0&sM<2YlI z$)c^s--`T|YGUS&FHcT@Bpr(q>@=rdm*y?#VsBFg-vQbt^+Kmp4Of3-Dfa3)ovOoL z=Fg`xQz7R0z<$0>&Q+7$m+Cwr_(LADGRZpO*bnH1sY$}wYfwmIowshFN~};5X*7hp zk=I*oH5Z~or0WJpJ_=9Y0uNBrTMf(x-$_%vz*AR3X1CRngY*6YoFfT#njsebU>)hg zXp8aZ!S*LQ@}CQkz~Ob2D80>F7u0L3wlCo(FaV<$N}WE@=1`d~n7P8~gEd%!vR zf{{Y`@pP_N4sM-{UI9)Y3pWAAUw2MU`|I<+@wi}j?(||aw)W8NOfUZg?NHyh3g-b$ zj+dH?SZV-*Tbmc_g^y~QXBA~yHE2E=SNKU=S5Q2bK7lp5w(awUpD1Qnhl1T6bNY4m zFS>hFHUOyFI3_lajfMG=EsG-#=JC|w{T81b%oS)4l|?Js>Ns0o^TaW|IduW{(bI68 zb`FUGJgz>G#8nIAq_P6yK7h*~0wg+Vi5;@bZFEVd71I!JyV3n>VPSvf_`2~0u zSLz6@Pm!-?B4786v8`(>J688lc!Zozvf7*4FKIGiO^b8uqe8G-Dy3N$XQGTDKciu0 z$gmrnSM&oF2B`*1yJee&M(xi34~80z&*VDcx~pFeB^)TvBCw?@BH%6!F-ybz*kfch?7gelBk@*3$fZ6 zh{ZoC15a2G-83A-qdVB=@=y8=J%h`(phx=CSGG3y9dQ|W%}U^z z;-&Ag#MAy+EMRS)u|mXtUA49T#15+MbY+Un8f(O1Q#h)>xsNJ4dN)^yiFS%Y2|3cwS2vez z6&SZ|n+-{d%&F4&lf8aD%s#BpfgjU9aNjXUx|#@^RCd1eCHuE#No(x*4{g@2A?djm4PnWtfL^gjz?$r?L->5-Q%d&!)%$=8W3XXuYV zjoK4Y{+(SncQ#@w(mYYC#;1BTw(wk?CaY(9dH;$ypT~o|44tf+$d7)X@`zha`SW@1 
zFo1;T>O3X+Dmgv6V?9xq2&jJOyL(evVLeqrf1=hOmmj^|)wA8e)E^YOp^H?-4*d~e zH7?CX64yI9un%?hH0;=)2I@I)r}b!7RF9^_@}p08=jz;m5n_jW8d}5}IiIKWbHTW9 z@rAD1xjL;f*iD~`v%;S)Z<14^PzT;M%`QLJGk!y$!5W5LXV6 zsnVyJ2zJDV5XgQCO89k^9lR6zx2Vr}N5le&RJxO>+XB5$_Tu@_3|)nwMyeB)9X!wEAmgLfCy!)KgF&bVy2q0AH7n( z6Ky;$_q_V?@9o9h4R`uzV@?}?QT?xbtDQkQnifw6M-JT^bSN56-C~~Vbn<4N>5LjL zsgV9&+>zQw=H!^cMoNZLA%X-))_fqpk#UtYs!0_Q^HSp~alG?e9uVa{IUv>4yC$rc zPAuR$Xo@R?jUMSw8ww!{@JHXQO@FwQPCE7^E{u{vC+Mpx3kt81E=scA{Fs>Iaw*<`y!U`FC!%Yd@i(C-DAMn&Sd2v z=Ktd)^z~W>9O^q@cLlW;Wp1Jq_-_VwipIcDL1tDA6_|xkadxy-@LcwmtGjI<``=g_ zWUF|DU;FG3wy^oCOrEZpdu5B5@70g}`3S_F#7xYMn|01+%A1mDl5N!Ssrs=O@g5r0 z5J^PxWju=^x9`8<-s9-|dn}Ye9krS7b7Sx8-1}%GaYX)oRc4p){od|K>v>=6(`U?@RU{%|K_Fu-qZRe6jD_Kp2 zJPY(WRO<6)VUabtDCg=&tf_7 z-+x$jvSu=0rZFvvD``^Z+pAa_VT+-!yS^&-Ju1^xg?~6#**jdR-fXZ{ z*duMa@*C7lEhFy>%F6n=>_JBOO(<_z@ATlpjSjJ5WnmY&P=!K~u3L1$nl9%n=ho--mQ?BnkG+BMf#WJ8yRU`gUAj(L(;&Iy zzqkkaQE=%_?8~QGkgyYI^eb)C;?UuEGAk~Vtqth~l5k|$K)e_c{bu6viKslYtZf(n zCrp&_Bc*^=(s3lv&q4gP|E7X$Q@5{(^-eZ+nPqP;p-w43c~lr*;nh7BNxZ}puli-H zCw%~p-S7w$O_ahRm2GDz>j+zzuu-}QXX#uSxE}iL>*424B7REgUz@Qu;b(L{8x?(# zYP9Lfm`Ll_JclVUQ*0>0DV=K%LPq85ew-y5a2-@MP48bhUv6*)iOEmoE9hAVa8oym*6b73@sY<{&Q}|C9uUw!MVz^2sC?@m>E}t>5JGc=0=u z;Cl4WL(b>)g>dpWy2*dZFI^Q(ek^&e%{}(i!gQ-NHsXIku9<$9^5T&~@yXg}?WX>LXd1_3DZ!vd3vpZb(sNmXg zXCAfqzJe#f1zDzScXUyP+oUX3BhGV*=^4#$)8QzKmEa`|9*b$D`a z^&%5in_KcJAsX(ci;$4%!U%^qP@&VEAD}us3qa`kQA5@9cI)}8LC;qQJ)cMwP$28~ zMv$)LFc9v7O?3M!8VJ6utZnNy=C-yk*-;MRcvE}CIse4G&p7?N7i;f{Tm6IJN(#bz zwi&Xov%=By+vM-IC0_luzhDI-rL6tzcX1@dE5gX-Urm=}bg{Qc7{1*?J>-O$Qh>k5m;|C9m17iC(9xJzbhQ&2Gyi`;U*7!>^yM;%9{(kM`EB?AjlLXjV$R3V7kL*Z|Al-?{<|Dq z`N39HE9gr7&afj#SANPgmpuwyS+LScP9JyP(8=Jiv0qkP13*R*1zYt`4FZI%$f!Nu zeGOQB=^Dwc>==xAVOOMx!;0EwD};7gt$oP$!7NCbJj-jmY?@uH!XGYTw&vD_)v{U)t$G!1GxF$ zTNZ+;C;KtyH}%}RBcQJ5Qvqtrs1Q)E=t}!v^wjbDoIiw};RL6;Y}3s)aAfFy#@}ms z3j0B>Xb1%3avxWI>$w~uYE2K5kPOu6S>>UeCZO~iYhg=}^*3fE|*nBnM#Lss}@z>EDvT>K%&v&y@=nWV_o 
z6DtQRN8jM)y7$}5Xs7GP>hLR#W^&5z6VOirFBOef+Dt6&4E2I!0B6A}#Fh3OgRn-Sr=6?IdnawR;z^~yo>9^H!4d60MMOm? z+SooAADO{L?Ssa=lKm0%0er3U_dE{mNsPd$S^MKvt*bKs0oJ2bP60c&iXvSS{qcC# z#1sg_<~JchRuR$9pU8QTainj|c3sG&fCf*4c>RI*mkPw+WyRrEcaA*XPT{&S)iflQ zKE-_SjlZbSJ1N%JBq%qY8#+8^_8OP-vQ&yegVeR6fmAz zB8V&69As_Pdy9B%HAnb!gO_FgAuM<;oRf2_aaH!WY9srLu%%(ag=#DJ2kHZx?CN81 zak{I3e>WaK<~|D}LOQ&&2>I|av;&Cr;gN-Nv~wP&qq=}R-6w)y@0MP`{>TNM+CPBy z!*reg^{irv;Ps_14kU#m%OK1rf_el;a%Y@5`8PE<8Lu3&t)&4Z!P8}R_OHS73p~z> z%AG%`F=Q4&r3(9dTx0SdcTJ*a^5ErQSm_Mu5#PWmkdBx6WzEbF*gi4&EM25-+Rb!A zxXq3m`di_gu|HoZ$N&3;n@KE=C;v(O`rq$5nyRHBNlGFd^5p`$gY}C*_TR-Rpap_^ zk(G`mAEzAqPJW74&B@Kp$#<|oc*%#rGt@I*XBTRDLc>Cbj@>2B0hNLkdoI|?#2#c`7Ubj7S1^D)u?v0x+Jf9sHtvilxH0J z_C)TPHbh;yU*(=@3)PkTYwnrL{NA)C2m46ZFZGhcfCT?AWaE^csq_7_7HXXgJx=Rg zyGB~?Tr;|>M>4v9FgoYYqi}{d#Bb6=f5@99I<1Ssk5pLU@2K$gR=9%~MQnFo3}JxY z4*j{R-rbe{tt(xp(%+}_{A~}rgi9sdxb@qUxdM-tklyJTsLJ_dqz-V$ED=Rg7D;G>24rYv0^9G*A;V9p;AaI0p_reJ zS>>EanMZ<24ke-vC4@40e0Y@V)@cQ%$DgWU{Q`g=Coye&z1hTkU~YeEd5~T%4m*uqfhWj;F;c}s51CO6Z2xO>3eM>3t_N6 zXGN9RqPHok(aNQ!Y@0+VHQ4LCtn4(k_Ia)n3~-kRHJRV~hpMSdpQcJLq*}t%r=ZBt zdyYe(@WEu;ix#zwim?k7!&jtnH7$u-4nWwi!kKa2#j|C)>j;GHv@rFD?!d5n7OAlh0?KP5vaQo`IP!-xcf#8oH@a}WY!{dB) zL^{2{N#no~qRcsDmt|Mbu_JgR%i)h9XUC3o)c=EqWUe zlmw7m&6cH1r~dIU3uBBtQw6C?6-w43) z%=?<{X8{|LIN;4@uXj>ZP}GbJ-gW|bLnnJDn339{%^Z7_?4d8=*;K+p0j}C=lerk& z83z>Osj~vy8TenK&Y-^*pw2tD33XPAe_U+)&fkz+J-8| zdglZIA#lmsFhjI(1w$E=`glxcF0bx^NMa6i)10crBKAVt?)F&uTEg=Oo^(-xKY#<0 zA&j{vqC5?V!yu=X#8ln|p253l*Lfd;qE{h_S`btFNnYwx`Rvf_G~)>+v#CgCl2=2?tS2v#nO));f2?pKIII*mc$Wq>J4g1G z34Hl&Qg9;3F!mq>kTXpYBrA1JDX&PtX*niGQHvw$t1_?PIMYu)4JC5G-nof|cX&T| zds2Pnx?Mm1qOGta6hlM_P+o>R-9@>v$5p!EK@DvD2)&Uhqd=2P+^1X4o|WYypRe|I zo;KSVH-j)&=AU0eD^nXgmrD0LD3Z`dl1{Mfn5jIrzvd3-rYod1aI|JYBHQm$FU^$&JCLly%ad!dBJt$(B6BV?jI58i${SeMUm9)?>kWq;;l%4CHW z;2&_|ROlcDmH&%dRW;9~G*$?p2H`x+i9mE`Ja^MgRpI3@E7J9hhSa%oN&$8ex#p$O zy|s;*W(S+YV-sU-ElUF*4Z+W7cae;>N_r>vwY@aD^w2eL5Z`|enm=Oa$UgmyZ+a62 z)=W^Jr<*JkyU_%au(p%WJ7$hAYCpSU=D4EvlRIXTq_#=k5AKt-2plqqD4!52uzPsc 
z-LzlD6TcYRL8jVp*073q$5JKTnPb!|Yx7IMH~Sag1Agwo;eVl{VNI9i)#1!=ww*)F zui!d2e|jh@vh9Dyk`G8ddsMoa3_;2P(|;i(Ecb}kg0%Sh@#nnd)`EO~SPKfTV}e?^ zF^r$03m&wFYhjumsK|6(L=vy)77CV*^JQ{KWghR5-(=kZ$iXW-7UZpvOZ@!L5?dI{ zf`#$_*!vdnsH&^&2__O-aDsxO#X4%#q=F0u$>EzOJKGI&4h#F zo4XwhMM`ExdecU55WRjpa zO~aVk4%;W8`wK(8zXUC6&oL$bT0FU*_)j!G1E$rupz;{2uEy7i``pCqcVJSzh<{1(F<0sEzlBs;KB?Ewj$&W zXKL?ceUgmOgwN59)Pkn(p@Aa!8^#RM)7=ZffMpBo#MyGZ?P6>&;achW#$+9z{NqKy z@H>oAdaX{=tY(JA?eqi&>KbHe?ifGn|q2f$cW>csVK2vU2In62= z*Mg*cZp2f+7h{I)M7!zJkljS^_@vK~kTO|>-G`2lB#FhufE2s|2*D_w z4|Lb_$@O@WBB9=BR3FekZDb(8S%@e9g!p*no(GXF2!LlE_>W~kZj0rY{3fwxQMcH3 zxKr2bzu{sjm{7f2Fe{6L=T%Dum!=6AsuhwMIxJZ!zu0gQ*>C=<_d}69u!cgWx zLBc>D0HK?O^t=dc(wcZ40AT~OFC$3OJ%%oI|m= z1)c!lxEw+dXPr-AFs0_zaMVt8qOk$tv<@TR6=1-XF`0dcV7=;5Yf2IZsaaFfHkK7G z<}P$=dz1bYu^JA1!0ZhhBwNF4?6t-n5kn|rcldp+@r>==W-MhqX$ybN_Wl{(Z3;cH z?@{`jBI<9Td!Y*_HO)c3rayJX&9MXvtSA>TU^uQdo`5%-07%6ij!3j%%DZL;tZA8HPia5JxSX+7n}g-wLP z(m6X9?2VMJ1?!uULU_~ANU-zSdRD5$$RH=#P@iYuoJ1}}2!;rk7F;5gI{!9L(93xe zI0{`~zpk(zd5JkztM~u2IZs|+k7@E1v>&Vj+!!#j3)%`7f~M)XAV}1RezUM%>3fT{ zpmdnUQa;pIfyU~uU+9={xwDD#G}e910N!_=Z9M_!Kk&w*$CFvZ^a*0^my%4Dw!S{x zim>F;8o*}CrGpohw)K+NHosGvJepf>Uw|NeL69o&P1}POah>QMW zlp5sh?OgvKgC2z{q<=EfZbn+=iGeHnKYhvgM8pf91y1y``@tIMEX~x@aofS_$|%ha z29LBL&KScorPuung$b%2Klv-g2TztNuV1J?2M%WRkfQTJN=-Fr0_ca`udd|HNVrrQUx*iccXo&@K!R;aoy zbx}t<@qqs<*AK~r$xKD(58Qj=2vv%Za!I}c$<_3f){KVnU@+zbu=ww=)<9;}0t^Oo z1T1I61Wi4sZ;SBxe|!S^4ds1`AS-1H`arSzKcNoA>YLi1s|(f4D>?vnFYd6G{Z69=Iw^d2LLT4Y=c78RB^{4q>E;2Zi&APP** zzyzCOva0YzHA$w~NYl4JhRkiOE$|w2GuzQcRMiL_7~K($z=5>j4N$A}ICHDHf$4F? zN|=-!?v^ezyYyLf6bgyO^Df|7zvoX9N8nH*_JPfH*=l1b_Q%y4E&4s*7QTUYauIH3 zoo}>zRh{?c1fhiC-0!nZVU4hLlpXw`dCzVGx*O*?NiYeI;=_#%y3f z?q8OGihTH_BoUTiyEASP!uiK{vU&dDwOL-M{7`~9Kl#p5`4^e*Ud0!;@v*LjCF{0G z#QRCIxKjc?X15*qE~56^yuha)wu;2vDQB?gD}3@1KjwWd=I5akEfay8ZpRln&TnV(Xyj;rbBKaGlmIhrZd>lb|~tgLARiZl6QQSdC! 
z&@(UnoCsbou-RIMNDf2dp;L?FV_N)+m=^z4T7hFwom>18qM}-SC*X-`@iD5!JrCXD zyYH43|6E$^Z2bAL#Qa$W_0LPqaIYBXiPG`;IGTZ0KFUveC4Cg1a8LS|C#cZ zim)QbY$T~)Wv0IyAI_zBGBsA?Ogk!Pt3NU4b@<2VuE@bw*~^GW>9W3Pw^`A-guXSk7%q280*3gGw2eh*T)1l~ zB$2v}O?WZs!-^~}2PHK970HQQ=fRp_!^CVxQZnuABNgPmo3XyR>1cu{05|YIG^Z1?&v0i!u_t|HPsI zOTiYj9Bco5cz|OzCcA_cLpx0V$Az*Lu|*QVL#m#J^cliokwkRLjKX`0nH=%MtoVF0 z3`_Cktw@JB9S>M>L-hu&OD5u{;V5Pfo(mh3xq>C&T=44`%gpn105TkOUEVmfK?zD?B_zFA!?xdOQ`)g&f$Z>5S?f>zQ#dKxg9D{#9fLw z94N51L4)Q=J?Mr#wF%rOVJ5`Z7UEa6ss0_Al>lX7+Kc^B%^Iu z)Wc*gW>~bbpirM}I4lr7EO04=(lJ6{xOyulx5F43E+CCixvYUfqTNXRV_*pVBa+k~ z9xYoXQU*oTcscmMkb%?H0I?4Ysw01FVC3A(zCW^oA)2ghdY}7Pgc=yX{*aUZ*AOMm zrG_qoKO)VA2F@2ln!vh24(uUZwpg7_u)0DajKbQY=~d{R4QQ0Gdce3k+*#stkc(}W z=!FgiE&QbUZVifvoh9~S*Tb46KK)r_miPqkVb;z04safNV)%3PfaqX_)_`!S>A;#8 z(!!;4BvX2tgw*x{!NTP5gl#~4h_FKrh({%(Z9q&$V`2tGl^PIlfkY2FAVA?94j9Qe z)p12Kwwe@g=G`deoGwW3LifRhybq{p#hE9izgl=wQv zKgD>9){4vFVtnPA0$3!?+ifqOvrKv51;iP*BLd|O054kpRlL+U9`C3x2M@i}tgojn zK1;>-Wqg5(w=aOle4uoB??I=>w$9PNybTJKm+@)JdE$AJGeNmOaKxJ8k%02EOSq z^nXbZ=56)8QFRHvkv@b%G4S+Oh#i~&jE99F@Ny&DhnT-oAPM!HfIV<)+vo9D1wn5j zNM9C`=v-TLNJTGY^dAsyUX$y^g4(<$cf5L~rmzgFIJqkKB8(yOr>A$Y-kthREd7lK z@e!t~V&ggcXE~;Km>REq!?+Et52wOOWUoVoKZVAbl|KoY!L0m`>Xnsqy(yJ%zC%@h zHqgh)Ygzfp_y`wUY@CZvAmZ9rA{6@o-a)Z9;-UZTCUmmNY>R7*MeMv#&mvoDze+uU zspmzf?vEA_*+Zb=!Ly@SL*o>ShZ%ZWm?67^y_3UI^o<{bUYYt5nGI5q1-9MB=xQY+msVqP?FvrOoiyB}q zBH)D_H+lZiEYB(M(L=#7mgvA|m=21K?x;XtjZvViXJQz@*+{HEc(B;rpg4^9&%xx? 
z&^;F|jbcKlAOK8g#!GBt&wKH(m{_6&<$bpSIQ9X@`Yj-F<5r09s;J+gP#N&+W6PY4 z7?gRUDpRwS`3wpJxj)1*T`Y4b%Y=2seOhK@NJj2Qj`yIX3*?ri#^HrO*=U z`Pljh_Fu<4Fu`k6^p6&xgMY@+DP~_Wggci_PzS}KGxNZnBox1%`~YhtOY}7AQ4?x7 zg* zfT7#+QgMAAIO3zdc9`3mAJj28=NjWU3eA4Q?LMIAOLf z5xsBx7<}9K2>T{_F$jkx-vz=Br%F)*x`6_W3X=V=%v7~VrJo`n>vS(alFW|@{5T39 zoQ5vMM?`6P;}lGP22CZx72uQ&Rsc5^w6g+4=r01N%-M^@Jl|w(;;c1hz`nyWx|?o$ zud%&X+1~H8y?bo$0o(hnw)giQi#-7i3wqgDZR(mngsnu5cL=F&E9>3Y&qUdSVO~ zAdvLXAez0^Bx!*=cuSlXybzQ1eJmY*FFb*C>=WwR%`CjChn#XJ^~gt zcPvKSx~9AEe-eSjo+T3pq?hn$=GVF=xZdJvBgBX3XI}pkX2s<{V?ZeDB5XXWYD%{;FbKjihi79DAbgXlG@~dn7lohCY3cQ-}O!M6_ggB&K#>w_Rf;Rudv`i6Z4|8 zn)wLaCoKxxw^3Eflxy20Aggq*I`G;p-MNv`z5C z5RwJcwKa{vo3E{DBm)p}7oKLWzGfPNZ^E`1L4fTwln0H+eQEM}-$p(H@3skcSjaL2 zfG+u(gI84%iy%f?@YENa;&A+03-vkm4GzS-yZSjCzoEO+*BV7?((l9s=Ls&6k=ei# z$T&?A!Z{Wmmy!c5Ja~NE4WzipZD6}uB~CL4q5kW7l#u(qzXcajNHM_`L>7p84xksJ zcJZ49y@<4+S82}B6~wi~OQRY8h>Dkf9=d|PF7b+<5Fb$S(vL$|uvc-t7bZ3g9EVkh z7io<17#Ld8j6m}B`_WosrYNZM0eO^#(>t7ROx`3%{dPQ;f%Z+r?(G!?lL~%QP%_Dw zhcXwTkh5Uhpe&uR8mSP ztKcKWf>Rqvb$iIHSI$NBIVyUD8T~UvgX&2RN5G+fKw)+c{XV~Gkqk`(dN`ItzasKw z!iQ_sRM*V5*g3R)a_f>q|8R{UuL!*m!=dM6P(^WQ#el2{@ZN(l>MsCN5iIUQJtyP% z>hfZRop@jeLwCwn@arw}qxtojphSyb+vxqNd7$@G5vRX>trEQ4(6&8>Jxt4L=QH!2 z;WNYUZN~SuG^fLGkm;Fbt@#eY#`waPX16U(i!DvJ2P1X;8tKjROFX6PTnTHW?j4hh zBU`7kZzQ0}Dc^OXfz2Lt}9U!*S_$$=6a=VTAi(}&5G4bg!@wfmps(jdw@f+=iFS>fv@96kM*(Y>t zRnKty6}Z^#tJqsrFiyQUgo6RSmD)e2Q%e6N69!wVOFE~*fmeal zma2c}RN?kC6AsGrhiH%BpY8D95H9UFVt#tKK8c+i9c$tlEtA+0(XqI-!-Q30lcQrX zgDIT54i~=s8bNe4eIP7~KA6T%9};h)4~e(Yhs4|HL*i}pA@N~+POdpT`cU)?XNR5W z8DUZIGL0QxiMPQk@iurR-UhG4+u)V>2)uoN0=)J0q*a3aP?%^j#BNo=pydVOVA8S* z-srzB+^<$jKW=WYsMJbzY3Ecpfg77Dt#hhAW~z>67G?3%!=dN;g~-rFdsJ|z_L!;d z?J-l@+he7&wZ}?jYmb%6)*dTWq&*BdRC{>ON~jN%2PTW0EbY0O4HM*{e=a~+tAAD) znfdr$@2`r9H{nIcsrWH5@!2u?&y9&cf#19rTsZY)i!UOi<73Ppd~S`*30?DdoJi|5 z9(YR4&fl5FJ{~0AHXbD2HXbD2HXbD2HXbBCG9D6t!g$C}>jGYcMZwE7c6cS;2Cu~1 z;FWkAyb^DNSK=e^ru`In=X3!t!lK}18auoaZ-ZCjZSYFG4PJ@2!7K3*c<1~ScvHH7 
z7hzHGGL0QxiMPQk@iurR-UhG4+u)V>2)rpj1>W*5;6+#zyi8+fVW7n*<^%`XrZ#V_z3#V;h@#xEq^#xEq^#xEq^#xEp3nqO##%P$II z_+5S%@FOe=etbv4FYz|`CEf8|7U30vPu8 zFvHG~285y*Ho{u{{s_ax_j>=#2*Z{5@iFm5G4WY3@dIMwWu6H)Ag5!i$nl4QA2;xr z^Nn-mi^A*cn+ZD^_rsA!c!Kj8?8XYtCa+T}U@#VTVL39A(myo}aX8fk6KGqiyw0gI zI;X=x?1 zmkRhT*eSAU-W!vlqJSun@XS)EkZjub>+lVgm7LqAQp3`|55pE>MEmZEImZoW%+AK& zLFeKQyi9Ms;nU^qy-mJ@vJ%1ZoQ*T6nc=x|p2BWnr-yL9md3poV^s@?-i0?tO9ze} zD1r$%cov+yn6wmPE_D{XSv8^>S;W{DpZufVMNCoUEV$SgVgK3_2j6hr^&zvRntv>j zhW5v4@aM%cmvusi;Y_6)KGCu8+^<<3L&XHU9>wc-&XFRDQ3L=tt$dSMG2Th-{3Fz` zC0Z=+*d+tuSO;6;1a(^Y*f&uj z8pAs)B(pB~Gj(UcgMKQO_)5isKBJ~qe0Bt`h%ZJ}FCaB>YITYJJAgn9c#0I8wHumn zXjHay8+oZuOn5LJ*jkc$*rwV)1@_b&4W!e+?kJOLq^G_-3B%ixuen)?+UMw?p5Sa( z!>e)nN#)SPz>_4>@#CZ@_-nL9 zD$kPn(gnTI2*TV$;<>{~RqcW4*zsHu5y!3bUHQ(XZ*f0QoF|SSE60|JLpX)F@aecN z8Oub31KiG@A$I>=NG8TJM2}?I%yX_>SAbh{LJ2+`F{GCWj96zXT#SMM-gyco28Y3W z6%@q42l|R08H3#mphd+wAd&%3>V^6|%bjY@Jk!|E!au?HFA)aRbY(beFRxyFkv;ETwsEO z@hc}FqF6D0xB)pdwp8~sB zel}x6SxqluFkv`;Fyn^~^80Q2NU;9pi7Ix78M_Lxc5cowe{gR8F)~NE`D%Vqan1wF zHg4X*jm_T!6yWBsPYQ16X8fScoJUr$tT3lC+5ed;nNMD9V^QSh7uPYIj10CS=xEc@ z*CX8g8+1byH(L!5=)j|j!!DF{?|MG+_T!=JRnhwK;dcHU=D+ZN0j_IgAyQvj7FKdw z{C9QxP>b76vR=#^Z`QU$NK^dN&LY2>qFAH>rr1^dCd3Q3-KKv|+5#!BL{jd9JS4Pc zHa)7{KLEwVg`JVFuU5J1k(;2ITviKKC(*lzv;DHjctV9Z+6dE|!W>QRx0EeCe6T0Z zBSdgug4n+1W*=S{b0%U|7PIF8WnU)4IVLe}!Ce5ul5uNH!#HYbt^lYVO=ueZLaLiexK{ys@G)BYW_RFyF}?8P z&cZjbP6&LBl>YbJ!OP%AU~H0qlh(AEn~im7<#~ED<~;aYcplLyh)(lA&1evFU5Yz6 z*QM91_Sfc=Meyi3v@mhj zFjxfPc)`b1QI0 zKK^#P)7KOQK6mehRm?6=;45vfKT*6a^9TUm-qqW=MTcJb{VI~DUP-wGyVC0jJ7)Mb zS|}f$)@L9sPKLPgAG&6rCramWlF(`n4F33f*u|f#O ziSxq3zHt%kH_gW+%B*%5a~AO`a^FzgDH@!udxsLWgJO3E{2zImyv7D z<2+!V@5bDglD=7AMLs~xknE5@pC37(Bushwd{ChHc&BArim_Kp1|Z*o6orW|Lz(<@ zQkYp#$EJni-(TBZZa<2_fiN-9@JN+~9a)E$1%5NkX8plUbAQ?^Ihz!IT-%Ij*ki` z-;uYkQHWY}|0Id3V${`Ul*{ObV%6NauDu^$Fh1L}Ia5H4MqX#rZPL3R=}i;O`G%s* z!}!owg9n*e^x0hhfiF3Et^o-aOtzLwtJ|Ma6|8Pg(2J3bg{QD^m+x2-nwsb?MTwDW zRg9YGh9L$;%3O!pLGO?6_S=|K#rMOvaXFgEYK&p#3;E*a%b63S^&}Hg?uygTOJp!v 
z``w|0EVL7@Zk4^GZF*4q7ql^YOABnEPb7zNHPDVuYQ($J&wx06-Hks3l4*-78}2a^U~%|LGDhCMK@K!V6y08K;R-h~S* zlTV_QNPxm?Z}>7G$CL!T-^`_yMUYQ1>=H&Gi^${n> zGou$Wnp!q=O)(~UN2#XK3a%jPe7Q9K$wo?^^ zv+-O!{95=!!j*rvEgl4X!HO?}6?NWS77tPs1=E`@6a0Pv{6cV6Vlhyo_l7LYQL9r> zz|6eY>=|GDWOT<;Bv)xw2kT9QA@I)fegrX;bu3yA&URo05cVjTb7TlpzmvlO0fG`B z;ah6LZ@?3VAW*+yg2jl|w%3`A$oMVFRPSqzKbY@ee8zo|aCR-%OsYgVxQBHT$erK> z6FPvJsMcjudb71oh)y-_JxLqZ^lU^4I^}Wl%Mx@hnKWdvooc00EYkh?moi@TK39-l zku>ulN&6uwS#EEMS41Uc9EAv@7)XfV>Y)6M1}}+r`<8?O(8Q``3%dnEFmx+(Cs4*J zmn%)!SkOB<{GRb_I#sKbOwd6P_$%Rr10EPp$DO6mqo8`Y7H&KP!!uZz7%J`vj87E# zmUQTbQVhopDmf>#`+i0L%XHBS5f-+M*eAeC!&;YAl!1zn$p3*#j`A_Y#UJ8M4c9#$ zFK}SyB9O9LlKDl=5_`t|&@mBc;_~J@yhm^%#iP^(?GFpTc4O=m126@I2oh>V2JH@7 z`vfwtbTJxz(u@z|SfE(V|DX(*1rAM^hbH@fjYj);DULLl4%3`XPSPp8?PI+dEbglf z?eioYF8j}hI?CWp6zm@nvUWLJ3ll~068`Mb{yeS{cm=7DyJKT}vu*taio6~L9bSL6 zkG1y~&TvHcR}bjvq`wAr>aTYs?T_>qyhGahYd$Lak^Y)0a2%w+MkA6`JqAzdx0Us% z9Gy1+0n$%qzd4((1ybAZ4bR7i=)dik*!piPJ`8%bi|)Txc1lOZcXJ)!AI;wjvINfwqA=`EhX*$|AKjbv4H;7(!WU-~%Eo)+Ll zjhh?yIcJ~GHKRWrcYin*j>U{KGO?g7J{}~AGQ#t<9}Dy~ebu^RwJB%9Nz118zZ37*k@yM7Y-1kCxk5^%OtDCC4M z|1sq&<7_Y|8@+;^h#&f>wKQ5vSs@_f5#%gx$^t;7C32%hs3-ZQ^P?D)=;n|zsN+<@MWi73BwAA9QtT9 z8<>udDKLTPUajxD_$QPnZ9kg8E9LbcvI&2Kq%@X_SW7R;n7|KZ|h@s<;kfD6@mb(;z&2Z$)T8fa%fXQb?en z_7&{99w4(GNbS&rpaZ}O4n247aYz?GmHO?3UQ0O}^c;eRrQY2mL3w|_ z9R<7#bm@K293@lW2+&63*=q!a(4#X@>61L@B;Fh-#F}GkC4^IH;#W{5qKQTI-(&Zq z6709}J?BEY>(`$JFgaVCH&^2W%zg_JUcghl7#=BM48Z9oh4F?pIX1#hN;wmlUm2S5A@#y4L0Q003w*4qrk`=xiE^fo5cf`$?y~ zgf7yJd%&Y3EDRlg{}OckiOd|z%2OR*kB2^QjQCT?%Y6d){Ra3oczVRzA4jFuJv(P| z5f5R7`%@GU_T7o&1Wi8!JaCs<-2;*V)i@Lci7{v67xKkjD!G9~<~zASWUe?~JymQS zq@*XMB$7Q!8i<1Kj%2+BuaKugJ$AcuUvoB!3(UYPGJl~5%==$(x#(KxT6g-frP6cZ z<8cn753+GSOl{+`EDutZs0l27$ovsU71VoKtvVgB_(!0(d&K!45it(E-^};`Jw7LD z!5+!&i!|HaH^%ws0~22mr{TWxOM=hj8>Il|cv64PV~wrV#*1^T_qE0=co+U}TXW0z zN|KGy?Lv$}`3~#fO8~?`3Yuqy`CC``pwzO>9mK`H5Hf!wH~vlpXMmJR`SH;U>{Hyq z@o^lSt@~5Dzz1;AewT$$(XRvFs>SUag^2)XkuFLQy##LH0wTXRQGc=)JTHRzhwng5 
zXQ`U7+c=cN&WY%W+z&PFeI}aoJ?k0T{(89mQXlkgvt#d?b_e#WdrrT`%mR%qw-9j= z3_ax^048vSoxAnGQM_O}@Pa4M;z`}7c*g44neds2tD)hAoc>|rzSmRG6YjtyUJ-2y zA3U&wW`RG3`E>5P&iU71NZ`6<=<0Ak^lDGsF6aD-WE)VWs{6{#TSUNy+@WEa*x&d7 zqz@k$9xSsH^%nGuCwD*h$P%56ZHRG)F58FIL?Z5(cjvBgHr|Rbb)oP=zIAb-FfLE> zB!JiT>JcvsQe0U6Wz2=;?ocMKlGe4{@P)!28hpqqB%}d^xUYFH^%|%y-=}H@7{^uh z&dL{XaSZQ)#sVN35x9s3XH3y0z(>q|zaHKrPBMJ7{caV%oB{&=poQIt>%?JX)_X15 z0xtNr2K6)a>bQns52hn6*gLrhn@5n~-0*N;B-d)iCz*=+_+;Zb4z}g{RB?0LS7NI8 z6AZrf3XHdC`uK_T*!MQj<8Vcf15J87o8%hm1Fj9gWS8E5f{fFN#;?z})i-hVL{hAb zhoSBNknt;b)0y;3dd0T=vu*#zcVHJYNA!2G{6lLP?(iM0=u?}T@D6rRLd9``J{$3Z z>*;OD6@Re>D0n&3*x;iY%l;jk{!rh$okHptbk0tTa_t zx+-fP%ld#wiCvEf4vxp{O@IO~2zf1hL-n`tLzF3Xo*u4KPsX5#PHv7@_*-J#zf^TE z!8_XFWZk8xTj(LZ<;UsmQ1qetH)E95zd+~#;{o_Kz&IhitEfISZso9*`p~`}k^VB3{tT=96T2w?7c4(hr9WGxA7rK9 z22|PTTir*rCv5J7g@)25Lwu(F3bLVADo&yGPJ;rL(}`U|H%%3T373Yz0m7>HV8So) z-IaCahc`3)pts}<#sU*^e2b*AZppceGo7jIewgo2b`vtC>|Vr>V8VE0p&EH1SqlDQ zEo9IH2)no^hZZs}l6WxT{+k3y-jnluay>HHRNcj>5vuNSY`dyX%x1(qQm@CTNB&&o zr-=jf+)_14NWzb>LW|vuSHKF5Gu7YxqFREbE#f<1+cNpU>|4jm-f3bd2Syiv&(n90(tWV6ga zvRpG+O%Ym{LI*PT(yy zD0s&vHC4%i4MDaLnrUwnwxw)lIn*#-m@k>{6{Lq1=$ihzbOuaNU{9FO?y$@W^HJ3- zq-K9uov0TI2#RhoyyHc4;%r*YTIQ`G^+#{lgy%!*)B97rFy}}125Ok2gQiS-Nn<$H z1TV=lAs=n;6#pXQ8xJ1;2;;dN;rhRe`~k-AG51NS*C*EZLoVKpnsoLz12PFEIj`a}PRPC0pBmjP-Ik5H~3R#6@}s2qYnQ?v*8g z3CWK}m)aEnnw}>^47~ypa7ovIhr|P63IF`98a4J_`5OdbP`W-H;K;H9cfH5yzh-+c z6r7v-5M`Gst6%5~UMD|~<>mVEm?b-{iiZ=NP{J+q(JHKpAq=Wj@xn=6Es7Jvi(-&G zR~%<^>yJIcDM8VU|5#25VPaXWpK~5yH|21-=rO$P*PA(zaE=^8Bv$FrOzIyB+WM># zEN8WVYjJyPOF^XfjEb9eVrtB2SP|wd(y_YkB&z<@%ppU(|Mf?t8wXDBtz9s78 zFdY)LJ&-ms;hXbeDB%u}5GHtq-$Tu}?!dnEmgi1JmanzPu(pYnw|?rkdBb^NL;8!x zFbX$efy9JQ_K&BawLi9hoH5Lr-!1zj-2Dudad`9icd#9 zhq1fa5y`}YPMXA8oJydz8v6K8^o^y4vvHy zF@nqu;%h-R%bm;|$~yBA6+NBNhrL1kF)~MO5U*zdO-om?*v=coISN7UMnMqfOma|x zzw-{UbwykYi{kt?BUiBdJg{ryq~-e8z?XdsHF|^i4kWiGICidHMi{U`Z0Y5o1B2ZC z!JiNnprCBcU!2cq3}xRR&cW$V78zQ(hZ*?qSN*;^bc&T;xE?ywV*fIn3`bYEtn*+F$HnSMKVj%md*0k)>Twj 
zm2m!q1ifdOw-yyqF$-}jr^-Kb3a`Q#MmCgkEMS0KHQb^9Nn{<^0>LyVyak>n@Qng| zTEq9SnS(SA@sKLsWz(u_XO{X(s!nGoZCy zY^k7q55<=J37V}Gj@07WOcSm1kkC(Sp%T$g|1zSPJXUH*ks8i{dYYb45KGPW$#|un znbL*{A4HEUfR`|Q8|w+*hIkV`XVdqP{Rks412y?}`I5pu!K+%2uPO(k^gSF7fft@5 zF>u}cHYf7nRM6R2ml!5DVMZ#bzb<*Q!*>P^go zvy_>%FO#M)X-*_*iX^=SNy)=xkE2kWKl_JEfYk%63q4@z%y-OX3yq`MLh8+Xsd&l* z3Qda`0NJd=hfNYTwFqiy+IluV2wHLnn@EHt35!l<0M`iKM{=I8&a>23I3}pWLZq`1eN+>1~12d{A=McoSSapydMt2{oOSZybmzrYU&)AM*#2r z+uXT3oQ>Vt9!@y?YtfzqdGPPlaDG>N573LGeH%%jc;BZrY)^!S^!hKf)7ES08?kTY z`$%DZOK=Y8v8G2uN1PAJa;yNMxE`8t0|v9RNfUZJHxBmV!89#63%2E1KU~k40Aj>D z4GjoeGeAULz#&G2;rRusRkzryto1?M-l^X+6cz^9e3E(k1{AT96iKo5DwYRUV0?>f zzpVU1|5{)~lJQI0exko;=Ab-H%Qefp5asb6Zy*b%ErKlH&4)nL@ev7A@O2pwc|`ac zBUssn4&K!ejjxxnF$w3(polQOewTh2_=>)Cu;GO`)o;#5+KnrB>Ef0xSd9KFm+Re5 z)q)r@aM4|rPh&g_qjENVfEJ)0NyU0z!M94zdY;5i4sIMla#hbP z*7K-j5!Pg@XJ1r3A4tXtk$TQo^++n#GdQZA(USb)a6QMep5Ce+axYsw!py>W^Gn79 z7^Y@>l2tvDiuJINNP9lKn(f(((KMzXAJqR<~bC#+{Qn5WJMAdVwBzJ}Dd20ab z`5M;S<`xpR*yEE6xL3#>r65))78iF#IHI8wM{3XM@G>j_5FHz3jx&BPwXN%TtIMJb zrRamj?nPdfBWN`KjLBcV*jVES>uVXmJ_l_}90}FP$7q!=l9O?_^|hW~-{4Ch1AJo} z`dlgBc!_&s8;$qWqnir+RDIor2a$w{g7-FJJM9Yr*&Z^%Y{X{cShKO4jqS*1EiO8K zqgEfX&)V>1T9kmDqB@s?qc2m0MhF4t@Djyf)`@el1bz|QX*WVEss8~pM8ljU+}X9- zm)3A??^AH^B3i_mh1MaK?(aQ@HLOXK9p~`+h^ICo+kTFz3lhyQ9T8F96G#?8Hc4c8 z3ageF-;bheBE@*UC&hFq-r5{-JVZ8+l_rTS)Nl?+stO2Hd@j14ezg%{K3os0u*CR8 zQ;a_a_P&tSBULyX)t#wUHT_skFI1!7+CRLOGg>D_-rvA`WIj}jxqV6{Q_$h!cdR~V zWnU7+3%|1TEm{zULCA6-p#l1Op8J7TDl$I6|dSxK*B5 zfE`Z&c*=0%B%=&viT>AwsWUzf4Ug*(@GM3T9xgm@q#PU`j>O2S>(KTXeSQC)Jx0N$ za_){7i^JVEw0v&h8f$P!s`|aaF^#5iObKRC+TO3-;5|5Rxe?16-@u|^GAu&hazh)P zrsclkOJ@LM`bZG-tY@Cww|rY+L;=&tT8H0CGt zBm`{e3HHr8PaTk~$Goo&NIr@;Pbl*PQX035_5fB-V7Ht?(;q?Cng=8?9yA!D6y<)2 z3)PSok~WOtcp3~0P0Y^m;v!wToh$n#6JJ~0fp@gMdx7S+ab^pZaO+0~-zY7JQE*jK zdp{ZP$7oG2`qCJ{Y`d5T9-vOua=-R}GHQrB(f__@XjyWi4~`XKlmR=p_Ir>|v=91d z!0SeLnuiRZ6J&9r%j1UT*-4yH#GJgaFi|fjN918tI7UB@;KR5ksVMFME$OIcNJDm< zsDrr)R&hsr0&roEQ-@N!e?G>b-9S*C@6(uL6}Tw>oUmZz39TgOf)NO}noPX@5%Z&k 
zSDx-0b2j_MpA>zt&vzwH`{BqRZ0*`lH09iwbAIRB&iN&|_kMNIbAC7gz;k{#0qOR0 zeipy5pZ7~K&-*2wNvOEc4s79}KZPK4h!(g$(P~ZvUrpk%S(q%p8vUghA0jT2UfQqYrPz&pUH(InJ%@xImztroq7uwSTM~_0pC+e*QwnsZi5}Ge|!cz z2M69#xX?uUMM%$u=ROPotLO(KtX=Gh7>=R(nYpJxqc3;h4H~J=`!a{)E3jk>8f6nr z#L0hAmFQ!`cR2RJx3z;nJ*{;l?M;}hMs^fJEhvgnTh)$&xVVQ!??CL8@oERU$<~5- z`S^r2>U%Q$*lodtGZ_?&-zdjCk@8GVl|h`wy@SxnC_KxwpW-;g{<0*98jH@RomiQ1 zJia?Bry|k+h9@Ao0l-5I(Ab8uR^dT7_7aXz{CnAk6P$u*{nsgUNIHr!6iS_!&4BHTj2OoK1&9-B&$!t)%H?SP?dy>#93e< z!~&%SfhFY(StmCIunK@s%n)$>)6kUyHUODyWmMKmo`MbzVA7E!|xb^tGh zLNQBJvqDV5OeiKFug=sZOiARniH~=vViDOm6ATAQ#CJFVIhiF8Dl_Y3m6g#j)PnKQ_6aNk`mnLLny;8mRooV|`Bsl5PFYXj5{X6-dH3ETHRXssoQB{|7TM>QsBhg5;_ zqJXi2@S0$-F=j0Q;aUih)icsq*)AI#X<;}qaICfd%UN0%PobBHeUH zK2~TH9U_SDHI(f~jx?2HWDXFD1qyl_+cWHKL6R1@kOFF)>R4>R!CQS$T}Z)EkwUJi z+_6X6BHr4`S1}4t!+s?vo2S6$m_Zg+lm@_KS_GozX(+1;y*+mJ9QhgN?7~j$*vxQG*Bel$W9qYd*9g-nbFbVgL7nBg_V#z>e>bPzek%7&;R|yM1344TKK=GbPoUb@A%Sf0R$(05`vlhIM zy^2eZEsyvy1fp*NIG%v9{CrB0p{#F%eB=?G`u?-HtQy6(oA)rp(&0H$VZt-$R+5K# zQ%*o`=`aCn0ob&HD{Mxi$0DJ+gTeF(xX!vb$9@lkjSf#188zV^uOMp)k#T4`R2-#C zI=oUL{|uD(AE3kQfFO$whbUS)3HT!&-hUd->GKqLhko?wAhkX!dWIQ2AJIoZYQLgL zZzDBa;HOB9TCmQf<^g=b`lPZWCADi$k^){9bVZR`oG^%=Mrt#OE|3~EsvxzSu~!dL z8;&F0`mGp_-0gn_+>p;S&NFH#nlq6sj;^9AE+i9`ZTgugq2^U2JM|M>;O2LeH(S?} z?9{t4>?@F>o^oMiWIa0Mpj`-s+FFdpf zOYr2zT*WfHFOUa`Kl>CSvFkJ^GmYY8`njNWO8iG7J?Ul#d(XU_L_Bvz4P_7f{z31L z0l{G&;rM!YE}&5Ds(z}W#xH=j@yr_>E-5Cw&X1Mpe1fhZ5`!fHn82sY!} z6YBpjJTSa=>YGm`j+7H6ETHuHz@sNL9xHf1FLx2C2b zq=BhYhY$YVPqaI+5l=PfZg>mIecN1<`c7a2CMCj047IS(!%-4jy-4fi)5g~W>)bV1 z=(I0{et^$-<5KhjO2uSLGm9KMxIX z2T$;X=D<3F{e%u&?Fr1m#^>lH@iyC&<32DJP+f}E{aCDZF2zCT(Mizcz>GxnChm2k z6PZ;sS4flOrnLpu?i9&V+D4|J0BDc((}l?2!g{ai7D?U%$eNMnj(Y_g)zkqv8(~%s zkdgi<2@3ReHvj=~P^bbmcEfpR)8Co2VNHtd?4`_m1dy)FoK0dT7aUIKWpZ4JqV^K~ zeaysiU8!EutTntQf?Dg9WSG9 zrF72I^08BY6yqc^1XbRz!~oQf=Be-AA1uanhdF$V%1o|TC(Y@AW%+Rk!4#i`e&-Bun@oj}hW6@xF$u_d4eVF} zb0N5-plw?{Gr&*`e3G^~{Nf;f7cUXFna}f!lC=z2XiCHhGNu;F)g=lCH4_B~reR|< 
z3zEgjk2^uWHAy<^pFkj{UNxPf{rbs}8#pR;Zl=PL8un~u@Rkzr#TTPrpGR_)wlTQ{ zHR(aZ0K>*ftTvS8Yv3bq1RnZg@Cr&c^yp=c2ii3@Gbtv=$tOZW8OBk4W3TY2Hm(|J zy{|SFhTqp3cbe~*`;2>R;on7BP>@ct#gBU~=EhpE&;7E$wqBi9Ptf#V7-u>|Tt4@MS*L+-;F#sgtpI_k_3p=lhH^ic{RKKB>sM;wGYm&it1F?@ zPt9BJ#}X=az$QG5CopM<91-ku625_bg-me*nkf1K|2u`v0LT9a0+Lo8Hre46nBIz0=S28(Q-Rl;Ayo@rDPw0%W1iI2B+LkHYYjTuu-dsmrm(no zapi(@;p0L;-+?ObW`jb#C%OaQ5%^b09iTV1VIKoFYYpp&`?neL9bR5j^{p2{sXuaa znG4>M=IoWf5x=eY?ZRmCf2akW%&di;BFVJB#CE{P>E5@r)*apP@PtPF3zFsM-Q5sy zQ@_I2wBRAs;}%xD+IUomrXHX=+{<2hS^a5yA<%7f@7>B{)%fs)dcR#1?B)*Dvl3=p zk1>8L?`nu^(ZB~ny)R*#Fo49NtfCWJc}N&jq!#M;Iq70J}jeC4s4YUV0S@e5JYyB8jP{#xRp3rYqbK>V41EiZqsg83@A^r7v zq?V7KPvc?SEE+w=13#??x*H!`<7N5uY3k3zEme5XB+ohztfCv*X3$lT+K&i)2>Lh+ zWr0%kb9MtOZwbe6!oPN>I;e_QM0uNksoH=7U_d~}8p?;q+B`3eJ z5E8K%;bwPI!}p2KhHd}~YmKAY1h#mBaSPIzusIu#Q$V+0f{0nn92EGpz{3Iw)^!Q` zhd+blx%8B6fC8&ZVE3Wukg#)9EZWn0LiEd!t4}6tfz2 z4a&rlSt?oQKx#)}<925w?-^LZFH~?DEBJ)+c5orng$m=eb@2vc-Q2q7DjA$+V?ztW zH!(bCi+{TfuNX{bf1A85bbTTyXx*p;3aj95+hEV4YpMG}J&sCmX*G_5fwb`^iw)-8 z4^&0^BM9Ny0)5pnRC1Fd!Y7!t(11s<$Hx`rR}ea2AaD){V9tdcB40ti7n)x107sMPz<>UmP|HcBR^x*c$mkC*PKNCdgO(F9A96G z;~*nhcPu9Y7QRf=mPPtdh+(*#;UnAmUt9h>Gk*=R{1uc)`gO;JNy0Y>f$uYLIs$>; zH(5!rUV19ndtjL;(1n1zn*37S;Ky(SILbY=`G12(uPE9o+%X zvo;7U`wiuRsUYx;+yw!506M6fh;}v}-;MM+yN@iXWXk#25&l=SwaT!+(w%n7YpsJ~ll{VZNPs{dQgrG+@1?N1?tna47MeX1)_ zOR#4>9}rZ|Ntnhrk;Aq)7p6;<6jr+PEfjyZ5C)v-C8_0U!y5ST{b$1`r!IUSv&frB z3c2+4NGw!b)G&FV~HiTGeNM=?K7ll5Aq>R3-Zls+72cX zIZP=MSzg@6^vw&B+>le;^Y)XE`0p)fJ6eX>Ud#=~%N(bWpE#XB27U|vKrkPE-d#)G z1;FNuYatbnS+?~H%#)kjw=B|Dh39i?{?+zwGpBUp5|*jx`Sl(+7J)IlRpuxFeXx3{ zsj~Bnp8i=lxj&aJ3T54acMx_B9{RmBcBO(<$uTc*_BY{r7>weRgb1U^hS{NlgZu&a zm*{0s;5t`+o!&9gaC3lI{EzWr|3BwZTUW%zm_{3_6j+#J-SEMzWE*^TIeJw$@Bzxv za<-V3P^1@Huo-_r6yaom&7w~HFZ|T{!gBFN2d^*lN9ywgyK^(Sw-%g@wf8ztYMYTD z$sxDcV3U3$KIl9Mi0EN-9K}Y|qbtIGcqE<&^yx3%@(DE8VX0hrTA?{upR(7dujDZbK4?$FGi?L@-yIH~okAJq|@zE9u z7d$>~!{hR;5-xb0AHgF8G2&56#f51%rIH@Rqje`7h3Os}792W(4f@YIzK6aBt5>L6 
zJgEr9W(N4J{`7(Gm8RCH5gw)FD@loD?QdHrOXh;qm1idCAYdfNeVA#Oc#DFW$wg2& zA*{t6x2`DlE8_(4P8{U{boiUw4p{R}nFLO_O-64$TIS$*bU$a!xv;bh?6TY?(r8$Q z;YAuoJ~YzUyEtb&>@Bg@!2$wf()qVd7(QBS2hk@Q04$9RI{F^OAK1IO>)Pe3>xszUZ1#p!|jEU+Cd z3VD)I(?)k{OHpp??EHVA0A93k%ltN$9cn( z`kmI$ny791q9`;B3+D}lBlo~8xb=F~cNZa=kILI`=4Sv_-f&^|wkNJbuSBTf2f4Ug zchHFNeN4mp-gvY&l((oFFhAz7)P3vp*S`P0qu_bzf!y~YGo$UIB)T9tY%48L^$(Gt z{h#_?$Rfh{;j-Z|yi?LyjdIXmLVwGIKyhX&G7WsAHr&KW`}`RCTMd%JD){R#?d^k9 z@~zge36_sogva5s#v+g&_SL9dg9?pH)lHt}&XbA)WQkhRV>n0Pe;?XSn)sy`r=0;y`C`aL0w`KM@By+s|Bj2&I#p zV7!Y2^oF-dM^S6h0KgGNV|(?*-*rJ_6A_4eDX_4GRdDcVk{3*NL7)k2f{+#TZoedH zfdVx}Ua-0Uy%pWYG@HW&rVDcdeL)@90{Sw5#-dQ>OG00JNMCKfJE=uK9Tm5S__;7l zUwic)2$e0~qTF?}67=h_I@9l>SQ5%hpNR&0zy%eltkY|7kTI6L2vwN8<{|;e>n?CV zi@ZE|CwY}w<;Z?5{DKjk|M&NQ82CR7{2vDX4+H<-V!&NBtF*GBY+UK|6vvd(x{9eK zb-vn)s_8?9l-AUEtI8Zx{dKID^2Yzp4AuWE_;pf4xAAVnEA}xM5 z;dcdo!|>~Y-}8eIhVrA^SW;9nwYmx|_D}V>s{ECeuBvLEOE6;g_`vEK-$0}(D;-!_ zF=b%Q)xPrTszK+brw^>F@OcN+lun&dI^A10&{thMwR~Vj)znIVnRj5pIBykl)p`e3 zpfJDp+;WS6(p>H_;|ebsQ{ZtGj2&Be$vD^O0=K7dl&jR|I{iOY+37>FvoiX-YAU^@ zbzWDkx2C$*=PGqg@lSX8s$Iobk0Y?>x*Xm)HQuQ{Z<(v2%IEM^yYYXa6;$J^Eg3Mp zqON3GWp$}9C6zg+cwIo1w?9hqxuyeC(@HDryyv=HDI=@>m1Qnpc}1PeTU%RQ>ze7U zs{>+~rDA4HZS^c~*?BIPdz!1Rx)WekE^k$NY1LHkOmCGB;9lkT*7@v^raBx)hMKAA zX{sB>UOjU(+Y#=F|586>Wu|p%Zs&fmpo?@!S+##kr5AlNO1*T^FH@?kEC1KKbVw{gOR6`|$dO&_%k1I7g>Qpc>H^dTXn^m9A<2s;NFWTyjkD1O%lzB@zl+3@Mny+LEIQ;C=+A=f$z}eNcGwQ(hy#q}?I<{h_ z?RAuQ7AClifm7>hrw-)wHt-_Mb(eUj7tZn3mYzGcrp7S~Jh!@b$Pi3EwcgUo%Ic{l z6;*ZKS{w5vfq5#x1T98u5*@=F<7z9wvcZlqEun?v?2>kNg|FP@nC`9e)?ym>RhN|3 z)|Osv!d+2kl?k5lKRyMaL5kSYvIZXxwH+yckIRqC2n0U{ev z2f@T8SaDTjCgPWgo~TYKt(xwIq~e`AV~DGyq{@{tyS!p*xeIgKnb-JJUBmNaq8l)r z1pu4T(N=-xbIc&jK^bY7U~Bz#6|*20VP5r>)=u~O3YkRWv&+1`(u&F)cB32PRAN*C z5}%$4BC0I~)u=%o8=uL+Z5A>p{j%E98fze7O0Tw$t?Jqmck$>k8Ku5#h$s-w3dd%a z@Hb{`N#=CFuP(=Tb&agu}2FSB)$`mvZ=qLnMeiR>40E-y~d`uZz_+vyc z`XI2TlZ^3C+8>5?kQqu6C$vb?Ro@|&6-(zGVgUb7;MYQAPRZEtKVO#cW%m@%%uHY~ z9G+8BS&6<^-!}RqnzMf#&9*X_zra>oRH-BX9sFBp28#;+i)l+ZJX5treMh$?ny77G 
zX=!PkKL({`re&pNrwvZaNy|-7OHWVFNFS7*nVyxNojy1{Cp|YKEh9Z6BV$lTW=2*< zcE;e0oQ&K-X@k-SWegfLD05KOpzJ||2jvXP%}mQo&&y*i+jxgg=N>goRs(Sa$t@HXy&aJEV*G~18jKxHXwJ9_m>RCN) z8lqLoBbM)qDyk}GmR5>{H>SGE>liC~6vv3l>M4%%l~5+)oB2MfVye$E+7FS|>nNCA zTH$jPUwzS(D`;(*2KjmD7 zMu&H1jqmDnXL@H=*Ir$s9J8u0QRtMRpi+p=fQhfQUdk>d4hW!i<)t-VrP6Ug8B&AH z5X)_^C8bqmOot4P5(ulMl_Eoz)|Qk(g77-*m^ERs2bsur%&PFt28t)V@k2*Geku4R z;kWPgA3C<;w;sR6_}ztHJ$~i*72}tKp9{Z4{B~{rp<@evE%-f*-y;0#@vFgaGJeJQ z<>TkTuLfnaM++ze19L+_6Qb9_v&=ik<(B{RTuj5iBU|rKzu@;(R1UPN!9sSZ%j2z@ z4pCeRr#R&j$sE!;guE(>?g*=Ml~t69wu!40s3yu>Q?VNGc`t($KhjD%#3jHTF?WISm|DICm zv0v=(c>kpR9iQU&41V9>7k~2p4j+DB;&+j2e@EG=`#Z*+vcKam_+`Soho!rXR-Y zFH8~;xMU$~TlAViz?u@sgo9cOXfP#j?JR1^R3z!NqR!0n-)(2kpu=b=;R`o5(jFJo zrJb6N{t^k44MtP0yAI3G|CN^I9+tW&wq0yf1(dnruC&@?Ys-JLhcYq`QxEaK4daR4p`;6rW1C6XKS& z6SM*nuCxO7tn!Zfu|mVfST!`D$j#~brvRbqNH>{#jPeLIbcP`s1c}d9+OzaQ=G3HB zSC)BeWo4-H4;~ARa;3LW>D1U1z&N%fhi?}`peoERQ#5XStSi z<*vl4&$e(vT#V8MOIieGdrGgyW~8jdmzBY_nhf`3$x zfkpcWuG_I_AAA0fts5;YN9k__89&mR^w{QP63}#8d#r$PgAjlQnZcb}WcL4m6#pRC zf4>V32iK^`hHjhA$igk3hr%y3B2myDDT>1cZLb{f|2BO?OCw|#Aqx_)n*2$VK!L3R zP-Gk~oeJ&_g0)B^qVNY;(v(KY4$UAtIMKZiby!zSF;e`I;ir@O5h$@kWCzHGoG!=o zA(TXl$PTUo8XW^m#v$+%1cHlt_aA7<`9=!{DPe*89%35` zVA}e*6MO|L$Rpq6A6N?{Y zmA@m?8$vM;YdU2^VeSMBGRdCF?NfDC<<+kLN8bCu$312L|99s7zWF!FbV_MZcFd?QD}0w!1>BinOwHDT*vziXtdmbZt;{DT1OX^1<4o2+DTl&-?d0 z_x+yCWHPB>_p?6VUml&l-gDmfocsUWbI<)V74svq%djGBwhOv`N(opbt2^OJv)3Rz zo4uss?-f`0tQqs?&hKa@UO1K-!c5%aSY+9u!e!*A=Ge6ut%u60DTC&)$){}AnP*O) zCp&)3Bz%xr2l-EU9eUIz*3k7{I^q`z#|yvlRgvqF9=ru+a>-ZNL9sR5II_83)6 zWlzfwAnCb1>jnzdCA-Mz={V8Zt~+<$JeJ(pmdEcE&J#5#o1ACSoGQWtrbqX}lH)kO zjvy@Icx?}r$NpXPY_$LLEi#uK!m=gMl-Iy;MWtv)Y5YW45n)R)J{3{Kvb&6=NTWYz zs;jD}hEJV3&opSm%zv{gCd!V5|3t%%EfU3VqM{`a8QSv(5#4Sz$lDZk%rW63{{8#R z;`(6OxpSt@p3Cmu*^ybY5iq(zU9{qmp*cPGQ~n_Q>_c;Um||JSk^TAqs|{gTZcld0 znx<;>!Vgg|4;#{xo!IIJsF#Ng>B%aZrzn0{dYSzvJ=DwSH)A7s>s|jD`zh{3Mr?=b z5j|2B*zNyObrw}$UCmsKyrDfx;t#z4<^G@Tf4Tp6`(KaNr9JDsY%Z3zOcZBg(=pjA 
zbC=3oV6)GeQ|Fggs&ccm@_qM%cykJ7q~1Jk;!KQqv7j)^+!I8z<`{FyjgNB`<8%15h9PS=m}

nHejM>S0<1F^|&zmt{jzs-5dXlRQE31-CAXt&CG|uNx8cXbm zWDR)XmH>|V&e(G6b z=yuNMG`CoICoQe>JF{ELi(-(R2AhSwJVoqs$!i!BSq`BM3Qv^NX0s~h9Msvh;rcDu zD-m5aG}BFXUYWKfn-Yw%?xld)DCgs3Cy6Y-itwpn*CKTHUVXRn>}WWysAx(kOIaO` zOWVoibI*~}kY;oLfP=kb2qVtm{Kv88got}G1nag+#Z9^0nMp->s5i4V=P#$Y^KljL zqL441eLQbQg^D^)!fK#7zZcuKVP+Sx?HZMH4MEMHBPSd?cG#S1j;_SGTw64r&85gz zMb3|wpF2e=(@YY}Vq9)@%Sa#38KdG1kS%&R0%SrqPmW`#%BdA+vBECL3|T9m&17l% z6xUPWMHJ22)N(Tpdi`$dv}qhf43|xR=_` z%^56{y4Zg!D%zZCJ~R20@X6&9=99*!d0v~dg--*Y)qHCCRPib0Q^+TWPc|RLXWOsa zoJKwkeCqks@u}rg%_omfJ@R6)#-0u_zEsyIcBFXuIF`%4hq1*IMvW|Y zO)vW9gzqw9Z;|Nx$nU;jVk=OGoR*k@+^C23d*|SA)5VHdZWwox_}QaJdhpPnaB}5Q z-nNukXBJI5iea*1>g?%L&MunhE>Gl7oK{{j@8r=_j`Tbb#rbmzDk9U$k1U%$t(={* zMH4x7A%DZg{aKJ7nZUM(f>SuIadPDm?hd>nS)86K|Fep4!jyt3iIpeq?3=$qcy(1hyH--yGK5gBn7;SDjO# zOfmFj3SG{jnzC~-?qy;ZVT10R%0Y4s$vH4{^!^6|4uz!5yzlaz6}rt}t&wJSnIMPn$Wbyo`9=9=bm% znc}8n%Kwf-F*-4lM#VwB9;%@4)(_*>XQKA{;8^_1tu?MDh_5WST%!B#J+1uTI`6{-Qr?|SV%8Kxq=0Si^?E^EiF z2U1#9l~XwNc361Gf3)d$pG11Vu%6U!`P{i@&5uZiMtlB~aI`&pdIB#JZm;R#`P^K@ z@WB56aM7u9)f39Z_%CrEO`BU@F6TMrZ&u|{*)8I_LZ+PUt?r@<=5brg^ol`Twe??Y zKSPK2bcF%qW00izT(8j&@>ab6i;2%2+LIj@XG~|JJ3a356s!T%y6Bq`>R55%bbP_J z$ds2|C$lQNJtw%sppiGEw4<}R_GiW*Jo57~g=E!|ofvW7{(G6|iT!Uz2%}!4&B-4oqM6}T&{@POJR;md z`ES?Dkw^AK@0ct_)2vV_a&sb^;6xh@IQU@2(gW>ax|>@WCuEoqqx7(2|GQ(tkep#X zX?M}XxZGytEfG%fZjLlr`fuGkbwm&6+T5=a^=8KJDi*h;HHV5 zp!?n(2@W@FJiB(m|G#$sh!i7NVtap0$FX1f{KQZ6t>DpIn~}I)XXxH4VRUIvR&BBM z+>z0{>Vc&ncE<7rqnOF6tUfWdtT5ug^S%oEaVdUi@P7eGck=}k&n8Z zCCgRFb*Ek70c91HNyCchiP6jQyp=#%JFj3ddp4WdS-DeT_ineSpUm6(;Dul<3^R?5 z31zYy-lWuhGi8uW(Mn1@UEsFIu(zb>8ig=2v z{sgwvS`wb!0F;Tr{$8=|Ar? 
zhI32NTYT=^M;0b~A-X93%U( zk3O3H-{#iL@>#R-*^Qtb9es410QO?wYX63={%vjsO(=orl5mAwAH{~ho=;T1Pq}xB zRk_(^l=7~gGCH>70wS}XBCSDslG|Us3vQ!bxwA^|7?)<1RmqK+IpSTH=Mi2Fxi=pC zw}+?i)owe=#9aDiG>O?G=QYxf*U`S13_G{)oJ3YBZPTqsp-7kybY0Lnch0n#^XAT( zb)HNs#OKUtJALYG(_71o0vyC7DjI0ITg&lw!w=46FvawJ+O<)LXc10|$!@mkWhY0K zyj)i1mB92WSs0qpQ6R}G2U@548FEClexhSag34_^uDg1gJ@wo(rWbUIlkHyL%a7z5 z`GZHY#i`si4hnHwinZD?%q5A{%V&4#>|87+iH4)Q@U;5T^EuEm$17Jj5=pz{r$Q3> z6G?`@R4&Lj#v9vM!j;?=ZS1!a*Va9yV)_u*#5m*Ja47?XG27x^b}ku&(UrtTaIc&% zRqE#)UUavYu^-T750`c?;QQEBUcGfIoR-Vz#JRV)5Tj%Oju|U*b9M=Li_fYUc{T&K zxSx3*HGdWxbUXjaq;}}x-L{d0_S>0PzPbEyDd}92-^?%5=8o}uh}VA-g~SXDR?pG#shg~&*5O}P0PGOSaGUFM-v{~sFF&t3V8%<(VTJuM~?hK5YLI$DxxxEL66rK3A{ zVKI5;+_JD@q3{e>wI9zKw7XGJ4B)sD=Z6|NN9mrgHS?2s)7c134BUPeC(+v}D;tJc zgMh{SeK$Kg&NIoyfrC3mG&nMA+F2+Z*G!wpe9D`Lc`Cx>qZ3E$cKqlTOk?8H`{d^- zm^e#slp-=kS5Dtm?pk=9M7K}RJ6D;d+%u+g964IL)5|($7tvhDaLn1AZ$LPS{XE0w z&zU=q3yxU1!(Uy*Ub!QunERf%RW<(iL5ijkcnHG;H+IkChR%wjY2x{rR!~rI#>th# zW#Pwce&d1`b}8d-`Gn{ybDu+j=&YD*=p6}*LifD>Vdm1t%E@!)JehK4GuGfEh z%r5?^A4ma2PgS!+82hBC4AaszRf@`FaYc#Y>0UGCj?iwnXt{kKd%%@sj5Jjyzb;~S zR?nG!&Xnk>#VJ!O%|-w0H`>Sg>Dt}-@m`!np>tcDMxIMAYjJjh)nKTPQVrL)IE%8C z+5)cFn{YR@a7QF_5O5>-3b-Mx)casR4vv2XR)Q%vwmAI{R4NBt0iFi7{zR$%buG?h z+VEsBJY1S@jF0GHz*c223Wsw-Wyu=t6Kazf4nldyA7dOQ{-g4Y&^M|0|{D z-PPihGkuNJw>X=?Irk!;-!xnVZU(Of7qCitJGcpa2ps-vrJe`ZfS-Z=`2mb;?`v_E zfM>03aa0B0?*|#E?)Wou<}3C4b(9mA!>)Uf@~h&G*oVmH0;Mi|xW#Dz*Q{@Gwu7(y zrNznqEq5F|(&DVGR%)NWwm1VXWb6hPfwf@6LViTLp~YEw5$OhN7gKMbeKEd=M=7s^ z`2INgI!>v&4J}SH`Dz6F%fR_Wi?ajF1GkfodNA}GrJBJ-#GChIi?i-`@Pow?-&3>? 
zM>xB+(0qZ`Zoc199;Bc@3ECd_D$zT&00h_@^pxRD*0z=>$Fbp<; zIpAh658MWpfURID7@}Mv;1;kB+yOQUopMqiQ?FnSSO}JaWne8>4K{!)!DetB82*HK zz!Gqklqc8(=6y;#+Y3H042GLqoJn9kSS#N@r+y0OoR zya%f6R%iH+kheF#X9b4)!4Kwu>%e+&o4o%~tCJlj{Qlf)3^pFXy~tqBfykpBl=g3R zHi4m^@cUP^htL4-P?mN9uBAQX9Ks#Q{fU2It1}Ud4CXhqz$Sh>YYW(LEO#sa7`cu=x`5 z{ZsOPHTOQ#pKHNtun}AZHi7HFW^fZ&vI02+2oLrFBj5nA7R&?dz=>eqHLcDfumM~P zHh~+3el6(-Yr)XL(81wg&ULNMB(MZ52kXFEupV3mHh}BEMsO3@1U7-q;0{o&&#~tOpyx25=|X2xcBiykHn?28V;{ zdg=`ffs?^77y)y@MPMGd0xSX7fTdsq7y&nf)!;U;7HkFUz!3FY50+A&P2d7BbOUmv zp24+X1gsWY?MBK4tOK`!4Ry!?n{Oh0gTX(wI!nO%TZkVlxt;pUAsu(n{sr$O--36; z4~Fg`e?yRSFXaVB)>02(_yOtFe^Fccl9b`TaZM1=oOTD{=*!XfL#X^&WDMLf$s`z{m%* z2eAG_+8G%7nDatS#;;qQGO(eQbb-pD{f!`> ziZeuDo}YWN!B7Tgj*dZYCihZC17|M=@C4f{DV11 zlh1tOJ*LfB0ER|$7x);`2W|r+U@KS+hQ^Xkus_%Y<`u$Uz1DNnHaHO`q)&$ZwNunyb;)`Q!@22kZ82kZkjfdfGGI`6>}aE;U> z*d+D%2I-;P>%mQwcjH#tQ33gVv(1?VmcGroIItdU0M-8>NAMl;Lp{`i6Ty=As7Fx! zopejRfD6FT``r5uHvfaOc3|TNZO%ICCFdjZ4K{)+sh{fY$OY>@rXI%e{gXDQ9IOYc z!P-x$2f=3I9Z&i`qaA|v;0mzzbMBZ2bH0GT7&%}e82%^i6jc8r9bhfEP2RWCPmZS? zz)4_z8~qIox6@w15{LGD0`aJJCl71@7l2K^c4rM3vf7=kV71-uglSiily)Zq)&<&~ z+!FGY+3wVVs&~7y1FYM<-C08W3GLbLYzCXM+nxTjqwwDCP8k>h)hW=yK48f{?al}= z(y!gA0>k^YI~%};AGJH7Q<1lSyE7SVKA_!ME!dxUz`UQ3-k%ZwLG4ZrSb7NQ1ZxMj zI|C*m=P>xdhC#>!Lpj7tJB|!(cP4_hha(TH%WZekP9xk=$OF|+Ne37j(e5;W%_B)C z?Ywa;`KF!M7m_cqx~Sc;e+mCM(hoL|Z+9xeP)WP95v-ov?({ny`ZV~!&~)MhOU`U} zwh7LHZwhiF?M^Kin%C|$f+ZE?yOjK#OL>8H=e0Ylz^31{JMV$D3)`LisqigocNT%6 z#pDkxxtR3O|00*RJL|#l(srj6tX|&k%$!DhxrTIu&DXa(o9LH0w^D!8$@iU<9~il( z-Dv`I?rnF5pGiLNZ+EJ}#&x7i@Nw#K26CSyKVWDh@qpD&6P|uux~bhM{}p`Cl3uX! 
zIqCzk1oR?@fU=x@(7y8TX&Pq^iA%79#d7bZI>DG2;KWtxmC*J6WS?Qm*zMdD)_#eMxCE^3 z<2Wn9@E(p+4>p1ugubWaYynI9I?i@^pY1qG@&on(Yxi=T0bt(Vjxz#m21~%YeH^C@ zEa~SsRbXge$Eg7u!8)*ZKgU@MMtcJ}=7M>-@PX=R$5{b3 zf$D7dz&>E!PYDOsfs=&}RtbHC<180CxDHguIL=nE39PE({YcVx4*3Tsf~DY0Ff__> zR)f`GBUqR3IE91{7dTG#xwH##AeaZ{fhFKXuoRpLM!*GNHCPMQf~&v=a2?nPZUURY zCa@XY0je>MlXf2QgZ;oTm;>g3`CuM62`mB2!D?^`*a)r$!(*u@FamA`Yr!306FA^E z!~>21RU!2chQKl~3|4_TU=5fD)`2D9TCfgm1RKHapeiDr3kVPP2TQ>`FmwXx{4L)n zzz;^iwO}>45v&Ebg7siC*Z|tULoS#NHh}{{6uy%OhQWzo4mcC6ok%@Y^S+dPU4Z7j|<9o-bu<=74q$qnVMGX!?y;# zAD`Um_&|2!As8AH%pTV(a86paI&SZy4;j|~0AUsWd_JY;;41*5e?p(kXD0MK;rISR zAYD}D;j__kv?V-`fEG=Ugj>m{Fo9MNZ6dVZp!|jTY~Uk4pkh#s35II?CkMkbGlHRl zVA{w4^#XYtpBlm(D`EJP{I&8~2kk5&czlJW!IG1MITNGYlDGbphr+k?KwP-IvS8uZ zU`bJM(%4|>*x<~u!N}NP)!1OQZ)|XhuP|8an-nL-O+z_ys*n?(?`mk33AE+VA_=tB z(8?2N>!Hm|pfy4(OQ5|6trS{N($Ali$4+RI6KI)!e259OFtmvYwBgW75@?0ciW6v) zp%uo_Fb!1%+7A4hs6nsH#|Fa_g9Gw|Ir+id{9xXgU}1i+BtJMQKUkU{oS7et!e<{|1-Fyqq=7}@A~s8hG(QtDf=0W1j>D>e^fBL#u^z6FSf@8n|*#K zBN!eT%pMgCjS8j}1w`&-KEp4*f6=uG|`belUBn z$X#grf0q%=9%%#?2axSPy1H;69aqY?1YWltRAdCx#s((P8YJ#w-sg7nUh-7Vd+Dz= z!e0;!E%p<^LMuNQp5aD1GMH8nDDQY4hwc)-Rq%;#yDQ&W9bt>&_(s832H#A|F8fvV zo0Q=nF}x_l%l$>c>}A%dV0fur5FAjWM+S2ir;H5dE=;BK`D-(z6Veq25Q5Tni?%pW z37wE5JJU&^=v>h`;VmuBFC?C0dKAxf$Sdk1Z>%d%bWR;>AZ2ti`?GSTNX6w+_g=b3 zNxDg;m+mpaI-h^GoA87Ljt+!LpXi+2g{%$nY(I)T7#5~3PN~sL?Pb>GexWZX%zE;G znWpSN5vEi9AYEDl^)ogQEA!Emv+y>;t5|c;lK(9>eR)3E}o)*RZc}D=@w#hJ~@Zid~Ehf-lsrMiV_GIw$`@)^})w_V8$ah$FtQ zm~vxyU6?vHSnIbkx=*$WBeM>fi-_|=i8EE=to6tD7tu+%8LTg|?j-uu?JKl3qsvb3 zXyc<3bdvD3B6|mXmq~i>n`O%1)9a>8LtiB>@RT=zPX*XSm_7n65i-mf?B<3 z_pVlzaM@|tORCj*M#7yH7tY`A1}hAh_L)bxQh%#+8a(!7!cCR*&XyF8W6VyI@{zG) z4SnxE2{VE)Mz6;7Ayw|5(^6YTDs?-uI!u z8M$B1=&{@#@ICfhPwtcOCCKfQj&6#|y{qSPi{U#jD);=Za?5$YoORZLQl>Maa>sUz zFVf!_(WJk*<&Yn6Z6x?=DC0YY?-=+hl8nLe`Z&*DPq!(0oI(9btb|Qy%4g}r&tXbE zA55DNK>ZLrVSFC-_>-HSeeU_fU|zg_DW$CYqhHortgl(RkDBu}2DdS5LS)+L9yf z{TRZbnU;w?vQ+G-8ko%l2P<|hXqB;b?2(B$a?)ZEzThlX9!tv 
zEKFIfYwV@gGUUtnxx5$l_`@yE5j?wn<#aQC=J0g{f+q6raY@UT>0bLo-!HZ%2J8J+ z0g-z(IY#0Xc@Z9Jqj5e04*{h~OWwq2-(a^ow^7cjnhh38USZfauE3U`jClhyS++}My-oD@0j|vWvPb4( zv0)dcFjKU4CwYlQUtsK`O5|+HXmw_#BIk=TuUtoGrHs1VDq3bQ)io)F3sbEN{ewdO zMHx6j#s>WN4RX^^53ky-)p=cb)4E9m3PtKaVqIl*O+$WQ5h8XXXT@Hvj(>0DSgxF1 zjz!Jd8e<1{ij9DYz9eN-ur_5(a79XCu#W7mhE~t>x|9jQ2A((aX-xTL5H6cEB&jB> z3b7TdvuKOMTAizD9QMJa!{}qmV$9{16vgp1DHO%|{y|}@w{MWLW~WGj6b7XIt%rBf z_*Q3-)ba6d9%}q2N&EX1Q!VQzY9Kq`%%4pA6P{*xs*i7Vej_}GCGgN;K8f+jlt-4V zRsSIMHKEn{Ncw+Pl&3`EIXzfyTQg%kCLS3-HsNPlaw73aTQ>UsH_{~r6Ex}yVr`bR zAiIgZQg2IrK0#YZumO#3VRqupFHQ?K`>i`ul9p(Zq&LiGGcr%DYH|L?v;8tMVT#u{ zBcPY>PXD;J#W|F6kT{pgTx_X@qoT$xly-bVFmG0{&_98ZG2#mj7#F8NWNgYsFYKJ# z>ik6FIFC4}w}t*vqZ`7|HX%PP!rly?-TX8CNbrL-6Q)05P($V~#Q%05@&JwEFn?11 z+0gnyV;R+JAJMsLRfm0-k5bPbZ3b{N_b-O0FiwuhE`^p4jp4-n@l{pwUmi5$Hz*8- z#*qUFTjTL-H>j~S>OB7CLNj%WZ*Nv`0P42fXp97(P$S{jB3s@jScnOzc$F5=QoXRS~AB zBTPw`Fx7-vMVK)iVUF(!CZ)}EUC(vxE zhy+?Tv{vLt>q62o5ZcZJS{}3=3ABmOniFU

0o~Er7NyfmRFcy#(4SXiW*UbsuNlrx?)GT zd}s{`;U+;_pFk^zwl0At`|{Q%(3V51PoS-awkCnL9@^>zS|c=RPw{Df4_aLUZ6~yq z3A9YcgB1z1Ftp_fwBgWd6KI9dY7%IZp)HA{g(YmR2XbHc7@Jk3kKQ4UEFM_!^uV8sB=2&GqaW{PlD?NDK5SlZeu@3<`7UJgFg3w&8}It?-b2r1GJfx% zgW50svPV9=Qy5|~qcG{>osZ)KVWJ;SpKbWH=fk@hc?mwe;e^>ldbcq?*6{4wG|@4u zd;hA`gXkO?kLq}~Cm`FjOJh^Ze0Fk&MT7k$dO`NM))Rg|A7A7mwyxR!mRUkhKh%NK7GCitVq2AeIbIJy5u+`cNzrxcl` z>_J_}v)we=@r?#t`*7SHTDQ&h zPBOcjvoRA$;%_btP*XrCV@qhY@&gDy8cjv>}3E#H!J-$sn=gXo0mR{iT-PUuy znec77(BoU&bG|zGN)~y1(|XRg1-^~!IX2~z*K@uQ{Wgz1%7$;>p7Y7x|8+||zOPT~ zaXnPSmvgDd_iE4i*1@;>GLLU<&-t3+3orHfe&2Jx0sAl>FZ1~3^o*~B_E`#FsMgg_ z_V`YGX6?+Ihv$rQyGMtsH9z2A;F3E0(6Yo~{9*&b*hk0|zlEz?e~w?!xZ8}**-V_< zd4CeG?GJzM`BGv&Im~iZ9!6J|w>Z;zR`g4mx`_S__2ct<_6qZCKLBrR?M0S9SbGwW zhFKglbAmj=HxT|!34cA|Jv+kqA9MWEI`lP~sXt*NgpXW-{Vcj}aa6vj_9;>KqmjFU za5MkV>f9xAmvxbQVuxIu6C!sb;kOY!!zTP$@p5s2N_z1hN&cD%*T|muzl+>+yU4}* z@Mg@O-0;4PN!PYI1u`a_5ifU?D;FP~#9v6bbt_w)fW%)G3s>sD&kaZIN&FGQ)v;H6 zr^H_#joz1(RfKp#VH`m zSjMNY8J>RFRW}RI-ccUczf!QwE?BAyYEtqSruqwEEfm(#0e{sX({@7pDRsfGIC~@M z`+O3HLrRFbv%e-~sa|GZZcT(foG``gbH7{Uzd{&O=Ozzk-4?rHKBFrH*Nr(o;NQV_ zCfM%xf&j);%*!e6B{|xlYLMFp8;5N>%Dxb}QD4Z|8f&3FZ&0XQ(mK+tEw3Z2k93L+ zV4v=#(wMaktvn!*^xa#@ZSOWYdelCXEcsFN3K;$$br}>t#;^~wSW0TPBjLw-ov&A`@XS)I!1dn=K^Pts) zDQoQ1fW+~OUBz)vVjT2tS2s2yV{|l*gSy7i%=@O>Tb%=>pZ1Ez5%V9LeB`rb$of%Y z99Yw?eK~x8?AyCqox`R5_i<%-Hl5MwqRpOgJ3yy(vvT-0-`(mQCw!lt>a~mbb&wMO zbMYO=^Lej=Z)<(4GeyefX_s%A_;TaRzN0Nz2b1tb$17UOZCB06YQDGCnI^JskIIVW z-?P_@54Y5skVMpI<9p0K0Ke4zt2dyUHzKR-zDk)9B&wyZ6I?yPp=N2chUQ`k>v)&a+m zUzp;3q_Q4Si&9?uSgSLZ_9{C5a=f8k*Rs9+Dkc6uB#n_;5m{T2wT`n$+eFqkuWjT| z3`v~k?%KxM{>-mAzx0&&jIZpbJQz(qpYgaL^>>~KOg5pFwUUxlZFV<81&%1-RKTq-^bIHtu@W1?{)ww}< ze-sz*@p0v_pR7vVBdqy>(+8r}&LeJsmMMYw6Plb^ZGc7+&7aV6p{<7&??Wwywl0BI3TVjsoGNYD_!rGuvGJF&roK=YVl@{e%t%KDeg*|y7B_9}N-?(mR+0a4 ztMf$Xbc|=_Io8djJ4XxiK5a1h`()Q;Hjprr2-6_)NUyhVlrmdr%S4KuPq7II+EdD@ z6rT6s`L;e(DaV!YF8OrV^(}Rh`-EPus(wm_?a_cO0-yNunjaV4M4)z-Di z2R~zfio5|dnh~F|b}b!>ynFIuV~WwYM*p9{9x7`eJL#BW)^|l-74llYY<12+-^R&H 
z7*D)$9?SSw$;T68=Zn0J$m{nnY|PH-OD1n@u+e9I(d`PkyLVR3;cvaF&Do9ev)`ED z*&^7i-R+$%@vluf+K=nrml-DAne@Zu)Ln#U`yAq#8w{7>Svj7y)ih|5w^c)!KMV5c1v%&&_RNR*cNVRj%-^lt&rs?uG5Mb)Xyq4+bPPmnfa z#$(fOW$wC?{(`ooznl;-|DuU*E+I?|NCvd!#6a&;bimt>z|p8v66VNtN;lwje=U>;k!Vm=h% zle>-9U*G23D)EdZp7{KP+2Rq*hC+UdWDy%B#bcPTp<%?wdG3)sN8^iM9|}PpPM8Q` z;@8j%p_N0E<|ls=ZZfo)(4uQZVQ3L(WxJ41Gge{xZarzgjA#4u=9tWh!AT>7B~(?S z+klx@7!lcIenks9Sklr!3`WO#eyRq`_)JC+=Bv}VXN2(f-Gm{1;xv)*eG|mhlr8QD zQ{Pl6<%L|nZsXmWB=5xkQH#vbA35_Z@lHc#bZm{5_xbK-A4Fj`?Nj$t^cCCq=qa^uIjPaxsm?_Cbm1)0n=B*{@?{fBUZ1#!8U4 zCuJhzjs47c%B1uDc8Rl+Fat^B9TMhl!o<>P{0C8=u%G`!gZpLn*XxM}Qk{zNpB+gA(Y&!0$pmpd)C zz&o2~i8r1P$9)G|To*U<;P>DQ-O}c~=JCb%)3~xyxCx z{~Hp4R;6#Se=??LWNeuUU*@fC&Y8j&)5X;Fbho7#Rg6yIb(n}s2)jV$V#gJEWlWp! zbP}~C`{wif?8)g=#>|2xuFb?%d%LHX;^WHkcZ!T|uhdcW-!*N{#qt~rOAqif40jzP zN(k0K`s(13vm5qpgI@X+%=ZtrHvTHaq7U6E1i%81ubF+Yk|&gjv`q(Z-J~&$yzR)l_hk2Mu>EC$`20GZy%N10qVh{0=1vnSm)8hm+MlttWAls7 z?a#Bd6ZH1jZftGzw)ER7;u)~M%^7SH&-_mDByZ=G*ZSo14&%d>c9eGvvKrc)8$H`R zUME}a6t*T_)@74r#@TdS{>l52sFH?~5UzL{fr`v1^q8e``cyY!P%Yn4IEsCC~H`cD!5HO8UAUcq}yO3;mP=%JSrm z{eJR9mv26fK1G|HBW?1V{oZqutAu=$wl$<}_+xF()5x;#9_5uC^Mm;FWd))S3f=Q% zCDujBhG3$OwhLe#2ku@1vtN+HGR6UVZ zYIVE*BC=K^Ys52cJzW=Cp0o;?fr+dg$jaNqomzgrn6YM8YfRQj$;+$2 zPrBsJw4Kjki}tJ@Qi}v4nPerc#-** zl-W16i&ATRa#>@|I&LMhN;mi1=9d06JP&^kJU3D{_7nyaQ*XvbNIrI#T6@LPfEm#- zloraV)TC`z`e6vOV8_d``b=_uDaZdsLO;Z>CS&Us(wO#Yn{%p^`Cl2RO&X1^Om4^E zcm0JVvg^QQ>{O9Ca3uD^Yi-Vc=rsExWPVfGBeN#C&0>6|ru~!0rZ-sk5k1*$yYR-Y zh~>n}G2Ub>2#C(h!$e%f*o8>*C-%SGL09`0a~Y|tw^4wUL%g2nl$4m{Vj;V85O3a) zJqrEJopff8!y{2X+HA*J_=xOmit&*nnv#;c1go3coFMTfZ5J_nrX*PEelim2c*yk}>X2-^{VjFd}r;V>iWNt_1=J!3Bha$7v@{aksk4!Gp*qqWv zMdy@ewmHAcB%KyAjXjlke(RTMgTnr4gOvYiSvZsi6^3^KyrYU*+`2gu-h{nY5~dH0 zq?$V^sYb8vn5M_=k(V@VK+YP%G$GgS)m<9wXd0e#7cbp3NTt0_8d9Wvbd!b^q+wwH z?$c088VZp!iMs}0;n}2NLdUv9v78YtmbuFKj=74sq~|BAEbkO5zR7s2mH3ex#{ICF zIK=*N+wtg_AC?_wUJLHnr?ZtbRc`m%{`nYBJ#71Z=s=UXC$UJ%Okti*K_DJ})`nZVc`uZMH+_)FCsEyBEu(eI40VX2RGp 
zR%X4E(vtQAGq03%?vVU(4`YM{WBaFFWp+ttlixaAcP~@)Odb6MpW<%uFFqGdh)1Y&_TL+At+wcOS@Q3m^BBrJ9mriHn5v?j|4334rzBzlyvVKO?SdPIp)JM7X|c->9=?T^#r+MD2aYH>uwTP-gxZP zA??o1qVwArjg#r0n4N-t_E}%X$|ycA9MViGlSC$kS4>=CI`TyBk6lk(-(b5KJr|0_ z6K^b50!6P!C8<5dX>%^Tp! z?Dzd;N*S+0WARSa~n@j*5Q}z&M22^;!}pqf!yDFhxEf6yUH}_kF^oA4^XsHlDUZKMe6Cvt60qcoN^Y^P*j(V-2xTHs5#Z?WR4TN>%kq=u^2*m~_Uqoe>R zu>O)f+pe!_63hGpCbTAX&~IP)jq=6? z>n!V&fU-B6AOkDbs=eN|lJb8XcnK}7-WUW4(JC?Cy+`gHt&3N;Q z((e5$m9VExq`#N8JF_HR_^7(*-1u?2)PF@Xolz9MYo#<^yISeV^WgeV<#jkpj$e|jy1@fcWjHZo3yDb zkQuD)v}ZY=cpC{58q(&x$g_PpVQ6>petp(V=XHnlq~~KS?r!^&(M#Md zmSVjYN~-EQ)(TpYSwp-JNqKIOcxy3iV|VdMIV?XFKkxYN=YFI_<)b&YPi=Qbi48A5 z3PIWuZT~0YeFfC0@GXGP=ALrno2W|8m*|_2vREte!8c3N{WF)ZOCK@1#VZT!l54ta z_{d|FQfc(xZ1Q>RZpb_enNgd+FuM1b15;L82AhAc;2M^{8si_IM;P9Gc-u#{IPH@E zo$w~i<3;{V!t9*f=6n(&%+~JYe;k#+&f92h#q~4cb;{*T&IWq0L;~-mz~|;tlK7EzYO4oFzErHgWKwAlI1$WlR^VUNfephGO2561Y;^W-{ZNc50 zY1^Rlp88qf(ZW;n@feCY1Rr;s4fC81IptFZSQ~UFj5WJ*n2Vf}*2|Btb_Te`bGVZXlir z@!%cx^Z+_v?!%Jff8)$v1LODKPPiq6yV(mD-+zn5S413(xxYMbX$sFWrWEy=?WbUZq z4CQ#&3E;mUeXv62`{HB_!&d^|C+LDbdA8@lm(W*5b|qno&-7%U8kOyy&m1e4K8+Cv zga5&(Y#9JW_8RzBM`aI5E<2xzanpnC?mlayht1v#cTKMj9~t>zzsCvj`#q$6EP$`- zp;-HnboQf9uAPAmCuQ)$P_M0F*L3Xt$@6paIM&vfH;OKr2;ZKhQ{>t=!57SnUi)kO zc$K_g!0%8D6}gx4KB{YrmU5j2oC%T-ElZ{I6cOCQb7>1FPNuT^3@$A8~J;bED)0i)9Er&3X z@^Q2osApW>1+m9v$;+D@EqCUE`G{Qj_0SWIn<8f1AWxFYz|n@#Fsp2}eIIo!{=f zB=IgMOth}Jt1bR4vGqKsar}=-wT%k+{|B#^8%HVp8RYE+iQ`iE6Z(VXZ4qI%@;f7$ zdlRODFeY!^u8(2cSk_6YN%LlWoMB{cLFP2#9V~6)Q)DLCipKt*BIglfeI;H8CHOXE zf8te1W2Eg!9twYf&fqso%o(c7qj^X+2Qc-6_rhmIlF!eL|2h|$E0MXG-%XZa9CQl)h(^Y=QR*@-O?;?YrPjDEDmSM@s2iC0%WX5btVum-`p$ zglG8^T3spQ9kh7cU@f#9XnlDve-dtEDfS4o-a_C{Xj@AuBWO{3fhoPwIr-4$$@sH- zH)R&zrt56~Uz1isk-6(QZz^)g553YWtC?m$g2X?Wcat9Noc0K`iO@16Jbz;AFM?JA z&D_D5xOWXBHrw61HbMH3gkMYe#>d*72Yq}=49{jP2_N0NCUyEA;Y&BrpLll1Zlh1> z!@Ic02j|29g0J}{ze`G;zR$D0M~;`rQ_WofOYLzrn$1cR28B&|7oR~NAfD)WABLwC z+Gc1Ic$T)b5FHt2vucR_o}!E-%p$_5C-}`9zzu_0!S%DzK+huJ^6Lv|RlJ&o~;Fbqd=@pC(dmq^h`Yfoof 
zc(UDjUBVwY*p#C;$HULnv3IM{$6oPHZe0eHFg!cppG`R)M4j3HL{)JJC3+5>*QWex zNXO2=D)F;qsZO#7~U5eRKf<;Gz zkoFxer9bc+OXu@!e|nggKJHK8Vpfa@Q@hdoh^0@L5WbZ9n_?0Eal*&ODe7jaJw|N1 znBQj%%g>XO#*Zx#MqVBAwlPN4@oXj|JZ7sCs1lmSu+o9opHh&VwR%qLx z={Q<5wD+J%JCHvKXP?P`q%S^PHnha_4TP4MzC37>zIfh=&?J5Fw3*N(eetvf&?J3X zk`VqREw#`jeevN|L2H5*?E}*1*FoC~?OBB~SQ zeKqgHKqj^?kLHX%wH2_)-w0(reNvI;~SJIIOeK~Za z-=K>e+3P7hlXGyPJPPCllT|eG!7k108!!{H4 z0rZKH+t7Vj(e3@FGoCdvKbG{aCM>$e^Rct*-5amRu#8mtD{L`s z+fBc_KT_JncHXUfp|gFZe#O`ZExwKQfmRPKzAX-bwg#HaQ{+$NjesWd;=`3d6M6Bp zvLy1VJbBVj|1^!bGAWEbARh z=2qN8{tlC^bC_(2ei#Ao<}bYU`J3Iec>2NY752_XCEdFsd08O*JKCKiu<7jcqj@pw zugP?*>|O6>zwnsAaO7=5p8Zz4bEfF4d{CtUreFVKgtsQ&4?Z?u z#sAJif8Gzo|LhaFa#=`B51M>(;f&A!k}KG4N+Sbd(lZ&pXOWA?nl->5Be%Na_n#n{ zHGmy+sM8e3JtKwO_%j=F-+*T?A0O|plDrO_OF#OSe4+!&;0vcZ?!B#i7g`mx5zzMH zS^k7p1Fh=Ic4s$3VJrmepe=${C^Xi^qw^lA%R>6}I^LZr?ed@WSko?(&HugelHCh( zaHgx_U0}*?1@bojw8eP_+4l3u!)Y0B_*Oxe-{)LK8C*h~-?&a#(p^7bFa2)5WC@X- z{^kcEI!zadO7fdqPJYuJ=eN>!k0gEF?xpnlVTu2Rq}j!BD)QvFP`3s6txu5`h{{VS zznCvT{4vRkk2zkxr196-5dY-2X;b)O_GFIt&Z=7np%0{4VWy4^SU-(@2O8gEd>?1lWX{5g2|#D)IA}I1V3i=b6HN>u>2H+rSBV@*(w0ZO$r`qcAMYv~QP`q0wrY!mANV~o7szxV5x zY_-~wkc+e{BPsE~<$k>>RekE$kEf_7E&X^Zzb&OVrKlz?l=Z1z{5=cSd#6wHGfqn^ z{g$N`+g`lMh?3e+DZlQfF7fHBY_-J3h=8`|udG$D`t+Ta`p~a=zR@Z$Z}ymGU1q8C zH9x&b-g#W?y%08itiILMk1f^GBSOY;_GNx`tK61C`1sF0GoXFp)Bn(_&8Oec>TkCG zLaP_NaMXT6?GHY_To}+-2GymR8n)XqH9R!Lz z*h}4NSxb7U$1VN)Uh19{Qu$G;zBZ_Kq>-4XGd1sC%KFk3ahvZUAJkj?)(3;s6Mns8 zu=-5ya(Kqp*9=zo>tFq8u)0akDLfa_FAi24dT)g9#@*%R*M0OogVmLL>K6v7Tld`e z%R%bizWT|*>aA>j!C>`8wmiSFw_Z9}?bt{EWw2V(Z^)a3+*dz1NWHM1ervG0ZU2XS zPYhDd0f!?~`oe_+G|7155DD`9VGuqzY&;2gFGo(%Q1p|p@u4}d^IP{1RuA~~9fQ>S zmaZP6uGKaPzav$*4OUO3wFjQgQ49C*^W}lP+S6_t#N~j0@DbpPL;bwDZlJvRaG-u< zu)6dxji}28$qk~&4?Q7;@w`u8ZL8b7>I|)sMnr7y+3Kq(jV8>L#HTk~k^;k9dItEF zPc!?d_UkW#>P=hU-Ag&bvM@zI)l2;~ML*n2y_N#u%~btTFZKI0{g0qpE!A>ux_%?5 zR;BCbgX+D2zO0wJC=>F9S^DZ;>VYhMGm_{C+4p!VMZWbLdnW_kKU4MAfO;oQZwsg; z8EC0j1MT%JcuWEfy 
zs=75r2!BqI=Rc&*H(Y!C%-Y~nOC$lP9Uhy!wEW~UdAQN1w_B=39%&ZuTIv}|*4uvl zx}{#W+*AyVMtoB&Vq$nnBL0Uy^o~?@qlt1wDux9Kl~2YrKZEZ_B8+dL>H2C5CBnDW zW~hN?XJVsyq3w~nLY||8jh&-QJguQGgZCf*X`POt9|STsp=zJzmTeK(E6qnwLC>%lcJs%Z9tbZf8UEYHp*o#@@cLR zUS;WvUAtjlcl;6h7v$X1j%70B(F=~F``+ri&393%`pU1DrK&3}&GQYG{#S~+*_P+~ zY<+#Ix=tU)bEDQ*r}9IRr2Mj!y&zmH8jKR_H6B49G2;6Cy|W1(&E>6&bEtvBtZZjglBWa);zB-yQd zsqNBdnr+>@msn){K*&RqzDH6tcldvjqPOm?mWXjylcpQ@R_oLBhP~A*Qp&HVYi_~6 zJwvbATRoWp;p2d=+gtrJpqKBhUS(|GTYVw}$Eu*N-AmmS)Qk2~OM2;*d#M|HL4G8p z`4NPTA-z3Y-PBt*Wvd5!L*B8Q-kh!4cheiQ)t24$`fT;NSjAuU(W|o6EqmztY_(wz z2s`)GE3(yvef8SC)XRPK>b<0nbcj(<9_9}r$jSP~fO_56_wG!!(685J(sA{pnd&)P z|2<2!rRYzx)D5Zn<)FGdO(@T#N%-w)`l^6hmadlu)HCV(5&ohK89Z;zkl}A_hJ<`M zLq^YkX2?*uBw(ZkWaBLn@10A;Gp$E#^_->ex7AkKkgZ<#My~KNyggknWfbz+i&E4r zT3?l-)@sf3y(#+{3H$$!FznKQORGj(zpT|Kwit-5wuaK82b(}6(NBNKP_JR9r>o6= z|4ZqrP3x`c>W?YtxeaDq^ox`+zv7PWZj&i zp7rbZQ>F2vAU0Z6n^V;$E%cAIekN62o}!;gQTM0p3H?cF2IMh!H27P+HkD-Rt5Vfp z{kkDleQ4{YsbXV5xfuysJ)tGxo3$kTMNuyBj{U@Ie#NhgfO=+4RW&Au%|4+KQCtVX}Yr0g;?HL!jMLp&jX}I^<*2NiWaf<#j zU2RFx_hqO*rdqFLs12!ld%C(NO<$0q9!k>}1=NRWx;8`Iny%kXSFfdO!rUb`ib<9v zE&kK%2o`rP_UVUx>SO6tyiPw>cQswI4&Y_+UejeFVM6u!CHU(wvj&tcGlDc#W9|taGPa0ryo#?+X~e8vMAwiDu8JB}Im5k)LL$`_kPJD{4bYs9rMbJGI*1 zPFz!(WqW%BoQxEubgHOgS^CCW9DKGg~*FR${o^(51d_;Sga?tnr)ureozpC@;>)c%L6HC>8 z+kE=6H1%eRz9@}q(@m-B&Qzf=>!3{%+u-Xg+UkwAev!F`t)I-ozSa+AsmHXwD@(nZ zu0PCDU#9D~vecrCXI#TvV>5Tjp~wn+;Xi!~mYt`433J{4zCEw<=?84zyVl8QPFsIv ztBd4)bQp|gY0qh*RI#t?ZS{pE&o6iiHsf-M?i#Oo@BSmcoJtb-f%hkU>ME}^v4uYH zsk3#HPt^-a;y&27e~k|<rj`dl76-cR*Sa?`_4t&yHpeI-5BeI-5oXc_5Q(O1&L|LuWZ^?lV-LA|-Jdc4;! 
zH$n;Nn!f6_-6rt=2fOD>_&sj&@#V!mHD5a7*w?pvHgW!#%8>S^)D3Y-h}K~@brZVI z$G)$(eEMF$y3FsIl*Q)_z9R+a$dAy}8=2gt$gJQ+^q{40Oi`~``fn*R35W8wts7F* z1xAlwqW2@@qgua_qTbbdZHoF@ngw-zif>##PYmI=_7yYN>Fc9+F3I*?ouWVPs~%3% z5ACU*$kdNhL_z&kUvlqh9sZ`hBbX z`Zm8>CvDc{f~VgzKD|-PxbcEkPuu$|^)GBm8Jh^@Tz8^x44#NTrTK0bt`GeBo;3BE zwP$;(dM4#{gfKn)W0q=3(~oAUPtx_hLAvffZ)d66O#N_BZOzmhf@*u_;a-IOibUov z(zB~~j_qi3WjTGIGp;o0G3WdQR%1h}I z=0WLz-zV;)R!M8Zi2NI~X&OaPeW0KA^M(G4nOEz7mRgvqpUF}WrV8b9(`cI0^olHX zQ$R1yQcnhSO8}R>{ygBCnaDceq%+@6Q!BOR|8*()l{EE0n!X}kZAuf$QYm&q={aww zc77;6aYkcrwO;FGy(R9ed#f$!`XeZTY0y8&)bI3Gf6vmd^;Rue8p_u}p}fbY@LVrdKSYkN3(WgJoP`EO0$#fkx3$?}yy*3qh*1A;vUWRl{+L+kX{GTr2E(kn9 z$-#Zl>H6vvagyAYqL%n|U5a=*?&VoC(^y9$Q?RCZB&f*_y^Sx|P4}Hv!ox51lY20O z(mVQySK{qH>TXM~>f`&XrSI(H!)df|clEN?HM^@<({=mq{Qj@LxQ}X-gkP|m{-}@o zaJLMY{v|f|3w`t}eblvLZId$hk4Ufb{rXey%rHW9ch;tLM_?xhY*g06ksbPDy15<;@KJZg2HvhJKa*1KIH6;l81_+L|SD zKFkt1tyvWiR$Hl*(DCC?jr=|@9qYcGAjafa_p?J||NWcKt3UQb4Q zTQ@OZqrV&Lqt_@ImZ$577>G@=(Q*%hg>3EO)kFT%3A5PJj|~-{MEfxHg{_|(s-8sc z4_EJK{r)huBSqgnRMn>H2Zo9n`}$C|B28a9Tsr&b!_)=6tQ&@^I~in$s<%Tze|C54 zxnWYypA1pY^wnSIsO$H(77tZV?5%GfrnZWe^kP4K#W2;{PcI&(ZrE49K1^M+pT1$3 zy8cJ{{h?~xkM!$9R9jd-H$+`JK$7#=fT2Wt&B6MyVd~+7H761NbcjUt{2}`GA?mY3 z^bJGQzxaQcy5dm8-*c!YzQ#i}Df{$LO==bolyuY&)NJs6aiD%+h}r>VsJi|ziSVhz z^c6#JgX+aY)J20bi2U9`Mx#klxz@!ZRjxs|CRnricFDl-z{QluEjacqQQ2wMT%Ogb)!~yo1yk@ z&jS0(cj_8i>m&aCZs~7ag#Fd^mVW3!G21>oP$qw09Y|Nw7xtHV&a(chCPm-dUtN-R zf$yyY)nn=Ut^?I28T-6=fXr;4JV0&CIDqFr1oVmn)TaSmbAWm|Q-64XdJUz0fXrn7 zKlZK!KCY_jzZTN6N?6NI3do{@Gg({Q(q-C)ZlNhup2M)>{qry>=trw69zMdmud3qfM|fjZ^5umy zkHjx;ohdnfW~Th|2Qw>xJZ6x`4I=HF)OMa|f*;M4Civz|X@cKXNfUfCV1d5X@YwXlP36Jl{5kHckfYw zmpt9PN5u~g^X{swc%aJrBbwmOD(`#yR@_wOU9jKOE33SVXY4S1n0L*L;i-7B)RhNS z+<2IG`#}{Ty60e&`Ns!WAmir`ekE3pL!_YJMf1$`{(MNqvsK>Uq4MROhaQPv{^U@} z@mGh+FE2i<0?6fu33BCOg7})?wbxCp_~dl&-W@8gnC|^#hgsK5_kOcu#mA<5mrSep zpXuHc(<*@cW11k(OrLA!|7$Oq`em$34`REzv-jg^72nz!){i58xU=`G=@s|y>|L~T z#XUQFx9%)|U$_fS_Ecz`b`mDilw=uuLr=KxCKA_#FU<2=-#SQ*&Oh;Tc-TH!u$GEIrsVZ4)2!__f{N%f2Dl8 
z!dD*Y8{a+U+fyFc-TUQ^6*odvoK}HX!M}If3Gdw9`{MM9;oU_~!wcU3X_j1wYaqYf z<;(Je@9%aT{uNI1(}q*^jZ+4ueq(nrBz<9b@2cth;qM2gSKP6??Bj0O-Men*8>ayJ z$Zi$*_piGhjDPX#oke5+3We}PQ$9WA`W=2+gC+6{Q-^B2TXw5>s>b`nZWXuHc#rHc z4aog_Roo2;wRgp6jd#Vq?LdCIf5kwp_qhYleou|}9A*`S-k}v=t?_<**ekJjuUZF^ ze^t!@&xdCw@b{yKUp5sVet+bF`0)8x9*MvI`O14GvkxD;5&ypb)fM>ne_maIFP?sN z1(=STzb|;rXN2f0$G>|DuusP1n}5V=@bCBH@{6CukHg>BRUeJNAE;i4UrXf{p>j7# zd|6j7H`uUx`?@q1w{I1@O*g3$OQ2`xv$`_|qd}Y@u7w-g1&6JyV-UWYucIRC{ z`Hh|B+b4Dw{N`OMZihibOl%MDTJgxPb~Q#mE!RN3+}pbsV|#Cyi;n!j-op1&d&^S% z#l5{Jcd2;S-rnza6XcfN@%zueqT;uEd5`Z=alzi+GkYBQzkA6R>lb@_ckNa2!@a#9 z?|C;*mIc3_wlm!n>0n;hxd8guXp6K*@yjD&K`YLdy{?>}; zM8%kL-s|PNpe<&`ZBq}z^`A$exlhB|-T>iVQ2_vCyB%9`j`yaDbskq3`_os!aDUBC zVxhfaC-3$h-tyz=vSR;oIuxfg{(fk>{C&wz?@@)TGE;H&RI#FuPl3r;cYs@i9e4~{ z`VAGZ2!4A?#q(3-eE-i=ylbY$P1TM!#^Wc*6n%TdS;V#)LCbnVYfor>3}e`|-SFt}pW!ASeqZWY%Y;Qej) ziYpHAu71Uw3l8u;y$9?F-s6=O*X{59V$X`7Bm2E7zPX?G!M%_C(SF`X_YqsegZoz8 zx1aak{e_E7Q}%>81bfLTw@tww0XsC)TD#%o;>S}xi1Y8L zh&vFr4b*ynilq-88{mJ%#Kq&I~8NjdveE$-|px=Ct2IZF*GA$^q=yPsm}tvf5(akcJv+z z(BL5FS}FS{VXxg$thS~sJ9?_5A1EF49M+=>?{4|(3fQhHyid>!WsY{VEc=LEb$!KW zc9>#}L^yG}ZwK*tJ5M?ukJy0@si@dzhdnCJ+TpZ{o2N(v;1un1k~cQxrqq^LfAaFb zFSo$UE%0&+yxamWx4_FS@Nx^h+yehaEl`zGEw%MjO$SfcbodNS$Cy^0sh^vfjxhD! 
zOq}T;)1kBY`&LbDlMScgDDTXBSqXZ#LLD{s+sOfC!JspRjyJN5Gr4sGQb`?RK2Ok3~b z_n**o;Io>>$ak2(kCAWlr}X>5+cj-m(g+e1m(zl*aB;VH+M>kd^-N@^U<5%^@;^0%Ld&7>P9 zT~!2~tEe@nW5Qs?*3Yq>7gx-&v+8Y7*p_wecAboTpF)8QKBH^Q{g?-#K=8PX3# zlsiMZR?=l6=&mJQGwE6*=tfA_NV?_-x~E7NBVA(zU1h7be--Is5p=UiS4q062)b6% zd8Dh1pu3E8Dz<*{$BgLS%R?T zd76Kj|NG2gs^7LQ4eGg7@!;oEnQ!H|@*Dh1e!e3G@*O=w`41(7`FPdN=R9=_`>N9J z58y5;&*(K<$Is1U`10`n8oDAJ-^~5O5DfP6j5EbGy8!D!;=}fPemn54aK5AD+x)2V zX=NHxN#?<9k8G*ZW?&nti zuG#s0BR^ODO7nGp7xlU7n(%tW&enR2{hIaoji!xEEx!!EA7?tibdc!~(-EelOzrQB zO^4naUd}?lf0XqYd``(M;$ z|1eMNPnudh_F4UG%PH?emS<;8ObiP76H)L@`A0vkU;Kxk?dTup=T8so_kZW-y==!S z{PMpqx4?fz3pB2`Gd5VA80AMJ(`KfvOfyUem<}=>Vmi!pgy|^LF{a~8y*+heRWhw& z8e`hXw3%ru(+txArh`m}m<}@?VLHlmjOjR2j}yC+X%*8L(?+JvOk0^|m<})9b-Dq)Z2&oGp%A8 zW7^2HnQ1H24ATLogG`5*4l^BLI?8m6={QqwU*^xWifN2#BhzN4txPja2bc~r9b!7n zbcE?B(=n#wOuhY>Khr9vF{X`7o0+yU%`hEcI>>a0=`hm~rlU;9n2s~`W-x!IRZL?{ z8<{pUZDpEaI>2<0=@8RlrXx&8nT|0XXX@?G{FzoUjWKOx+RU_-X@=W;()jl<64Lai-pZ%%5o$(-_l6 zrp-)SnP!*{Fdbw%#B`YH2-8ufV@$`HdIvFord3R1OdFXtGi_y>VLHHckm(T9VWuNY zN12W>9cStt%>0>FF^w^8WZKNMm1&0Q0MkLHLrjO6jxZf%I>vOIsdotTXIjNH#2B({ZNWq0FCY71J2gMyAb7TbX8<4lo^LI>dCC=?K$N zrejRUnRRWX02RZL?{8<{pU zZDpEaI>2<0=@8RlrXx&8nT|0XXX?#l{!FWw#+WuTZD!iaG{ba&=^)b~ro&7}n2s_X zV>-^%JDmA5tzsHu+Q_t-X)9B!Ej4Zl*Prb7xLj@iY-YN(pPLcENuJoPnr0iM4N zGj%@?U99g%tH$B(Z)K6YHZ-+YXpUtN8CtoHcoWBG+;5|SQ;jo^;npa0{2CHYPDUg5=3n)7B7 z+)jL$_$kC|KqmB+tCfBZ@plt%C2sef@FzF)8RGJ86L}>VMm*E0^ef2!P;88ae^s3V^GTl~ zJ}NH`!85>(ySNJoeLLwdB)ynV1aHnL{awTtz__@nv78%;5B4j)#O0LdyTnI{-%mbO zXm8=u=<*>xv|0JcbxwJ{O8jinKd7)b0|QC;SU;IplYay7DTs(T_EwcYcHEv%{L%9i z*IYbYAXWNS;+FnCi#vRFf#JsRKR3vK8Sz#Jf0w0yOOXEhy~F3-PeCvB8miO^?ZNuq z1Ot)a!^Gzh|0i)zUdn+7#Wv4E*qMaBlDMTGB;M-aKO;U&+>Yle7|Mi?x0mv_d@ds1 zOx))CPvWh_*RnkC1PkFaO1z(V11czZ^WMt;`^4=$jqRiOUujUY^R%`>d(h6)%gLvc zc$$0=uFmj}9jJV6(4Tn?81Rzs_eg&X>2D=Ic97D`I|}4EdQYYQJL&CwxtVzD!Ajq$ ztL0wDcP}o<~W~YG=p!0i-{Hd`{U*^Bq4_`R}TUcRleIhyEGj-b^K2hyIYK z5#vqj`!3Qy$#%Ggxb-u$>i1`W%QznrL4n843pp;le?1JMLOFbl4TE$TzT2USTfaGL#~mTw%6u)K z4TmYciyv5}c!u 
z1ebi9iI0%p%AZR5DSn^$mq>5@7j3;RI#&6_zN!5G%<{~774dH=F4wu_`4jQR?Ejh|L_}D5Fg(ojzSk<= z>fm3C$+)fbhI%wZkeSEI+X2#F%JNuwdlU1u^7c32BklP(@hb9>*KwJ4{-u_ui}*i? z+jDG-cT_9=zsbicqQ4>@ds_LdWIOCzqxAN?{;h1!e&8}6$2cFY{P{ryeJvVL%4yHv zEuX7^3!hWh1m&K*9yN^qL|pL}+@9Zmj^+6{aqCZD@##>I44;2!y{ubng81OS6;H6; zt|UH2{Cup_@|@nF<=p=aEvN05DfmP9|NC{?VAh=RF!AACwSZ$haJmlXL8je!u4VZw z#sVaK4t#_1na1`%>~)G)y-@)>FItIP{|9;hw>*CZF5|6{ejsxc_GZ0a`B=XNOaCBn zp+~GGLuvW%f^#$}PmBj_@(y5m_6C9AXR$ncvHfShDV+YeM#Zf^#>0k-=iNa(!wm$c zp?T_2VBv55C@lSFi2sj6KYOmykJ0Z!-mNRo?}$G@dU-#tJVW!8zLNeTwtZGYa7n(_ zk3`-fE6;8i7=mAzm|{NvTNCd=;@1BIku}V7{sN^RSQwo2oE>4yytdBXlWllUHsl-~NeSo-w_N8#w_a~kQtLEQTP{E_(GO*$T| zAJC79zxPDNtv}Gy#4lP*oPI%%5P$O$Esyngx8<3>ROzkX(8o#tDTB|*`ng+rc?Y?S zPwP)~Bk32c3fE`;L;5}F&tlti#&T`{^P3dek9_VX{vHQ^W3$rVLY&>}T~7QS2Y-%u zrdKB-x1-+aE0oVSNe{KkJa%0eqd$&a7w&6O`k#~jaMJ&W_>>d1{GT9x;!2j2`1gq4 zLHyjsO8;6m#8beHp2_jXY3;S1toR7=S>*F0;4*G+S)vTMUGW;w{({^45$1pZdwQce$19p$m*yovZu#3}Z??*NzbT(?}yc{J;L@hRcsF|FQ=q2AO z;&vR)1upy_XjaBUrc|ESPu$)MSwr0F?X$>-#qn;AD9;DT$KEHwG&hgsbArR?C`{xq zJJRRLXAdfFXhHLIoT}x2HE}EFXMm5?%idd=!h+g-U+3ieCenA1Pd)h$oe^H1PS8tx zj$fhuXxsBWvwrqsN`$!DOHnxDi#Q7p8>XyE$3|DVSHW% z+_cY`S`esV<~h;uIoO-+jEA>G@L5kj&2LsdwmjEF(0_#VnN~Ic^Bp2S7Tx6_-2&%U1ZPqKJa`HwdAdwI2o2KmeH!rHkG z^wQ4O?`?Nx&}aA@)k0S>%-e6Tqb% z?7beHm7dqsM!Z`2Sp3(-hrXf}{UyKnN4wIGT&CrDjQPGgE%`!UqW|3y#8&{9e64?` z<$o!0>sN2_6V@ob^?SGYr-_fZXgPT-;>}n~dit|NqczVpz(udK{^wR69qJR6A*|3vw}jrbpdn{}4`V&&5Ovz3qa zFSY#71}^k>U#R2U%9F2<-rg6yh5Vm0_#U41`y=fAoAfdIz3##v2x@>{=u+s@s@?R^eA9%g2gkN37<`TL1me_`8h zcM*5T=W*vKpJCR^_TyWLS6!_2ZPUcNs7L#^)#=|K0hjqQFsSYJUbg4!dzF6Jp?@E6 zp>I7`%l~nr_ui=eW$zu@ez_C4kDPYivfXZ`CB)oi_gkYO z@)??`Ihyx>;Wz&zKDI0Zb{rjaYNTABo$0&(^K&$@7$ty$5RfeENLF-S#{Rf>Fv7bL7Ami8sDY z`-R7T-d6@lPycnx{|~@LPCh{Uyp@xQ3zWXmk$dL@mv*-Iwd{D?=R&0)VZY2|`>Z8C z{ylAnorzyz@c|vr*oK=YbCL2HJxAxuEaE#}tavN56nT0$PMU}hIO8FHiPGEq5cA0A zY~WIUdymhS{~e?sifg+yk^Zc=E1&VRHQ$}|YwvmBB3JEwZL2@*d#U2~-m~2=JPBO- z%ibHX_S?fQ6Z(Vk${iH|w*7m63%$KxVaNY1hJJ=;@5kHne~a|?KD%9ir(Ul77jb`K 
z`OgO~{2MRU_P70Yt)Yiqj`ut3*$xjAxA$XDB7XE0te2y|olD%_Q?T-X1h~|9> zAnR4B0L`!7>#tJW#yPP4zKM9NBbS~bKDJ!>*cSk=yqe|b_+P;CA9ank+W_y|TuZze zxQw67f?&NqMI3MEF_hNM{2ck%dpmaA9{wKX)6Dh7j>EHnOMgA;$n(2MZ{sq2ljZp< z>GyBfadiyw{`V?>S8w0VCvLJPRY@;CHoH)kE$>qAN(;{R-SH-=Z}0!=y9IqtRpAfsqJ9nfnd2b z&nR%=Gd`jjevSAi?@~VYey5%9j{rCHj=gBbrxnle-sW!Pa~W{yM;jmKY2ur(&FghWM*1=KubtP^zo>i$KcofuCF^wx@e#)!awG9dyjxD5HKbqqCFNt|WLbHBIq_k} zD}kD1o*#W#>216$OTW*E;+4+6U@h^{CT$oy-tGr36>ur% zI%i)0n)KFBdnU{I>aPkt_9>1&a|!V(NB;ku_&P`K9S4CR^=f6DgkO<=6L8@_?${qb zU~y-@JWW1h2Wh?Rda=(pl>cC}R@C;(8jCZ|ft4p80WR%e<7inq@Ojd^_TMrLUs>p@Mv3#9cdi;+sl8&VwdI@;1*I4=Fx& zlh*fDI&HiUd`s(P<7(M@t^2mpkF;w!?fUpu;L;A89ev~dq>u6bz16$!BVOg~E8p-P z;d8L}oYRl*050t_;>h`fzpJ<_pI=A3YEiHshky(JQD;2-o%A*?g>BCxMwS0C^>gdL z)B)Vg)3bHH*!CGDedCH?zP~o~&^Int`kmNcv%aVNs~*$#Tub~q;KJX=U9&&1ZNhxC<>J>f}14||9+{*U{K=G*GvR{}S3;|tm_x3hhoG4xnRT7%>B%Ac~F z_bK6Yt+w}`pD8}Pr?!}thu?l&@`WDGcxUIazn1-6@PoXY+0KN$(+!UK<;eNlflE0n zpU@27#BwekBhHSscA@Kln|bQ!=YJ!;jT>X-$;@A9c{V%x^BaLnzBYc8o!2A8M{d-D zWY@tbl-|an#PBxH$;1Z+g5^Ksms(D|1jNwUc|8nV^0o0RZ8^VBe0Ylv5ZhnJKB@H0 zrzn5B?p#9LJx{rfxGVo3vGk4}SoJIA@1B!31DAGgbjI@sNI(1wEhpWQy!$PF=Z>bn zR?m6H@WFo28E+eYt^BK;_W26&n3M0`ztM7L=4(6rOS{Kg4_wBfjYs$~?G|sJr&zB` zm2f@jW5jJdGu!^R5g(YT<-bF-@ecm2@)`c9;(yp#GuTW#cBJAbQO?Z%oz`o_sn_Me zO}{(#^Bu9#5WJc8%q=Yc8-B0#vT*{hCH|Pf5B8d!b*kzQLO;VBcjVXy4Gup6XTCg5 ze1PSwB>#DT)bfux>&5lJrJOcCE7S(_JVLy=OB*c1b{_ka(vLgq%j$8(8=ZFeJ8>JI zHbee@`m@$+|Fd)+9Yg%eXIM_&8`_EZqXx%#pj_HWe8peb&a^9@O1ulWjPq97?Pn4H zq``6S@U9Ajyps4Mq#ttR*sK1k{4?|$--YxS5+Aul=L?o0^L&E%nA1KB{+I3ZMjcm! zT5azR;4;qf0%r63e)2iyS*0IX9Fz~20XK5e8He8^eHF_&hkOqG8_VykYtx=1exH{A zD9(#M;x=xTt?y01CEo#OeEz}Um7a|wdN%Vt$2jx! 
zRsYd^y%V&fo@aku4P5daap-?XdK+hXPnPEmpf>I7oC}N)A9wWm6Q?NsQ`BcRlTR=4 z&5m69BXHB6j^3V`s(fr*Pust@0~h*1>bG`0{G9mMRa#MN_fGGi^y3$5zuWd7BR=qT z#XBgscdgKTA8_h*32@1Gi1vhiS+CC#Z*}BYYMRow($C+{uOZ@96>Tej`o zwF~*s@91mfzh+nB)E{hneiXQjtIM2m^*7SH`+|pd(|i|kf3%ML{|elU2lnqm;tjhi z?wzR(u!8udN4aq3sLUOyzgyDwgY_&UPBxkJap`&i!_flEJDviz3M{~8?jCED$0lfHT% z<>PHoKK8u+E{i+s$ZPji{>_wocD;U!!4LATb@cNafJ-@T{P$pLg3|=x2D(rGom& zalobgL(V#T266WsVGD8hJXhk>NWY9RU)wL!5hq6Y+qmp!s5tYEAYOTkwuAMDJoP}O zckL=aAnwkiBn&u4KCBMbtNLK&W8*H^_B_|%&;l^t{m%uYxA7_Mxc#`roqg`YL)dPN zYiRZJZsMcdki9~y=N)>e(vRPv6D!W_F9dG-m+>H9$wJ*q`q(SABOnnS^n!@sqM4QX&>(>s`E6%@3EX$5_kQ$eh6IJf1G;09sh?M zt^5b%f+U_{^0}J0YyZEO_~5IxJU3|Kz3mv~f2t$5A2B%efBNUV)8qoNbFAj;>J?8A zw{cLdK781#lzy1|4!iEp2QKsL0Y@+TvZdce`-N`YUhAusPvhl^U#W>#V-Cvq^eW$P zXO!*T*ZL!7nTR3HIZP*DCJX^Sg-K_c>TTZ;UCuYkyt=T;za#|G_WW z4znRBq#gEm>h(V0QZE~K#mF6~_DwDaP6cwk_!k`r8Sak`f?-z1RpVJfec*S%g?{Kj9f!!yJbTYk zyy`c~r+qiY+lkxv|5*8OjivvhGI*AJo+du{b}c`Tqr8XbvYgZtY=1pZ+_ksNoTue) zrT+hb7RQ@sa7a7G)o&zzDd{~&ul_P{sjn-4-aTLOaq87}zwi+8v9z{N2j|xw38f$6 ze(Ff(dm8a7%EPt9?_$>x9?f7cKMb-GPgzczf};{v-~3Cf19A#X6#D`DI50{Y8czaSu2zW|97b zzz@eb$vmq4K1}?6;#F7c0Jr*pw}a{v-17*62L_73=%g2>RK}!uekufqxCS)YltU{!6ub-k&1qcU!LX z!`ExQpw5{mPW;y^6@c1ip0j|5)pu-C7*)_05V`-LBZITDcf5;gpKOsIipbcZ!U+<*w^2{baM#GiWds~QC-mK;S zsCJuoF7UAOe;@+)R)o`^0o;s#9=uz7_$Ppe)%T~QAE3WeI~(I)#K&&Xa=w=1>WCJR z1K8KokHyN#dBDTUpN_yEi@>L1{7Lz%-l^^UWzF7O4m_;<8-a)A`@snO8S)=uzcjMG z8&1}ID<#k$o+0xu;;<1P#;MYNZ4#~U3@jOWWaz9+^JLcSj z`yb+!@6dLr;C%eH;RF9qN1xg4l<@XB1o&RyUwMe;+sk~<05179^B(91;$O1(ceTSm z&wNLT54~LhyYGA6@aKO$GBeaP5ws$597Z$0`CVN*3Rz( zZrb@e&G4<{|C0zl)6NRd_h{f@{1->iuaCgbkHD{rz&{;24{QG~N8pb|;E7g_D>{(w$#o(Whkdg^!^-p92z&i3Qkz$M?IOSPgOV|hLt!RPJ>{14#w_4tQrP|IN;R|8U@DTycMG_eYnK z{sD*nHsE3Hv)lS`e02o=&Io+3&Tu~I2>i+j{1d>XUIUACoZrp%8IPc!u^~L)<0J4E z;L=}>&OE&bxa3>;cg@%8Y3Fvaot<&=e&Av4`K<_ipKhVYx!A!ff9!eXn}AEczd0LtSUIl%F6}VnjFX!p=pQHlF~+;lT6o?+BIpl1 zN6XXrW*z5n*ECN(aIs^I)1Tb>Ei4BfR?hwi{I&@EKM{CsPdNXRfSZ12|6a;+9?~05 
zzXiCIzl!^ab)^3laN(1=Nad|PAN+R&pSgY7{_C7^_(|Yl?eJ&>{x{$z-;FB35-iW& z{o#BXfJ?tO^FFVY!)Hd&e;By%AL2pW&Xki!ZPa!c;(7Ej%y$KFL(f1FA12opU&=`Zeo?L1u^LH`!gkG)$9V%u{tg8mjm&qDZ; z2O{YI6oDWAmhkfTMc|(UF7smO4JsMhnD?RQYJFX~bO!LSe!nUL{~GWxz2!ILKje(J zDd&arKOVT)0je(6a#}mUnh5&aBJiEg*Ltp2^eG~91U%T5sPXjLFaQOSWzMRP~ z&Ly8gt~*x#JY?wYuP4&SNgumQ$Agu_eHyo&t)3_PrV_qjmj&oJ%MR{ks@evGpotphIQaqS6r z8hZHoGhWse%(wAE(lg%9LB!9r_{BP}aZSKHL%_qz`6b|CnZDH_5b~_ z9!U8I&>r&G{%QgqmhX>AKX$r~lLeGB7hSCLE5rI)ef~qh!}vT1T=E@q?2t1q38!y} zz)uEl>gDLA>m%qd0WSHvdenlqE8fWaV#k`+@VxVYhn4@=5%@oWhxJPh70un*f|G=k6HBJhK+2rvKf z5qJ}DDgUrDPu~Sx#$hA%7CXQ01YX6y@F(9PAMXQN@L?Vt{tkFpzx;#rL-%WatsH(7 z_FpNFtDnc-skl2%P696DWQ^yGR=#ZjF7n@_{BL4m-%UPM|ImSB_oqYe3NPo6BJjTh zH{;*gH#J_VU9^C=h=}eE4p~EuYzg;q$!iYX{OJgMpR2T-L)@3q z+~PG7Z@p8?S;hLk2Y6U{?geh-2Jd|w%65L7eB5)WX|U@_`;2}q*bdde!}u?az`ths za9q)><~@2%IG>jHgyTIC_*KBAoK>II26&R~a2xT;hqe5#ApS6LX@>#c6S4N&KahT$ z=d%fpw`uPUFV7s{!l#jTb!*@22QKAt&&xg(!RK4Rt3t;APsnG0@rdkxeE0W-muJ5S zyb-v_s|OrC=TzWQPIupNSp=UuNI(9#$_H!b`9%c%LGKSQ=bL~_yTvG%>{vP(xRmGW z71~k9vcKLH!RO%!{P*PJ-5VUY3$G0?&uIpS9gOnu6!yiY2>M%qOF4&mK6oh0`5^JZ z-BpnAYww4U`$9k9=mY;G?)pEx@`K^!yb!qX8KeEv`g`9TL4P0VtCHGZR13W4fXjGj zrk|BvzYhP9@^7ZTaW328b--;O`G@I3(pTQC4e$`#p*4d365x{Wh;#4z!x8jfkHDXZ zz+d-2Y-gU&TK)C{;9=!|7`R!-X*afZgWnmv67k)Y!%S9sj}I&VR@#B>_>Th*eCzTZc}%X3r&el~D39-RHoB@y)B0WRZk)RE`E zfPo^c9i|K^y}M65koW-ieRdp<0T1J|<8@L_+>fKYwfl~Pfrshgvw%xED;;~Zd`4?L`#=aAm@Tlf<3N@w5kzr;tlk8b6kW;d^JS9(WCKsqe^pbR4ExuiGN{JVg32>LGUg{|$Ip`_KHC@^|+; zrvVS6zZrN~dp;O}x7;N3=wIF=_%X}h3*7W0<0xAFdHKf`cl{jh1RhqNuaMsLbND^* zuyP)7v+{TCW3MIdt{2OI8~wrYL+B#C>+kVV;9>dROFrY|W9P*W$Y+4(GFE>$<`dz3 zRsffFYn?;An&W>9a2Y@2&c6F&5qw4>@O^IKICSnMy#~1S%Pz z_)jD7|Bb+B-m3Wy(jVp?&bx)c&G`Ag&I`LQ_^zS%6@-5td7IL&J4qYtShm|a#49%| zpJzz_8{lF6xZCaF_z4mCnGtwT1pZs#VdMOePlo6Fh6wz8;4*Fp_#PH(H@=zp*j2%G z^V`6sJ%=ght^ApFhvLm&)b_cQ?RG11*Z=b=;F9mydzAm>Y|s8-9arnPjt>%lCveF( z!}WS+;22R|KEOBJf@ARQ|4fJ_5M3 zgU5C7wXD}F(hpNUYb$x)Wx&Jgb*tsi^P1%>|AP_qzly*Q$GRo$;OeD|h!3#+_hUKN 
z6R&dQ^TJPumoo)i_{ZL>?RXl@O*dutmf<5lbeByJlx;WV>^I{ zwe!1xhmD6XN6`N@0^k30TAtxsv>m!AuWlvo(Qan@{fEHA%JWPF{yOOKB8Qvle`xo` zDd1+_F>ZjJzvls$ere@>WNWXx$I#=x58KD;bqhbQ^aBrS`482^d%MM*eeT`B!^;0C z>0@tJJ?$xuhk5s~eH{DYdBDT?+)jFTz577~{qH02sxO4|pC5s50dCsu{n~-H{J%2v z(5tx)+A{Bdua38Iwr3UR(M`aG&tOX1VT|pt*B2Fc{ZdW^F8$a_zqLciXA5wf7a^0} z7J)wmJZwDw3AmZ3&UxiAUkc~H9JtWCcFr}od7=DY^W|_p?Gg9@aN$4B^G&O-ewes> z&h-c2(mo@!kJ)j)^N5_&;r_nkXL}-WQ+~&v%1fTUtUA@#(49$l^`zIvtCI_tET21P zNiw-C8LzM1)RW3&(ml!Ewp3@jJ=xyfm(28ZxA(X8CD-)#0!j7trMvn%y1SCy8`C{K z9qsAV>dtg>b*gRsrc_URV#VTREy?8{@HoQe3YD?4VB^zoJfv-9{ z+gFpWZ&r6kxTiXs+7o%pT32t&T54g#%D7^2i!bYz>SUsQ_3DHIZ}*IC}F)xF8?{=Oi;6)0ui7OKwv zo_*Kbo=LW?>+Wbv2RUimhc$t=!0No&)Mh&(+XPyDUrQuYYu0phrIWpV{i}m{OD)&YY5tPsheoU;5Lk$H$7L%QE#%?drLDiWuX{r>#gUO* z)0IqhcBFbcy4EIqaxL$0Ety*p1)OaqK}Z36(tZ6sU6>f@&NamsvK)?u3mTv07IbxQ z(Pb+1<81aTf-|(XV?(C1sd;f*^8#7gvWrh$GLcBEXo)3PCYP^_B}HU3Bv+)@cJxAG zEbm#B?&(;wCE=T8+k$Vl(EeFjR}-9-6H>`eh%nhyUwD$#))ivx7k1U#fnjL=3WZsY`vVKoRUu!)J$-#%Ngax!s%7?TgbrUFpvF>P=r9_`qEuIVXFAc`(pSGR-3G0pN!AEm%@ZxDrk2?v zf$(f>7U7fqot7-o(qLp#?fj13OscPK-Qx6?mULgDWnO-@E?#ZRwy`6Li{LsM5DO5yZYa1E@^Dp|ehzGQEAe@|OFsR~vSQ*c73hniSO@60X& z$x zrdyqDc`3oVR3@$3L6A+}K4_U$pp;ovMC*vQ_iw23^Y-mvr03(+OVb;=d$ug-?%9yS z0La@s)p0wj(i<{;TcBpk-mX32Lp_R|^F>RbA%tFhGRem9v(ZRzYz zwPP#Yn{F}{%ud}|$)z&lARIOxpGo!f>Cb%{MQ|=p4E2=km$V~G7)P^`fz5D3GJjW1 zes*C>L$Hdq*?Hmwt>Wn&Sj7s6CzUvctid1~lc zZ=w!1%g!xLP@GN_a}rb?lyr8w%UC8m)-;_uy907CwIQ9{*t9YUvP5G3%C^qlo|Z+) z2COJzxLh*7E76;2>uhLGw{>oW`4KxtFiIq9`@~Q;KeN20DVaFA6M`H@&E{kxnTW?A zmN%rjAaY>pLypNgU0agvslF75S|CH?klXro5}GSiMyQ^cgDaczwS~0cB(z06+QKZt zs>4Z5@-|BSM5D|rrBUXyQL-v9lwPa)wk$(4iO`7{H-X)HJehxp#3p5k%$6a7tO~S- zDDE60iBei)0b2t(VEE)6AkF{J1_+uomTxd@dm&Qm3XwV1wD{5pDAS zL3R|;4BLBX_yq^n793%9_jmQl;bgiq4NGoQv#VJ&%*$k!b+=;|Rd6xQQ{P1M0R?eUNrB z$5J&NU41?My|8j5GkxZ)C?2B=3l6a5m^?oF5>MXgOv@RD*Tk#~C%CKJD!gABUF?B1E|psNiI+XH*|iOqkF+5C&qTXxv7 zWL*-bk6sLT@zQ}SS-L&f_)%S-g56|es;9#kYLdmI6TU)xfH(S6Yd}$_X2Kz&)jg)FhEJuz39-+3cKs#)$4I_^n 
zw>GZrx(O*bu1jxNnDLiFgU2~Y51a~{nsH3B5nb9Kof)GoF4&DZhbHkCxo#aVC?`=@ zOpN}nwf(R>Vkgsy8IR+mj;`KxPhY{gSH^HoOjuW>sbQJLp-ym$)+XVFmh7~%$aD+l z*u)Hg5w9khOtqg4Q*3fwI+fX;1?@zEeewh+j3Q5RO53t9xiX$ySsg5PJsd7f(~8?F z{4!stCJrri%K{N)fo3-4g+e1m#Swb<_HF4x&8!7RGf$NHzssJzY_T!(Pb_MVMZ2vt z0n0Z2!|{#(Jon{Rh-FXQA}szBVkod~{+EcY3S|8(QEP)N2nsorAQxhElHzY@2%x*;%4~tl1OM66}`wH+S)d9@s|VN0Qa5 zdv$MIeujBe)5zR%t*^DSO6`H5|C5}()+S|-HE~ZwPJFYMHKLj}Uel7=m~LOiO@>kL z{VUj3n2_^#1x>;$S8cN#%wR9SV+$0YmcjD*=NUMY(KdP5Z)L;e;XHqvwhGN}#lYzS5-5>W5Oj3Ijv+<@4OJp+)rxGH0}{i`Q5KI}p4uhmt}=k+Sb>+x#1 z?RBQ$Ln@B5O(*J^yBxj4lVYV$b#$kN&nqNXd<1>Ks`K=a-7asx!z!40R<1&my~sDG zVvTY})=Rd@qlA4a70N$X{)>iD{)(1y!nKtnDSvskf5T@7Q;67!$;HF<=01UOuME9< zm}RionO~Eq*42pVwnHxFrh3-)s;6iBmaf!>?yL?Oub)R6Iklc6WFnbb3Lk~%?GWkg zt*BQXNgdy&m5R3I*($83^gw5Ys@lVvL4G4>;;hWKEHOw#RKf~Y}JLNW44hV9V`Tb+GW6V$|`#AOl} z>T#oen@UYcLx@cyA9Hpfa!?pDbY9#A-j>b0RZ^&omM&p-JoY)|L~{ozO6>7VZSw{&(kGaUG##yx5|)_zumY0_ES$lvVtw-7R}0*|H2Ae=kBen zT%NJ=U)Yr|!rEs0v1wbFK=V)5@@!WtTV66H-6n0CXAr%Z8g$Z&X*LT>;??VY_mItI zA;j)C`^6A@Agk_$6tHHy1#8GxirMAg7bIjBT023>VSNsal)#T)v@IK^qN$EXP*tGFB zYT@ia4iN;ib3^So@p1C}y0*kXLU~xlW0MtfByT49LipGpE?i|IojMXGUft$IbTYpA z+1N~BF`Zzt_Z@E=&)Mn#o8)RtzxAC6EC;L83zKW;0mWogmCYI4{`!McAR3m&N^47- zP&OuF1wk9)p_gGSq)(*1C&E0mn~*?yr!%oJQD-i9npP~1E-yR(Q1G*>uE#YN3TT#e zVY*9$dL>ec+BKc2wJ5`q_$=JUFj9Q2{km7|ipBE^Z9O+jZJb~WlV(wO_j=oSfo$Wm z;Vh383rj%)&Ox%#^>FDl(LZ#C@-Ba)<)zwmqKr8ev??UJ>a{)L{!DgG-J5mSe z^vS+X+#-@Y7Ve~V8ZBPe*5A_urB9+|W&0#gHqqq^@1q9D8|{q+h+)^<-QAh!KwwD| z1vq@hz?IUJ&3!!*P%jvy%xLJfaDDE?su_=UbYb~Q z%3Vx5)J;^}nz%purG!uH@A8GhG!ZKiSVwcNiJyY#?{>*Xt-4nr&;qt927>f%LV#Ta zg4Fos&RQd3p73Su+Wx-YSyB$shO&Ra0Z=?0@c7Cd8WhunshbFuNp(!_M!In_qr+}4 z=kcFR;CL;XldU)pK_sOiv$5agj$*P_zOAlp!QKtysk>`=PYYuC;RdEQy#}!|a!1YZ zY^`|xN(mFrDB(>QKsNe=$#xd9^>v`ldLqyqa?Zy0L%S4H5Zr*A5P;jJe_Q(M`uZ`i znojg95-s&f{m)luxkqMUaXpa_bL2lcpYwTVBe0ZsXU~;T`U#8zsc7UNF${!FA}K#Y zIHm11W>#^`_|6ukAN`8I__e~Z9y-j%bYeqbo>iNWr*sihzah*QJJ&GgN}6OrEzaD0 z^G$wVABg|RL{sAnH*O_k%tkjjIuWgnAYSsI)e|aGLZ_zicPF8Y0>c2!p?YztVhk39 
zf?})$0fhiHeubrWq_DzSVH+$1WH#Y z(;+L4q$m>tLLyN@Xd)zO-V)ZyH0%T=C9IV~jxENl`SEJ3TRox)EW=8efH$_N?-8pqvp<4lfA!%fKz=-F8!aq8ut zY6+zm!qnQCpKeJxr9%^?8<*~4&~ls?Yt<+Hf#&xKEWx~qLD?1fNs_!@^S zRXwSUs?-E6MRaQ@mJ*->ib$e)V=5p#N*O)w;0}x!e3)2L3QC6h{NoBEAFk{YD>2&~ zrZGw#@c!|RKEeeatO$ja1vBkT&WxyJ)>L;XFI2@c7u6E_!?d`$f0m;T)#Ptpy?;Gl z?_$7%pd49})3ni}p_mpIhdVqvPyB`h*yOcmN5%}ht&iJbD-q9SWu-=S%t{Sw zX%fj{T7uen?L_#HKDd9II4o+M>gw;r5e3v$6FuGC#%?kk&trJ6)l4!ViMB3je`yi^r;p1*Swizj&sSRH|N(qo?Pos;{1)%H8A? z=up2Z!{x269S6G%vp>5Vmc^v88D>{4z2LbrUuh zz&JOrT6)O7)UVcb8UnnUc=32YM<6e=n#O&vxW2!| z=*d=AOWCaAmDL#oHuZR1or2m@ZR^q#_QHC-lM%0x`+=S5&5LmOknSPdvO2__=Dq>s zXo_3`XU*?G$S>qh{m`t=!8(`fihw>TM$44R>3b`ZZ<`!l=u#K72-7JhqWRA(7Wm5; zYWT17N>txh=9Q@W*|+$K1+~Ch=i?!nz&7VYd|U$?`!d(4?O|&xa=lhFfnOEL-DV1_ zGF9{!ay6c?Sz&~kGb-`YYHFU<-J{Fpk_|P8J-uWDEUBWu$SDCVX-jcDtJ_0)qc4`I zti?JWwowQOKnPpzo3Jim?i4&C$vv4UQ|Xn`7t&%Sv#qm9y(Fh^P9PjJy$Y`>)ioM|6k#^|H}rSHp`aZ{<*7cORui$lSOZt!)tp9Io)E}l z=|&{EXYUKWDaQrWLYVSUFZSyd>n;bJ=|+IZ77Dju*R6`IracnLH#Q8Tf9wl_5xHZ*lfgiP&d zwHlgjNRA;488*BBv^aaVGqaRpAA7s%?x`s7kL?b2-k=iM8Z zg9COn^@xrM*Cpd2VqPxRBprF88OZT?3xeZzuClLytj($yOU|xCjr>N4M8fwjH{AJw zgKZ)SKdpw1${q*hP|1xY_C)*?=Na|J?-qyqy|QD)PPJ*niAy#ZZ(exsCO3BUN`xK_ zK8yoAywXT&o!w`YAcqS`MQK;QOA<NUo&fUU4^7R$GCj5|lqe$DGHqs=aA znZUpE&DpOB_M5PpVmbBkw>!pPvFnQO#GzCb(4S*r^G(45sog@&@;PUY+hqe;-2PE! ztzT|#>+Pc%-Pui0xG&CDn4s+oQv<@)5!Hpv$Wd14a?L^8aAajwfKa{A%z8cHDoz!Y zLAOwj!^2-em$jA@GVUW-ys_=e@uu{a zErxkfZnpbMSY-TtgH8!5fxfjPx5MDCA%iii|M7EH{gXgB;_o66aMR&V;qOw1qM1mr6FGTQQOb?VHm9Jt%K#|dKXRC967 z*&Th=HCUT`9zEimyh*%Q#j$p_R!ofgSzPiJ(HG`g8r=01WjnD8=E{}@LFce|QWYKF zn~JL)fe8iM4BiEffvQ@6Iw4(`VS{b8ciC3>y#{M?#bvgNVWuKTk*v$wR_Fu~*6Zq? 
zbQ^B0B-0!R@W&FVI^hmdhOIidzxQpK9j&f(*6n`1cf#sjIwSwO7|yd)>i}9UI2r?U z)2X=`X{Aa(f0>cn#Ap+PPGE>RCE+T)UDtA2h}A_SS{7xONVK<9owZ*`nxNMIRopm2?7);DX#z;VI%TRxFl~tdwOh6s619*1Uq?l+f|3XEc9-f@* zIA-$kD(&(G3tI33!{0jq;Ok?}cxWT)}!B^zp?Klr}#`GojX_DT0N z$tNl4)u3f7PBvPNw~!q^?u+!0yDp2IFRu&CLJO$u*1UZDO&Rt3f?L z9%0j|mUXCEZUf_X(uVGyEm?(6dE{E>}IJOe~9Z67T6-y`>Mz@VZ0-=9`r=S;`VBS(}GXSmhk+ zo61_P3|FbwqI2 zu298r46LXyl>~;SWrTJOMk`y)nH1^CWqKOrjJ}9w^GxtF)5u2ZmZjVVtVgBlaef`c?R>wv2j2zv2`_-1EW1uyH7j>r z%5g=QTKtDxw?n1fnEy3 z2?ra286ex)3DqLUR$Q*<9~LcaR`6@*_DRl&jxveSTH+MfoWbX)LxIy2`04o8ZW*(Z zXL0(9+kpWiy&WidkGbOh{jXa25@tV?chfwpA17{U7=GGYB2KTv&C@7vc2Gm?7Pfub z<1NF^NX|+3Tnbpp)IomYfJ#3{AA6(o)bG&04sUZ0CaV>jh@-ioWMHyE+ivY`71QKa zjz=7~)vIG~CY@UEdi2H{QoXRj+j|o&I2M&sO-$XuSimKkPK^38EiHZlAv!evm%$Yo zC)1c#zoOD;3tmiojGcqKdsSRrAx^oBl_H;c=M19r{L|lns{43qG4}9 z0KC3##A0qwuIa)FRbPKkS1mL==4%?tKM?7r|v=mn1|V1L@$ z!*leoSu>mgyA9&GOHSLUP2IJ{*yXSR@DBrM_lN{GtC-+nSu@{U2_80&_pHTlw+#OT zWo9`^BUl|WXrs%00jTxiHAgvCa>Rq@y2i{pVHrViEtvO2FD`oK`aQ}fwg{ms|3vpq z`c}ZNx1i(9kzK_6E@4*qYO&3B1}6oyy;2vP;1(!&QaLH-STYdG)YzOpF`9iupU822 zu`WYj4n4))k`nZUG_ZP)W>T7;k&rtI#Yr&?G77ra#3(%;AERu^>|k*crs-T zXp6dik$&Uxp428>NtTekJ?f88o@y0w`eN?b8`F!K_Wl#j7t`Tqm_Mz5c@h21s-VT4 zgvw4hd*7y3_nQ8V2yR5m*OOptG=pRN=u7#Bd(d|+9`Ej%C#I|gczKJ2@`xqR#{aTH zPAq8d;mL&OoY<$ZYY|T`;}VCvl!)-!*Cf|3H!hovd!k+a$+pd#L4&uc^x&rF#Mwf# zgq)NP$hzL;5cXW469PO^`NpIm2>e7;#M zg2=+=WEdZC&1kM0CK&=%uFB3cS7l{s1CO|S+-v4}P}oN7mS?vi)^)Nyg#e|^8=Dtz z%m#ljipAs}*aR056Q3k&e$CZAWQ8vf!^D_mJ7xc+cVCSUaH%S@pNYtV2yHp!P_!{9ffVGy2E^f**Gb=9&AY3nWg1%R3#714y-wVWO z7)F!m-k9#`ft-<75e8lFpitsJ6vuCMLaT~;P$%kvojUm&Hz1XZ63e19nyo^4rq!41 z#w@oB;fsi6p|`ug2Z2F({W9NsC($F5aPC=Pe^s1e#0Zth5mMrUU&a}jPh12RXvchC z9%n8i!NWC&b1_PVe`?7MPTx_T!%dYy-H}X$FVrn{>2;8RP0>dhrIy?;_kxVG+EA_& zp(0x+ z!1>oV--3gcK3Bmhylumlje#X;b#k0D3f!W~?%U-(Y{^Y{8x>wx(sZI2bzsvl|1p5- z+&VGI#KqMbF!p-pKWQYp&x`P^@oWE9!H)2yx1`P_(y{W=f0Ih}R=r6@R-&?N^ZCCt!r=W&5RY`Rf_B|@I<(RwS9m8ekVUBxM&cltIk zA|%{%0%K%ylx=gwyH#Q9Cd9Q7;HxDrp-;7@oHOylZ3G%4R9&j 
z%7Rfe(HPjv7I3pOQdO2d`$nWvb(H)XkEwI1D_TxXqGv)e^<#KF?H?^r$RP@&!Ar}B zTGIR)rL3KS;FrZ|4t8@}Y7_h0qp&WUGs3dbe`Fxq+n{z4E!MTb>|o- z$`M9RT#z#5pnn|5p;V!{E#nQ3)GTnMkbAIG9oLvV^KqPjmys^-^53#FYe|7y ztR~LGRxd$;(IggT^oy^V@pW)9K%)nZ%%O1%3(j603fe0sW}(bwyJIoSRPMXR5U6;| z2Du|QIdUqy)hcO~hz><=VoDYeMFox&$`qY}6HL{PpaT|R&=->_o_woAoc6;>YV(jO z5KFD@<*R2+toQgV_2_THDW-(_nJYIWQN)6B9_Q#1dM(lHgv?7}5YIMo`EW*#tG}~M zY_?>FyjFeN+dUx{ikS^Yk_37q+M1wN12NT@Q)R0FWH$%4KewT3D$>p;CUB@!vUf3O zd#U=O6SB@Vcm^k64WmctvR)6fOggnew=%k0v)Wyhg)2KZwx%tMS7*3IDqZ)8h^1dT zBN6z{4a9;5<`AtiXNxFT8*9pY2jpZ(MHPm5{5mjz3kggU5y$tc;^5erbVHIW)lK#V zoloGe!>~Qu!}MVdM~;mVUSGO-dHJ@#u|Sy_6c&)JOqb-q&~S!!al@D^O>*T@iD@@K zImr=9vOAvK^}ep5)br)!YQ$>Wl)}m$i17lcI`N;geeiu{6n*{dTd;h`(1o_{+aZc& zP{j_&+iXjUQk-3_tbHx8bvKL0{8NtaNv}Cx%#oOsH9d${-?1T`#A$tZPq})*F`>3| zWa2yRFK&!kBWKuFKJdCjpTRXdX~Rzst<5s1xYpa(Y4=0!$=<8F2Cjap#!5J+g`M52 zQ=QG--TIDgnw`uRG{=WKy9fup#>;+pqT_Oql~ASzbm8~4?CeZ`?>ZyH6U(6lO3x@| z2db4jCgL&kD%FTM>ee(_#7%3z2r1{tK-9afpAjKuz~Y!X*FJn=AUptAs9irWS7F^L zZj@UmfHJj{oQ4mS0v{$k@+M&$nbpGGEr*?0lVWs*ye{du$+fS><9*%YAKR{nGv&lV zHH)D6`AVz|%dB@N!&a&lCs*CV>&-L$$u+neB3hJaP{z7e3;ANKD~nQH?Qr31_DKz_ ziT1vR?;2&4xh!Vl@3!MANN>DOqUt7lQyay5P@MNAyGimoH|5Tj%Q~P-bZx%_x_tII zpc_9^8TLibWxrL!*;KoxhH?g$t4bNcRE)~b8*A>knq0fRf93`ztr&)JuI^Ky@pVvZ zwd2vc#M;mHL_|L06s*uD8@U8MjivVz@%1^#2p&Ub)&-2eN z!)01D-71ziFlB<3S^-%J>}Uy99m`;tYrEQt1vqdgJUH3|<#h!^UiBv8Wib;6IoobZ zo2^G+UP~8b-Q&f--#5d>a9zxF#jmnrHc@7D0LufTXrduX(DmHW>z|+n>!5};NEy2w zDMzsTMW>lNdD^+e69HQ+bPnA2jbkA0b@Gjya<6mkOZiox5Mj7H8u=>FGR2u23d5W? 
zqF?;u8=E=AwZQphdt!A$Zu&)Ee$nGf{5wZNVa#PHj1rWJ=pZ#&xn16Ua(CpHA-Bzp zv%A#U6Y=!KFcR=WDdSC3TyQP-?3K2F;i^3C%rCnF;@GyRnUcYPE|@KPlcl#kgL`w` z5}Z23sOSt7b9)7fiwqQzP(EE*L`7C9<>ei}p{vwmBey&Kf-LtkY8i(XR+vP%NQK-d z<$Jj0)IV!U2noM3(I_kZ(O1gUxN^j)oN5$u2{N-(#7TBf7`iF;3jrBKRtG6=b8JmO z#qQ!KlnBR}FyAnpeNI1K)+}+&u}w?jT6Zs^Pz44myo$|6&hM#L?&|4pZ0ThiGEpy( zPE-Kj1+}2!oIyzCs%FJ>r_VvezSNSIwTa{76)}R?5Cnxh z%yMm3rK+yZskR*(PB2$f)nNDF`hagu+qxIe7p!K4@jPI?1ci2|e)3r<#q7zsY_L=a+_U65Zji>v3z@|$T!xOE|VQw zb}2i#;w|zbsI!B7_L}YWrVqO-JlIyf0+QoFIj!y}^K`ma>k7zCJ=^j|pHl#9G zd`o|G(&}uO2E6D1Qeq-ZgM6N_k}ak}Tki7V>MSLype6VLxr{8bX6xZix>@xv5<oI7cvPLw{V=yQev8XmhCqXR3t(> zGOr9MN5(+eKUz03`5Kt9MDPQtF06}%3Em%^(d*{rO=mx)I2%_ghJB_5&#m*DMh z4dR_AbrdW;GRnjWFeicY)BG{*4e$n_#>yC=ccQ+3YdJXkbkvxreCL z4IzQ6&4rDQuC;P5Yc)4fp)mCtlJ-Sc47JkJgqUkl0beBKjOhma6T|Ha*QR6~fttm# zE^l=0=-SwATx44MW+&%#X_PcbR@_{-3fxwRkC0Ub?O2{Lk(kU`VN!gV0>e|gFwbF| zSXWBcwcg|V@MWXTv#RqbfjfCG6=lL%*@W0et2Gl-dmbn)6vzQT%E>-G>zDx?Rb|qBv3p# z@up}i;d3oBb*u6E?e%b=RHNj?waIceGu8kb*ad`bX0~SikRcr~Y(iuPEN5leHj7zf zkde=hDf61HNnF2eD~GS6MX8a+z78q(Qe8E4%)Xxf-j0ptHI2zWyfio6XV$c>x&>_J z$=39jSrNBx0$UHTe6}QGY1LH#!y)8+wc3Ba+O%;w7Wh~)UY}f*f&r&XRGjXf)Y^2D z=zS-ubcnYg%x>o@2)b^>;lw`Kp0O`EyAHkY7gM;6W@7guU{_*e;^fZduzw{_gaN35 z=etcO%Gz#J04)D4$&Gy`K-r_9;v~_eZ&3UVit(Q>{69{ji_s*EV}*#E?czsnUjk6xa0)7Z!IuCO ztZUYZBfCAxas76OXmRI#)){s}pym23bL$H0Qm_tjjmYR%U#p!}2>VT|wr{nx3{(&w ztk^;C7gl?WyAPHop0DLfkm`8SL}gwB_wyvYM&;%@PAG!{1^J*ly2Zd@rzZ(h{hiIcOasUNepoYlRq=|)CX?gbVGztRPueE)s2DOEN`d5!pH zKsFGUwP2Zg$n}~RSVQqkP_w=iVf(PIpcusgIke4AoM}we?v+NfrmF35$H?wbvK_WH zd2iiJwK`0B(Ubj}Ye2Zl&4Cwir=4AXLp{N8wHJjgc(ar*5a{UY=o589I?FDDxt5Ww z=~DNA0u5&_jQSK2C|FzPwd$7<*;<6Wl+{mXN9AO%v;p~`2a{#4yotcdn``FCQ6jb` z9)%MX)?UV5waiMP;Wdb=4LXFkjt@7V?;NUvqQ*#6>DWnbDr}RX=vz7HFEVj8l3qCT1bj1_IP>`|*N z7zd@lHPEjRhaX~Dr8r`Rsh5p=2i`gJf9+jMY$RD$HGdETGk}l~AZVcx0wgR)X8I?A z)b(+9d+KME-8~})6rD~-*;$pDk;+WF$|I154I4JFKp=LA4X}a52)469njN#8y#2!Zzi91#>u(DbgG)xTVlNSe%Oh zb8>g3e1hGJ?(}HVA9g3#53mDS@n$A^S!|u{o(ZZ9;2QCDIgNY4=WZA{C4Bi7kgOk9 z8`9f`Y>IIrUnr606}-( 
z9bc655(sm{E^&`Cz_IzePv6fE-pk*4e(%Xa{@%mq#4~O8qSIiRssj0Mk#s%=g()5uQq&Mxn1NYVECY;1COY$>mZw> zX2}{7mVd5K$fTmf7dPym-;2C@mRHopU;HlBhGoc_4MIglz zt(a!{%`+>=?=-tjUZBntA4U+`Id?qn0m)FYO{YUbGBGp3D08x?soqodcGbGy4@JEt zV@u3FG3KzVh59=0K=ejS0*fkaJkBL-Z>192(EQ09Yxd(xo?I9Ls#vQRAo6$0Te6J|1>f;IRvV*rr-? zbM{R(J-f+9s=W6#m;It}(FXL#b{K{v@j;?3iHE{&kOH=V3n7NdOp!Pt{AR%+p`M|d z<&6@;)w?&a4Q+(lguUH4z7jT={0sj;!SjuBRe?hr;bQ|$9)lyA4%F;BHSQm2;&op~ zwmZEZ^)4pm2!S9j1~d%_l_7TeWMdZEFezh>*_AYwl{K1hnbw%ZLZ(H%GJmf-y^x?f zCCtx$Y_BnaX!4rv@}_;(S5qTv94^S0Y(gWbV>F?$0o3~OuR~Brr27(G3wV5uc%Z4p z28#Rc6jQgO^3tU(c}7(HZ1-d;B%jDi|3Fu|vv+Tp!&Vn2?LA+gLTgh5Wh(OS3&fQd zz~e4YV?o1U)3bqRJS#`ZNMy=0LMZAPrw|6q3w?J7U*|k2&??{@w z+lfb@0N)eS(wcc`w*9;qmy_AO(b+?|b7w3s4T@Ps?N0g9LZL-Y`oF*`Z_g(kH?3PV zOEe>}&wzK@&hx<-eQb)-SCoCOGKC4}@hmB5X5DC-1Bb^MS|pBnYPQe7lCaww!kSB9 zl7PW}a^||MxkNp~t`dS&IJoKmw|{27mc)syPe*A^Ura~~D=#$UtXHO<&X5ZTFtCa# zE~qw)ZJeH>aN+srGUx8cC|rmVC&-k^`);SF!y{l@Q2rsMB}gPR9ooyW@=!}iIcl;z zV_*Yh9jkebIJCQ}#=dT>Q+&zk6w*zTElt}@nnJ&1nB8R|7pTWJaA6^yICxa3TRC)5 zOfdY1VUZOV%%?@mB$YI9@e+g@!{-I*x+=k^kwe*})V1Sm57}J#w7i@k+)K6ATy{1y z6&8afX;&+h)RuLLojeRmImMb1mk1IsPbBL$vhwOC`s(5KGSv-RIc2_9_@iPnDs;nv zJID=mk<`A{I58P4+oj$Y@&_{_)+MBnMPL>dxl0+Zyc*WZq8U|Ar&}@!oG#1mX2@6M zRE6~;?Otkg0#$cmdN;CK!hs4c)w&J?Asy7J0_8WuDCTO1ivqm3Dn-Tbj_a7>E()$t z2XMMuLG;nk$+)e8#Cd;oOlEf*0u(h3yb-My`$PyYFJ@)TC>S zmE&|GU{~^Ztg_tH9yRZVth=Bjz6sHJyZOyA(UkLb;BjQq80r%~D96_*_PP=~qR(wr zJ1^Bj5_lqV#d;8p%fWZdVvVEvxw3G|$x(?^?qc$8F}pWIFx2VgteA?l#}dW3q2Mi} zAW%^56sj2V<^o-;l|M#0e?Be;{oeIKt75R(*3--3X))oza0eO-eIJ0(FvU zN3yEK6xqkVZ)3H>9oiSv9ot4@`aOx>8l9hBzBufmwOJ1j&S#Tue>Oer@%El)uz1V9 z-qSB}vVQf!c5hI0Cq#N$`C&Qbzgc%yy#4Ou!}|}u@|C>3*|9}n=ywMpB)DkA9d?k@ zw$sBtkuc!#E@-J#RJpN(y{q_Mj?=pQu zo#q_#tka(!U!KnJ@!5+5#J_-tY~@GCC3}}fe=)MEwKMKcrv+o>&^wOrXgbCWj%I^v znOJFJJHtGqLg9<<g+b(lcnX-F{x2_orAT z2pzh@AiO^|H8ib2O0bZECf zy?@ysaHPx06G`OUKqr{csq>V#Hd~w9c-`6H=xP|WU~^x`V!wFPE5`4hlo6bt7$6P%V?Zms_Pxtlc|Pf$xyU-BB#uJX(X3>SL@*kdw#pK^$)-b@Hv~9g zFcuF^rJ8Z}un#5$(tlK3`(YEkztvGcj1R2UwLR%hG`=?5((`O3Dh?~HB)3${UP-wk 
zto1oR+f6U7(_>+WW6CK#x~KiC3>9}EUqwRaGqea3imZfSHN_2ZhF-9FTHvIw@W7Z4 z+MAtx-hW!oNO_W4gmptPB10&YunQJ@9~;PJTjvZ`&QH0S99E?F(0ExT1W&*2`eOoK zy40Yf{lv}-b+xFz_+)Z%CP(wAqm*U&$t5}_A)SYhhb+B9@-rV1U?lk~&onHSZe=^+ zXDOiCR;#u-fO@g`P8qU2lS)~pWA;KsR5)nf(~apx>Go;wq8N5tTf^=vaJOYR6JvEQjF1`*~GRRr_zhZudP~QQ9NM*=BCwl#at`??_|P~%|O?;XB8csxWx zy2%z}kmyN5nOPHMdSw=LmGE8>Nv)lkl>7(7U~Pq7FDElbr3EM|C=;|T#VgQ62lc5C z@oUH%3vF2A4=yKE@LUetv2K7ac?t2}=yG=@1*u7e^+S_ZX4_Wa@MwZ@E=Bd~c%4z) zUn+HtnTN)`6MD{YZatjKK04Og}S4Gq#gtxQ22A%!K1z zK-hh3iy2dNN72j_Q?qGDYBW9BH5kGa3@%pFsIr6j0)eo)9d}TF6kmE`b#$bE*E?!}q#{NJb3w=_I zRN=n>Q^dFsbqjQwa^R2*!#P>iWUV}DEFR1i3y=v9s<}6 z;i;WN#rNh@v7BnAVmz*qib=sr$q;FJ7y(S3zy0YHGa>S_Zz zK6t2~5;cSBd?*Tj-2PfIDG@*5EP-ck-UG$6(!s3oCAGADR@YR4bfrb+TEj~TOw+wI zSsw0+!OTY|ps15Ass={xbtHy(#hG>orZBm0M6Vg`In|C7sm?km-qX8ya8dMLy#q4j zng{8=#6P}=n~AqV^DTTDA9a%fry@{|H69|=?zVDiwV`kzoZ_TZrA|bFXs~7$!=^0I zM_BQ1`m;ch-rV&q4#yyD@uv14Dr2_u;OKpiRB;|UQ&c7up>ftdibib%Yj04DQgXmk zI(a#swozhFAz4vdO&RqAtzNW_ux^2Bblkkj5h}1)g9H{E!~AhC&#fQmZ`6`HbU0jz zu|jxb0I+DTse+P$O@b1nEs%u@*X}7MFqjz|3L|@AXof0=y1Fyq>UA6<>lo}Nw@jrD zEGt*WQcGBR1Z#9T7)Un`LCY96r|B4gvk!F;^GZ=m%Li};j5dWUrq@7Xr&jCr1xiJ{1}?%?$-R9=JyB)nMshiX9J8GnOjK4> zJ1JH6aIC8`Yitb{&-Rq%Em+oKBrtKdOYD%v;rb&WH?(A=MG1Lvh|OM^PhlU-qp%Qa zm{-@*Z8VNX?upGk>S7D^XX7rKYaChm%Jbp{wMJ(-dR@^OjoCg*77tOhm{!B6IEUqR zMZsXR;SkH6S^2mG1({;l&hKAI`A@~#AyVPA%~;Y zNgeR%ngPgBA3*pYmns9Wcz_2B%SqT}0%3#W;f~+f1cHkj%2R0nf#)J9j z|NP#bHI&9=e*IfOEkB`M=nuiQ7{;D(|m_ny~T7l1lP3 zctr(L!E^-?_EDKs5EJ9aChJ#q;yuU|ynqgjh13?BO_r}<30a%COdnD%FV4=T6l$=_ zeAAqe*b}c7M^rd}`aqJM;Ya}Ve4AIdtlQ9I5hL?j}@%si%6-f>SPGfbJwoyd$Ye!h zA3pIX2n2w)dve*E9Sj?QolpgaxpO|dP!%{qf%tw>yR#o%Mcs2&$%0===BPrq;n9R& zHOr9#WQAhIyVD80JI7LY9w>NNBp_xzI9X>NPEDkc%rf}1r^T_L?i8r&50Gy0+q5{d zHVmTv`6ySPIaYBwQThG=LVi}zNeL7Kw!-XMC#O+;NFJTF8@AVAioE9q$qfQXc1GP{ z5#T`Yk6v>#hv&r1hogr(Jius9Zl7gdHQgb3K&0XSF zoXUgPkAjtfs{pptDivbAGHF{j*ml(FvSPzYH6;PRnMrW?&d05<%qjSi<sA*6z*aCJ2;Z9KKz8(#vGnZ+@*^0=_DdOPr?F#~_cX`0+x-nkbf{%J|zljz}U 
z3~7rw06^u7EMpsr5gh=!4|po2unt5}$&8@kb!dqjmlE!U@#kc%At4lgcAs(7tx;Ky z!&IWAL0j-iSbIdHQ20#6J75jx-xW#&^urdlsisyenDihwh0SQxp%Th(wRY0! zHNL>kA=6iR@uujhXIomdKKrFn=%TNl!_*66w!%Omq;*{nPZ_O-fDdJ7t&H_qbBqa` zdn3dtPjdiRPS>Iw&f2PuK0AFG<4#-=m8azJGwH}NVd2(Gwp&$1%cs{v+AoV%BbT4K z$+33gv{)=9lBQ%0Y|rR+KWz54OMyBF1C-A9f8#<3)$biGsG`>e4r@eRk}74wLSxL( zDZ09CmlnWC4PFBP2CS+22B_F8C^!)bWqa(%QgL^O9l?Z%>9u>+N14}d6F`hXu`a7s z(K@;Ykpe$L)w!AR*GF8x!9H2r+IL~{+eoLpx02YLS8TZL{z@%a}p|cT1 z(-5gjn8^h@5oorY!ljBVEneF5RD2ulc{#I!H7_LtAdSY@L5JNvjy8G~zKr_VLRO)A zDZNe@z|d5ZNBFc*Sgz&<7d(QUM4$@>$dfO)PP12S=zVHK+;D%MNA3Q6kD8onSR4Ad zMvof3Y2Kq|L`TBPA8M>d-o1PyyBxvaX7L*?s3HA1Og2#MDQ*>L%we~NfwiL@h*4;N8!=G)M2 z*;0$trN$@;QI}StV)*Zk0S5~pk(qIDrck};Pc9}p1TkQrX{aLw)qyVs4d+K-D)T-B zYzma~zU{1My^Ai*cAb|E-Zo83a(sgvK_kc%TXb5Z1jMSoL#F~=BLNt>`?!CK7`tnu z5nWTd3rZ+hg#^Zz*jw&O3t%S3zc92teeoe?o4beHhud{e{Gf1Jia)0AY)yj`g>sNe zI6~+SOdUqM^s6Y1=Q<14=xvRdf+_oqEG&C49$O>}4R5d@3lM%b`2dFOiz?=jz!T%H zx&z-!+_4geQ-<^j*;wl;ye()3PT>_AtW5N558BkU57o2~7c_&aRgKs+jGgqBnwYH} z@tEb8qv_?CE?k6T&p^nCIPD%FNQR?9e{y?*bOeU3$mNN=0;7zyvj1nwp2c9*I!nu~ zQH>Q#Qw+reMu}Vr9DJ$;zrpp=9Q@`1$Q6+#YVHbCRn(u4+RmY83!=Dox89Y&EC+s^BY9YZRCe13^~vgfRiT=2IZ(YA+s>PX;LBU z9K4|&ab7qK+id5?T(?j)g7QEQzxeMl_LWZrst%O^YCOU^0mAo&V=tLpY>9BwNbz;@ z0o-HdXhaPszr8O4^H@0}xFVZ2HvYJ=8g6tKjw8Qg9KCPpp5iq}$0%zBmhmJXc1M7d z!KuO>lF^lCDE7nPBt4CY4ayuabk046(E{Z-%lgxMR|t!#1j|C{ZnBX~EDWQR+3lFb zaWEw=s)bGpmAIKl)3QwE0tjj=qTS@T?%@><<-l#5MDik4W;M*(ay~iCn zrAiWUWUF>6vjPOZt;mi+SF-8e?4mzyNWJq0b93m(TBGQ3?eyNSr(Ohq%R1U5W zU?gzw3{|L4$3VopGpKoV6UNkz>#|jdohhvxo7;?jqw$DEk0yn|#ZW)Yh!))%&dr+` zmkyr@eDTtOYI2tWmPvQl$H|1eX<3@#A*^Lb&4G8svFp{;036g9G!&tlGWCNlDaSw} zM!>u&wosnArEcK(X+cPQMHt4DK`ftMDBsL2Vfetg;ozIl^bn)Ro1J1?I;$|$05~FR zMAh8P1I3{iRa^suIt=xaX$rW<$|cy8(&AOXUQ{s*9>SN-DTO>}!By`e!N(6rSSLY} z9ISkBS9+=V_=KZoTk18H%u9IvED6(7R8w%aBV)y>7f51xp$v{Ci{kBW-@k|DB~&>b zDX}i=kik}uDxVsGvMku=-7j}1BUriMeprZp<7LT`$dDmAapz$nf_>~gL?TR~(OGta zfFZg4T?B85sSWDPxCPNNAP!>+76(8^BNhkcIg+Wxo@(&cvag-j=&Lj0to43kB7A{L 
zBz4sBwbxNIjY(g!bUH`}t*OqBo(I&$a5-Z=0Znhq{<3vDh9iPr!XVLJGO8DDw6d3i zoTJwf+*|8c%EQb*D~xU>OP*v;tkTH!P*K&nd<5i5lLOmPE-*S$At?`kfea=;@Xk1( zGz&cyjzK#Ng)Ss$Z)_rfB}h|m_|3~f!DZfDkz3$|YtBLG%*Pt#R5R9y$IZnWeGZE0 zP3&B#D<@oKeaEEZ^rXASfJU#L#=AB{X#g+_&CbZDvI))Mb*6Dm&czkY35I=L!^stkM#)D~WY+ZkyoMFTux< znA!Mfl)YQ!=`VwR0&|R09I%~vel_ebNpo=S@w&Si*r>Ui0oM1=FUw1U>SOgda>rdLkX0F64QZda-92g3QX`z>P)H=X6Xa@Xh+&Fu zxXyqImI`w8*$Z-f=5F9~1bKVDAeU3kf*g-)6yz$zT{00!o4GwV#1)@a;8i9yNrV<6;d30<>lt z?J5jY^t&!*xrc_N;+$R%$MB6!kbz8%g#vJGcqw5pN%DfzhJ}g+8EW^+*V#5>aZvTe z`}4#`3S39|`H;mBkgqk%B?N-2B2WSERLorz6{#ot^MNo-N&PVXQeirF<% z+&VDcvVC6@9>99;&}o@+s6@pba3NQsr`75X|fP zTjV!7HM*}H1(n7(b{NW|CbacTC_jpJX1~PpNTdj|yq;muc{<|jPr75x6~oWBFd%k- zb5m|es0pCXIT0od>GA>tpDC%D4Zr!~f@hkg#R6er5hlX*`)@E7K29^jX*qNuLgk%c zp|EYxtZi`$OJ1DWOwwF`sCDAPRu~PK?x9V{X9z$P4t#bt@fx5M?W%}6)PC%CQFt3{ z1D75u?6p?;$m5otlex-v+fpV++#7+W?-!lRwT>i9%IYJP9~x-HL>mzBeZhX2Oy1h^ zA5R`1lLbzvs6fOAIKif1BUU{+P$5E`JcT~-y1RnabmHok-^0=3N_^r{JVOntQYr$- zx?NI$5ZXt7c5npQRpsu$0j|{zp&uoT2Zo7oic}(jRj-g@A{v4)&uj;dXBNJ(vUo1a zN3K|)S&>RLC|Rc#?M4Wt7%*_juTm*e)c+thv6W3P&(9GymIFYnGR-uKNctL9w7~n7 zCgjYvrv2wAtu{e@KiZu&^I+`F;w?`ZH7|V}N(WH_jeZpBg4m9YU zfER8f%pgjI!84d?6qqN93pu@Pea<0tguV?}V{kac>m;rV?Fk`0rG(IF_*parvRfDe z`T3hS+)if7NGoL0chYKNkL{ycUJepIG&qC{~vwl~lhWBXy!3J0K(c$||j;vb_UOF&si?_Z~hju8IMQz#uJ{ z^5FD3@1MORasjFG- zoZ+bLr7rGR1KFdo9eIqL3KpqWa}+MGukP7erj48|<63O1;&!x~yQLQO;N)me)2Yku zjY@2uan0PMJ)Z|eEhm_LBr>D-7M{J>jeBwu>ch7#XClpLWpp>1!?ZZFuu+*)$X3lt z-_oq~LH||Jkd+*P{Z4t1TEv2rQFNn^JHal}_Ji81Skm|bR+0O;Da(vc^_E{ttv;=m ztz)0C&i@la+AO8Y8euO3^rgl<*RT1ftH3N5nJ=e?*NnY5D#rjNQ7ju`#DV@1TMB@8 z(7mn@i}bcf2*spEcxFbp#HJmkU6plcF-a-&n^)1%DTY3?8>qXO`UgV&bUwU2E4)Lwp@>4Fxo#wz)Zr#X2OLq5C^)qmYgVtX zI!k1+dda9Zhl8sA4`)Hu$2DBoRaq7UaiqmBAtM`tajE?)h|tQEN8!3Y@<|{g=cTzr zk-J%@yBoGiAwdoeI~x-&hbt)M{yp za)5WQWiQ$gZ z$b5XAYt@E;gf^M?sF;f{`;4McSm;*lrc$SNp|xiI^|+L4#Gj^S3bOjFP78~yF#)LR}`K&V23qbhH{ zhq5aq41sow_xW%(-tng>Xxt#Ww9aM#zICFcJqNwXKqtzFDPAU@#huWg_a-Fx6bh^< z^2Q4~zSE?xroaUGS=V$D(wLF@s(--+97 
zUrTSjhWi(;%V9nR63A^a;8~WEx}vwp4e(eEE(?cq8|k2-YeJZb3=|CzMHc(09cjsv zNyL)OeN_>@iJhjnC~OQSoP@Znn^G;;AQ7{EEW9VR)kc{vBqK_&k+@W+ng=SOl2NLs z6CGYkQiE~&o;esyF%D>VH=^BRgf#shwtK`V$tC?5v9dkXhzJBFsWoAgBKa3>GD01h zgG<&I2cFlIk!p~muC=!sCLLp6XHY=fh-JE z{)B!+$rUs?nQ=CX=hIEvbUPxmx}^MwhqCSy)(SG$_#OSj;o$g72(#D^O09&#Tt-0O z7TI^uX`DsHmW28ej2`(b55~!p9JLvMgtFB-6&^o<<1{s=FjKL`DnpvETXdx!LLg?1 zV-|{;CdCYrmkSWvWYD2L+7f3_$_qI^>PJR_OrZ|9wGzC|NRZ4??M8*aty?x_o*lf6 zva5xQ2nlH5CQ2D5m~mar@?T0AER8GVTyoPvE5fi$4Oc&zVL^2PfhiaAC($ga`N4W^ zBh+p7NN7?!MhbtZZjZdb5u&l1ETdCeMoEMOs1jauONvvb$&~2%KG&OSAo#vbJdHEeh396e#leaz6zXOBLoCG$`R=T&>M?$ul?5 zOVvD_tC>6PqbUP8i5J{Q$$nSQt6on)d59OqqOGLRit->K%zfGKEVjIa+(IVZ+Eshc zVyd_~2)yc=@*zSS!Cg%Ami@LQQ6jMlWx45dLx=P72d8q;lu^?w?sSGYScb^9xFKDI zVPK~_>_gi{h59|Kzo)C~Qo#10>jtl7t#zp`8tqdM3QJd`KN!e`4e&TsU|-cm3i9CK-Etwx zO%C`bz34NTQxo#srebL#dTYhf^0qpgSnITyplTS*j+)Y?B9MHFS@G|WV9@Z3w;1lG zLV|XnAp}O`G3oK$51)o|o>!wCHVYBJBWQk(tuR)g{Hm#%6JmznGw2 zXnt{qr`b2W21BD^qA1?SA{d*q)9F;7op&iW^hx%aPh(^7U}<<4J7M7|<&5pXFY+>Z zLa)UBf=DL0^`D=^6MhK)zSlkJyY7P>-*Ug(;fL*mJ8t3_-oNpE?u0vjF}eQxlkfNN z{l7l?sI>S0io4H_U-=Qe!R|}n{|Kjl6#wl0uYXRjx8qT!-+usy@4-L2|7Y~)jbj}7 zO1uBtzo{qe_%9Q8p*!#y;h(SK%SS=`zwvi^){Z~#8gA{k_t^0Qm;ZNg|F^%P_uKJ9 z_d>Rx&#?CERT~?x-S76it-seZcHG$4@AiJX-}?Xi?*99F*~Zqtx%=JmLlOx7@cy60 zzx4h;#BXfk58|Kv`GRY(JG$HLtUSkm!G}Mjg2q3uXg<7HF>YR(yNy{y$Id|08$59p7H^{=ZD_|8_^;Z^z&GPe;(M9oBAZ)?X+0f9}^f(;FLh z{4x1f{b%!Nzx*ve@QpVA8(Vt69Y23=SL*St`0K_$;Cp!g7yeZ5x8pZ{D!w6o|34=8 zf9r4bemjn*@%=_W_Uk_-_wW6JQj#5i`=C}0zH(%0YI^OB~ z{}uQCIPb?l{z>ouP=2lcv-|D%A2{`S{ImPNUtM2Fs!%>A_zxJFzN#6hUq2BlP-_he6PguARf2`eh z{As*DZ2!hs@7wrLk9&@P>)rMHe<8X5?Q6a7?Y801ei7%{7VG~vzoz&7 z#Y;W@yLHsv_m(@_;aA=Jx9kUZwEKTO`Q4fm9+M%GJHGAS|E=HE2mB%qzl=uOxEcTD p@L1dRO8x)O{#D!md*8G->F@Ekjg6m}bN^5OmAm_T_#b#)A{_t# literal 0 HcmV?d00001 diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py new file mode 100644 index 00000000..b3ab54a9 --- /dev/null +++ 
b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py @@ -0,0 +1,86 @@ +import torch +from torch._dynamo.device_interface import register_interface_for_device + +import torch_openreg._C # type: ignore[misc] + +from . import meta # noqa: F401 +from . import extension_device_op_overrides +from .extension_device_interface import ExtensionDeviceInterface + +_initialized = False + + +class device: + r"""Context-manager that changes the selected device. + + Args: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. + """ + + def __init__(self, device): + self.idx = torch.accelerator._get_device_index(device, optional=True) + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch_openreg._C._exchangeDevice(self.idx) + + def __exit__(self, type, value, traceback): + self.idx = torch_openreg._C._set_device(self.prev_idx) + return False + + +def is_available(): + return True + + +def device_count() -> int: + return torch_openreg._C._get_device_count() + + +def current_device(): + return torch_openreg._C._get_device() + + +def set_device(device) -> None: + return torch_openreg._C._set_device(device) + +def custom_device(): + return torch.device("npu:0") + +def init(): + _lazy_init() + + +def is_initialized(): + return _initialized + + +def _lazy_init(): + global _initialized + if is_initialized(): + return + torch_openreg._C._init() + register_interface_for_device(custom_device(), ExtensionDeviceInterface) + _initialized = True + + +from .random import * # noqa: F403 + + +__all__ = [ + "device", + "device_count", + "current_device", + "set_device", + "custom_device", + "initial_seed", + "is_available", + "init", + "is_initialized", + "random", + "manual_seed", + "manual_seed_all", + "get_rng_state", + "set_rng_state", +] diff --git a/PyTorchSimDevice/extension_device_interface.py b/PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py similarity index 100% rename from 
PyTorchSimDevice/extension_device_interface.py rename to PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py similarity index 100% rename from PyTorchSimDevice/extension_device_op_overrides.py rename to PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/meta.py b/PyTorchSimDevice2/torch_openreg/openreg/meta.py new file mode 100644 index 00000000..c475e8e0 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/openreg/meta.py @@ -0,0 +1,13 @@ +import torch + + +# LITERALINCLUDE START: CUSTOM OPERATOR META +lib = torch.library.Library("openreg", "IMPL", "Meta") # noqa: TOR901 + + +@torch.library.impl(lib, "custom_abs") +def custom_abs(self): + return torch.empty_like(self) + + +# LITERALINCLUDE END: CUSTOM OPERATOR META diff --git a/PyTorchSimDevice2/torch_openreg/openreg/random.py b/PyTorchSimDevice2/torch_openreg/openreg/random.py new file mode 100644 index 00000000..6817bd79 --- /dev/null +++ b/PyTorchSimDevice2/torch_openreg/openreg/random.py @@ -0,0 +1,61 @@ +import torch + +import torch_openreg._C # type: ignore[misc] + +from . 
import _lazy_init, current_device, device_count + + +__all__ = [ + "get_rng_state", + "set_rng_state", + "manual_seed", + "manual_seed_all", + "initial_seed", +] + + +def get_rng_state(device="openreg"): + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("openreg", device) + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch_openreg._C._get_default_generator(idx) + return default_generator.get_state() + + +def set_rng_state(new_state, device="openreg"): + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("openreg", device) + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch_openreg._C._get_default_generator(idx) + default_generator.set_state(new_state) + + +def initial_seed() -> int: + _lazy_init() + idx = current_device() + default_generator = torch_openreg._C._get_default_generator(idx) + return default_generator.initial_seed() + + +def manual_seed(seed: int) -> None: + seed = int(seed) + + idx = current_device() + default_generator = torch_openreg._C._get_default_generator(idx) + default_generator.manual_seed(seed) + + +def manual_seed_all(seed: int) -> None: + seed = int(seed) + + for idx in range(device_count()): + default_generator = torch_openreg._C._get_default_generator(idx) + default_generator.manual_seed(seed) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 34ba1031..1565a26b 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -111,7 +111,7 @@ def write_header(self): inductor_ops = torch.ops.inductor assert_size_stride = torch._C._dynamo.guards.assert_size_stride alloc_from_pool = torch.ops.inductor._alloc_from_pool - reinterpret_tensor = torch.ops.aten._reinterpret_tensor + reinterpret_tensor = 
torch.ops.inductor._reinterpret_tensor custom_async_compile = CustomAsyncCompile() async_compile = AsyncCompile() os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__ diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index dfd4aab6..cdcdd2a7 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -8,9 +8,6 @@ from PyTorchSimFrontend.extension_codecache import hash_prefix from Simulator.simulator import TOGSimulator from PyTorchSimFrontend import extension_config -from PyTorchSimDevice.extension_device_interface import ExtensionDeviceInterface - -from torch._dynamo.device_interface import register_interface_for_device # Configure logger for Scheduler module logger = extension_config.setup_logger() @@ -174,52 +171,24 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: def setup_device(cls): if cls.NPU_MODULE is not None: return cls.NPU_MODULE - source_file_path = os.path.dirname(os.path.abspath(__file__)) - source_file = os.path.join( - source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_device.cpp" - ) - hook_file = os.path.join(source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_hooks.cpp") - - import torch.utils.cpp_extension - module = torch.utils.cpp_extension.load( - name="npu", - sources=[ - str(source_file), - str(hook_file), - ], - extra_cflags=["-g"], - verbose=True, - ) - torch.utils.rename_privateuse1_backend("npu") - torch._register_device_module("npu", module) - from torch._inductor.codegen.common import ( - get_scheduling_for_device, - get_wrapper_codegen_for_device, - register_backend_for_device, - ) - from PyTorchSimFrontend.mlir.mlir_codegen_backend import ( - ExtensionWrapperCodegen, - ) - from PyTorchSimFrontend.mlir.mlir_scheduling import ( - MLIRScheduling - ) + try: + from torch._inductor.codegen.common import register_backend_for_device + from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen + from 
PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling + except ImportError as e: + logger.error(f"Failed to import torch_openreg: {e}") + logger.error("Please ensure PyTorchSimDevice2 is installed: pip install -e PyTorchSimDevice2") + raise register_backend_for_device( "npu", lambda scheduling: MLIRScheduling(scheduling), ExtensionWrapperCodegen ) - import PyTorchSimDevice.extension_device_op_overrides - assert( - get_wrapper_codegen_for_device("npu") - == ExtensionWrapperCodegen - ) - cls.NPU_MODULE = module - sys.modules['torch.npu'] = module - register_interface_for_device(module.custom_device(), ExtensionDeviceInterface) - return module + cls.NPU_MODULE = torch.npu + return cls.NPU_MODULE def submit(self, batched_req, partition_idx) -> List[RequestReturn]: # FIXME. Construct SchedulerDNNModel From 468f41487a13f5438e3a3dd9ecb2f0a639ca0604 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 22 Jan 2026 14:04:12 +0000 Subject: [PATCH 085/194] [Device] Use torch.device(npu) --- PyTorchSimDevice2/torch_openreg/__init__.py | 11 ++++++++++- experiments/gemm.py | 3 --- scripts/ILS_experiment/test_matmul.py | 6 ++---- scripts/chiplet_prep.py | 4 +--- tests/Diffusion/test_diffusion.py | 4 +--- tests/Fusion/test_addmm_residual.py | 4 +--- tests/Fusion/test_attention_fusion.py | 4 +--- tests/Fusion/test_bmm_reduction.py | 4 +--- tests/Fusion/test_conv_fusion.py | 4 +--- tests/Fusion/test_matmul_activation.py | 4 +--- tests/Fusion/test_matmul_reduction.py | 4 +--- tests/Fusion/test_matmul_scalar.py | 4 +--- tests/Fusion/test_prologue_fusion.py | 4 +--- tests/Fusion/test_transformer_fusion.py | 4 +--- tests/Llama/test_llama.py | 4 +--- tests/Mixtral_8x7B/test_attention.py | 4 +--- tests/test_activation.py | 4 +--- tests/test_add.py | 4 +--- tests/test_batchnorm.py | 4 +--- tests/test_bmm.py | 4 +--- tests/test_cnn.py | 4 +--- tests/test_conv2d.py | 4 +--- tests/test_exponent.py | 4 +--- tests/test_gqa.py | 4 +--- tests/test_indirect_access.py | 4 +--- 
tests/test_layernorm.py | 4 +--- tests/test_matmul.py | 4 +--- tests/test_mlp.py | 4 +--- tests/test_pool.py | 4 +--- tests/test_reduce.py | 4 +--- tests/test_resnet.py | 4 +--- tests/test_single_perceptron.py | 4 +--- tests/test_softmax.py | 4 +--- tests/test_sparsity.py | 4 +--- tests/test_stonne.py | 4 +--- tests/test_transcendental.py | 4 +--- tests/test_transformer.py | 4 +--- tests/test_transpose2D.py | 4 +--- tests/test_transpose3D.py | 4 +--- tests/test_vectorops.py | 4 +--- tests/test_view3D_2D.py | 4 +--- tests/test_vit.py | 4 +--- 42 files changed, 51 insertions(+), 125 deletions(-) diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/__init__.py index a69151e9..5e404f7d 100644 --- a/PyTorchSimDevice2/torch_openreg/__init__.py +++ b/PyTorchSimDevice2/torch_openreg/__init__.py @@ -1,4 +1,5 @@ import sys +import os import torch @@ -11,11 +12,19 @@ import torch_openreg._C # type: ignore[misc] import torch_openreg.openreg - torch.utils.rename_privateuse1_backend("npu") torch._register_device_module("npu", torch_openreg.openreg) torch.utils.generate_methods_for_privateuse1_backend(for_storage=True) +sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) +from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen +from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling +torch._inductor.codegen.common.register_backend_for_device( + "npu", + lambda scheduling: MLIRScheduling(scheduling), + ExtensionWrapperCodegen +) + torch_openreg.openreg.init() sys.modules['torch.npu'] = torch_openreg.openreg diff --git a/experiments/gemm.py b/experiments/gemm.py index 6b6ece4d..0e1a15e4 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -48,7 +48,4 @@ def custom_matmul(a, b): if 'pytorchsim_functional_mode' in os.environ: del os.environ['pytorchsim_functional_mode'] - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = 
module.custom_device() run_matmul(size[0], size[1], size[2], config) diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 667dfc66..1314e483 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -60,7 +60,5 @@ def custom_matmul(bias, a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() - test_matmul(device, *shape) + device = torch.device("npu:0") + test_matmul(device, *shape) \ No newline at end of file diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index 213eb85b..e2437904 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -64,9 +64,7 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") parser = argparse.ArgumentParser(description='Process folder argument.') parser.add_argument('size', type=int, help='Folder value', default=256) args = parser.parse_args() diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py index 082ed865..85eaba9f 100644 --- a/tests/Diffusion/test_diffusion.py +++ b/tests/Diffusion/test_diffusion.py @@ -637,9 +637,7 @@ def test_timesteps( args = parser.parse_args() sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim")) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_upsample2d(device) #test_groupnorm(device) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index ef753a67..d517796e 
100644 --- a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -43,9 +43,7 @@ def addmm_residual(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) test_addmm_residual(device, 512, 512, 512) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 123376d1..045c109f 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -75,9 +75,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) # test_MHA(device, num_heads=12, embed_dim=768) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 4f4d3ad6..7a3060de 100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -42,9 +42,7 @@ def bmm(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) test_bmm_reduce(device, 4, 256) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index 694f3bb9..6f3d5984 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -101,9 +101,7 @@ def custom_conv_bn_relu(a, b, 
bias, c, d, e, f): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") # Vanila test test_conv_residual(device, batch_size=3, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1) diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py index 2f1d014f..94e5c4ad 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -73,9 +73,7 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") test_matmul_activation(device, batch_size=42, input_size=42, output_size=42, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index df8cf969..fdd72c00 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -89,9 +89,7 @@ def matmul_fused(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) test_matmul_add_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 0815bb90..96b49a08 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ 
b/tests/Fusion/test_matmul_scalar.py @@ -39,7 +39,5 @@ def matmul_fused(a, b, c): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_matmul_scalar(device) diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index b27312a9..850f386a 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -88,9 +88,7 @@ def bmm(a, b, c, d): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_elem_broadcast_fusion(device) test_elem_fusion(device) test_elem_bmm_input_fusion(device, batch_size=4, m=512, n=512, k=64) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index b1cceb2c..f85c6158 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -203,9 +203,7 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_MHA(device) test_EncoderBlock(device) # test_EncoderBlock_validation() diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py index 889e5fa8..5e87b8e7 100644 --- a/tests/Llama/test_llama.py +++ b/tests/Llama/test_llama.py @@ -369,9 +369,7 @@ def run_llama_model_test( args = parser.parse_args() sys.path.append(os.environ.get("PYTORCHSIM_ROOT_PATH", "/workspace/PyTorchSim")) - from Scheduler.scheduler import PyTorchSimRunner 
- module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_triu(device, size=(32, 128), diagonal=1) torch.compiler.is_compiling = lambda: True # FIXME. How to fix this? #run_rmsnorm_test(device) diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index 58955928..c48ef7d7 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -163,9 +163,7 @@ def test_rmsnorm(device, seq=32): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_rmsnorm(device, seq=1) #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) test_decode(device, 32, 3) diff --git a/tests/test_activation.py b/tests/test_activation.py index 49a9467c..20cfeed4 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -89,9 +89,7 @@ def test_SwiGLU(device, size=(128, 128)): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_ReLU(device, (47, 10)) test_ReLU(device, (128, 128)) test_ReLU(device, (4071, 429)) diff --git a/tests/test_add.py b/tests/test_add.py index 118632d5..a9d37d5e 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -58,9 +58,7 @@ def vectoradd(a, b): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_vectoradd(device, (1, 1)) test_vectoradd(device, (47, 10)) test_vectoradd(device, (128, 128)) diff --git a/tests/test_batchnorm.py 
b/tests/test_batchnorm.py index 251805f5..19b9f29f 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -37,9 +37,7 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) test_BatchNorm(device, size=(1, 8, 4, 4)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index d90410db..65e5e64b 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -46,9 +46,7 @@ def bmm(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_BMM(device) test_BMM(device, 2, 256, 128, 256) test_BMM(device, 2, 128, 256, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index 54225747..ecc452fe 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -53,7 +53,5 @@ def test_CNN(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_CNN(device) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 97e5cdea..4d989a0f 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -40,9 +40,7 @@ def custom_conv2d(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") torch._dynamo.config.cache_size_limit = 64 with 
torch.no_grad(): test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0) diff --git a/tests/test_exponent.py b/tests/test_exponent.py index e60f8407..a3a706a9 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -31,7 +31,5 @@ def exponent(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_exponent(device, size=(32, 32)) diff --git a/tests/test_gqa.py b/tests/test_gqa.py index c5f2f6f6..ba262fa6 100644 --- a/tests/test_gqa.py +++ b/tests/test_gqa.py @@ -301,9 +301,7 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_repeat_interleave_compilation( device=device, diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index d103ee1b..dbb5f2d6 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -83,9 +83,7 @@ def vectoradd(a, idx, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_scatter_full(device) test_scatter_full(device, size=(2048, 2048)) test_scatter_add(device) diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index a2e842d0..5c15ad12 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -41,9 +41,7 @@ def test_LayerNorm(device, size=(64, 64)): args = parser.parse_args() shape = tuple(map(int, 
args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") with torch.no_grad(): #test_LayerNorm(device) test_LayerNorm(device, shape) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index cd30bd30..0e04738d 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -94,9 +94,7 @@ def custom_linear(a, b, bias): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) test_matmul(device, 256, 256, 256) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 423d6e8e..b6b70c02 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -109,9 +109,7 @@ def test_optimizer(device): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_mlp(device) test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256) test_mlp_inf(device, batch_size=8, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index f5505dba..37248164 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -47,9 +47,7 @@ def avgpool(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) test_avgpool(device, b=1, c=512, h=7, w=7) diff 
--git a/tests/test_reduce.py b/tests/test_reduce.py index 4781112d..93caba7f 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -47,9 +47,7 @@ def reduce_sum(a, dim, keepdim): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_reduce_sum(device, (29, 47), 1, keepdim=True) test_reduce_sum(device, (17, 68), 0, keepdim=True) test_reduce_sum(device, (327, 447), 1, keepdim=True) diff --git a/tests/test_resnet.py b/tests/test_resnet.py index c83f13ba..2459cd58 100644 --- a/tests/test_resnet.py +++ b/tests/test_resnet.py @@ -49,7 +49,5 @@ def test_resnet(device, batch=1, model_type='resnet18'): args = args.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_resnet(device, model_type=args.model_type) diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index beab1c54..7475e1fe 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -82,7 +82,5 @@ def weight_update(a, b, lr): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_single_perceptron(device) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index 005c3ed2..82218518 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -67,9 +67,7 @@ def forward(self, x): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = 
PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_softmax(device, size=(64, 128)) test_softmax(device, size=(64, 128), dim=0) test_softmax(device, size=(256, 128)) diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py index a2493673..eaa7c63c 100644 --- a/tests/test_sparsity.py +++ b/tests/test_sparsity.py @@ -96,9 +96,7 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si ) args = parser.parse_args() - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_dec_inf(device, sparsity=args.sparsity, block=args.block) test_mlp_inf(device, batch_size=32, input_size=784, hidden_size=512, output_size=256, sparsity=args.sparsity, block=args.block) diff --git a/tests/test_stonne.py b/tests/test_stonne.py index 04ad05a8..ac26c273 100644 --- a/tests/test_stonne.py +++ b/tests/test_stonne.py @@ -54,7 +54,5 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa args = parser.parse_args() sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity) \ No newline at end of file diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index 38c2f4f6..b930a3f5 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -73,9 +73,7 @@ def cos(a): args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_tanh(device) test_exp(device) test_erf(device) diff --git 
a/tests/test_transformer.py b/tests/test_transformer.py index a3ac55d7..bfc31233 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -119,9 +119,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) # test_MHA(device, num_heads=12, embed_dim=768) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index af5aacf7..60a19ed8 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -46,9 +46,7 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) test_Transpose2D(device, [640, 256]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index d6c1092d..67d4d88a 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -61,9 +61,7 @@ def transpose(a, b): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) test_Transpose3D_2(device, [62, 34, 44]) diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py index ed895171..ede70e0e 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -6,9 +6,7 @@ import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") # Target shape seq_list = [1,128,512,2048,8192] diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index 148fe8fa..ae8a67c9 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -44,9 +44,7 @@ def view2D_3D(a): import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) test_view2D_3D(device, size=(512, 1024), h=16, d_k=64) diff --git a/tests/test_vit.py b/tests/test_vit.py index aeb4f148..6149166d 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -202,9 +202,7 @@ def test_encoder_block_with_class_token( shape = tuple(map(int, args.shape.strip('()').split(','))) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") #test_multihead_attention(device) #test_encoder_block(device, seq_len=197) #test_encoder_block_with_class_token(device, seq_len=196) From a62540913e622ab7577c682b54d99a261fd1c5ee Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 23 Jan 2026 06:17:46 +0000 Subject: [PATCH 086/194] [SDPA] Use math as a default --- PyTorchSimDevice2/csrc/aten/native/Extra.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp index 129ad621..711d114c 100644 --- a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp +++ b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp @@ -19,7 +19,7 @@ int64_t _fused_sdp_choice( 
bool is_causal, std::optional scale, bool enable_gqa) { - auto backend = sdp::SDPBackend::overrideable; + auto backend = sdp::SDPBackend::math; return static_cast(backend); } From a053314da8b29abab746ac9ac66525eef6b2c2fd Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 23 Jan 2026 10:46:54 +0000 Subject: [PATCH 087/194] [AMP] Add amp interface for OpenReg style device --- PyTorchSimDevice2/csrc/amp/OpenRegAmp.h | 15 +++++ PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp | 28 ++++++++ .../csrc/aten/OpenRegMinimal.cpp | 21 ++++++ .../torch_openreg/csrc/Module.cpp | 67 +++++++++++++++++++ .../torch_openreg/openreg/__init__.py | 7 +- .../torch_openreg/openreg/amp.py | 33 +++++++++ 6 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 PyTorchSimDevice2/csrc/amp/OpenRegAmp.h create mode 100644 PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/amp.py diff --git a/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h b/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h new file mode 100644 index 00000000..2f81e9d2 --- /dev/null +++ b/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +#include + +namespace c10::openreg { + +OPENREG_EXPORT bool is_amp_enabled(); +OPENREG_EXPORT void set_amp_enabled(bool flag); +OPENREG_EXPORT at::ScalarType get_amp_dtype(); +OPENREG_EXPORT void set_amp_dtype(at::ScalarType dtype); + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp b/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp new file mode 100644 index 00000000..fd650026 --- /dev/null +++ b/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp @@ -0,0 +1,28 @@ +#include +#include +#include "OpenRegAmp.h" + +namespace { + bool g_amp_enabled = false; + at::ScalarType g_amp_dtype = at::kFloat; +} + +namespace c10::openreg { + +OPENREG_EXPORT bool is_amp_enabled() { + return g_amp_enabled; +} + +OPENREG_EXPORT void set_amp_enabled(bool flag) { + g_amp_enabled = flag; +} + 
+OPENREG_EXPORT at::ScalarType get_amp_dtype() { + return g_amp_dtype; +} + +OPENREG_EXPORT void set_amp_dtype(at::ScalarType dtype) { + g_amp_dtype = dtype; +} + +} // namespace c10::openreg diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp index d54ae552..39f019c5 100644 --- a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp +++ b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include namespace at::openreg { @@ -105,6 +109,23 @@ at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) { void wrapper_cpu_fallback( const c10::OperatorHandle& op, torch::jit::Stack* stack) { + const auto& op_name = op.schema().operator_name(); + + // Generate timestamp in format [YYYY-MM-DD HH:MM:SS.mmm] + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + auto ms = std::chrono::duration_cast( + now.time_since_epoch()) % 1000; + + std::tm tm_buf; + localtime_r(&time_t, &tm_buf); + + std::ostringstream oss; + oss << std::put_time(&tm_buf, "%Y-%m-%d %H:%M:%S"); + oss << '.' 
<< std::setfill('0') << std::setw(3) << ms.count(); + + std::cerr << "[" << oss.str() << "] [INFO] [PyTorchSimDevice] [Eager Mode] Operator: " << op_name << std::endl; + at::native::openreg::cpu_fallback(op, stack); } // LITERALINCLUDE END: FALLBACK WRAPPER diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp index 38c45633..052a9ed4 100644 --- a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp +++ b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp @@ -5,8 +5,11 @@ #include #include #include +#include +#include #include +#include static PyObject* _initExtension(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -73,6 +76,65 @@ PyObject* _getDeviceCount(PyObject* self, PyObject* noargs) { END_HANDLE_TH_ERRORS } +PyObject* _isAutocastEnabled(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + if (c10::openreg::is_amp_enabled()) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + +PyObject* _setAutocastEnabled(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + PyBool_Check(arg), + "set_autocast_enabled expects a bool, but got ", + THPUtils_typename(arg)); + c10::openreg::set_amp_enabled(arg == Py_True); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _getAutocastDtype(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + THPDtype* dtype_obj = torch::getTHPDtype(c10::openreg::get_amp_dtype()); + Py_INCREF(dtype_obj); + return reinterpret_cast(dtype_obj); + END_HANDLE_TH_ERRORS +} + +PyObject* _setAutocastDtype(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPDtype_Check(arg), + "set_autocast_dtype expects a dtype, but got ", + THPUtils_typename(arg)); + THPDtype* dtype_obj = reinterpret_cast(arg); + at::ScalarType dtype = dtype_obj->scalar_type; + c10::openreg::set_amp_dtype(dtype); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _getAmpSupportedDtype(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + 
PyObject* torch_mod = PyImport_ImportModule("torch"); + TORCH_CHECK(torch_mod != nullptr, "Failed to import torch module"); + + PyObject* float16 = PyObject_GetAttrString(torch_mod, "float16"); + PyObject* float32 = PyObject_GetAttrString(torch_mod, "float32"); + + PyObject* lst = PyList_New(1); + PyList_SetItem(lst, 0, float32); + //PyList_SetItem(lst, 1, float32); + + Py_DECREF(torch_mod); + return lst; + END_HANDLE_TH_ERRORS +} + static PyMethodDef methods[] = { {"_init", _initExtension, METH_NOARGS, nullptr}, {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr}, @@ -80,6 +142,11 @@ static PyMethodDef methods[] = { {"_set_device", _setDevice, METH_O, nullptr}, {"_exchangeDevice", _exchangeDevice, METH_O, nullptr}, {"_get_device_count", _getDeviceCount, METH_NOARGS, nullptr}, + {"is_autocast_enabled", _isAutocastEnabled, METH_NOARGS, nullptr}, + {"set_autocast_enabled", _setAutocastEnabled, METH_O, nullptr}, + {"get_autocast_dtype", _getAutocastDtype, METH_NOARGS, nullptr}, + {"set_autocast_dtype", _setAutocastDtype, METH_O, nullptr}, + {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; /* diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py index b3ab54a9..81c2fc60 100644 --- a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py @@ -66,7 +66,7 @@ def _lazy_init(): from .random import * # noqa: F403 - +from .amp import * __all__ = [ "device", @@ -83,4 +83,9 @@ def _lazy_init(): "manual_seed_all", "get_rng_state", "set_rng_state", + "is_autocast_enabled", + "set_autocast_enabled", + "get_autocast_dtype", + "set_autocast_dtype", + "get_amp_supported_dtype", ] diff --git a/PyTorchSimDevice2/torch_openreg/openreg/amp.py b/PyTorchSimDevice2/torch_openreg/openreg/amp.py new file mode 100644 index 00000000..0a9dfdf0 --- /dev/null +++ 
b/PyTorchSimDevice2/torch_openreg/openreg/amp.py @@ -0,0 +1,33 @@ +import torch + +import torch_openreg._C # type: ignore[misc] + +from . import _lazy_init + + +__all__ = [ + "is_autocast_enabled", + "set_autocast_enabled", + "get_autocast_dtype", + "set_autocast_dtype", + "get_amp_supported_dtype", +] + +def is_autocast_enabled(): + return torch_openreg._C.is_autocast_enabled() + + +def set_autocast_enabled(enabled: bool) -> None: + torch_openreg._C.set_autocast_enabled(enabled) + + +def get_autocast_dtype(): + return torch_openreg._C.get_autocast_dtype() + + +def set_autocast_dtype(dtype) -> None: + torch_openreg._C.set_autocast_dtype(dtype) + + +def get_amp_supported_dtype(): + return torch_openreg._C.get_amp_supported_dtype() \ No newline at end of file From eda34ffb26692a1b1950759590ebe22900759140 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 23 Jan 2026 10:52:27 +0000 Subject: [PATCH 088/194] [Tests] Cleanup unnecessary code in tests --- scripts/ILS_experiment/test_matmul.py | 4 ---- scripts/chiplet_prep.py | 7 ------- tests/Fusion/test_addmm_residual.py | 9 +-------- tests/Fusion/test_attention_fusion.py | 10 +--------- tests/Fusion/test_bmm_reduction.py | 6 ------ tests/Fusion/test_conv_fusion.py | 6 ------ tests/Fusion/test_matmul_activation.py | 6 ------ tests/Fusion/test_matmul_reduction.py | 4 ---- tests/Fusion/test_matmul_scalar.py | 4 ---- tests/Fusion/test_matmul_vector.py | 10 +--------- tests/Fusion/test_prologue_fusion.py | 6 ------ tests/Fusion/test_transformer_fusion.py | 6 ------ tests/Mixtral_8x7B/test_attention.py | 6 ------ tests/test_activation.py | 3 --- tests/test_add.py | 3 --- tests/test_batchnorm.py | 6 ------ tests/test_bmm.py | 6 ------ tests/test_cnn.py | 6 ------ tests/test_conv2d.py | 5 ----- tests/test_eager.py | 8 ++++++++ tests/test_exponent.py | 6 ------ tests/test_indirect_access.py | 6 ------ tests/test_layernorm.py | 3 --- tests/test_matmul.py | 6 ------ tests/test_mlp.py | 6 ------ tests/test_pool.py | 6 ------ 
tests/test_reduce.py | 3 --- tests/test_single_perceptron.py | 6 ------ tests/test_softmax.py | 3 --- tests/test_topk.py | 3 --- tests/test_transcendental.py | 3 --- tests/test_transformer.py | 6 ------ tests/test_transpose2D.py | 6 ------ tests/test_transpose3D.py | 6 ------ tests/test_vectorops.py | 5 ----- tests/test_view3D_2D.py | 6 ------ 36 files changed, 11 insertions(+), 190 deletions(-) create mode 100644 tests/test_eager.py diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py index 1314e483..b0bc474c 100644 --- a/scripts/ILS_experiment/test_matmul.py +++ b/scripts/ILS_experiment/test_matmul.py @@ -52,13 +52,9 @@ def custom_matmul(bias, a, b): test_result("Addmm Forward", res, y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run matmul with given shape") parser.add_argument('--shape', type=str, default="(512,512,512)") args = parser.parse_args() shape = tuple(map(int, args.shape.strip('()').split(','))) - device = torch.device("npu:0") test_matmul(device, *shape) \ No newline at end of file diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py index e2437904..2266d74c 100644 --- a/scripts/chiplet_prep.py +++ b/scripts/chiplet_prep.py @@ -1,10 +1,7 @@ import os import yaml -import shutil import argparse import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -60,10 +57,6 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None): print(f"Modified file saved to {output_file}") if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") parser = argparse.ArgumentParser(description='Process folder argument.') 
parser.add_argument('size', type=int, help='Folder value', default=256) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index d517796e..917628e3 100644 --- a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -38,12 +36,7 @@ def addmm_residual(a, b, c, d): y = addmm_residual(b2, x2, w2, r2) test_result("Addmm + Residual Fusion Forward", res, y) -if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - - device = torch.device("npu:0") +if __name__ == "__main__": device = torch.device("npu:0") test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) test_addmm_residual(device, 512, 512, 512) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index 045c109f..ebbd3037 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -1,8 +1,5 @@ -import math import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -70,12 +67,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): test_result("MHA Forward", res, cpu_res) -if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - - device = torch.device("npu:0") +if __name__ == "__main__": device = torch.device("npu:0") test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) # test_MHA(device, num_heads=12, embed_dim=768) diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py index 7a3060de..45e31dab 
100644 --- a/tests/Fusion/test_bmm_reduction.py +++ b/tests/Fusion/test_bmm_reduction.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -38,10 +36,6 @@ def bmm(a, b): test_result("BMM Reduction Fusion reduction", res[1], y[1]) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") #test_bmm_reduce(device) test_bmm_reduce(device, 12, 512) diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py index 6f3d5984..bc200ff2 100644 --- a/tests/Fusion/test_conv_fusion.py +++ b/tests/Fusion/test_conv_fusion.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): message = f"|{name} Test Passed|" @@ -97,10 +95,6 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f): print("Max diff > ", torch.max(torch.abs(res.cpu() - out))) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") # Vanila test diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py index 94e5c4ad..232ec98d 100644 --- a/tests/Fusion/test_matmul_activation.py +++ b/tests/Fusion/test_matmul_activation.py @@ -1,7 +1,5 @@ import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -69,10 +67,6 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8, print("CPU output > ", cpu_y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = 
torch.device("npu:0") test_matmul_activation(device) test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid") diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py index fdd72c00..9b09214a 100644 --- a/tests/Fusion/test_matmul_reduction.py +++ b/tests/Fusion/test_matmul_reduction.py @@ -85,10 +85,6 @@ def matmul_fused(a, b, c, d): test_result("Matmul+residual+var_mean Fusion reduction", res[2], y[2]) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_matmul_reduce(device, 3072, 512, 768) test_matmul_var_mean(device) diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py index 96b49a08..d5a159ed 100644 --- a/tests/Fusion/test_matmul_scalar.py +++ b/tests/Fusion/test_matmul_scalar.py @@ -35,9 +35,5 @@ def matmul_fused(a, b, c): test_result("Matmul Scalar Fusion Forward", res, y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_matmul_scalar(device) diff --git a/tests/Fusion/test_matmul_vector.py b/tests/Fusion/test_matmul_vector.py index bf1bd513..f87f9432 100644 --- a/tests/Fusion/test_matmul_vector.py +++ b/tests/Fusion/test_matmul_vector.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -41,12 +39,6 @@ def matmul_fused(a, b, c, d): test_result("Matmul Vector Fusion Forward", res, y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() - device = module.custom_device() + device = 
torch.device("npu:0") test_matmul_vector(device, size=[253, 123, 47], dim=0) test_matmul_vector(device, size=[253, 123, 47], dim=1) \ No newline at end of file diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py index 850f386a..ecfd5fbf 100644 --- a/tests/Fusion/test_prologue_fusion.py +++ b/tests/Fusion/test_prologue_fusion.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -84,10 +82,6 @@ def bmm(a, b, c, d): test_result("BMM Element-wise Fusion Forward", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_elem_broadcast_fusion(device) test_elem_fusion(device) diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py index f85c6158..1581cd97 100644 --- a/tests/Fusion/test_transformer_fusion.py +++ b/tests/Fusion/test_transformer_fusion.py @@ -1,8 +1,6 @@ import math import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -199,10 +197,6 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512): test_result("Encoder Block Validation", res, origin_res) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") #test_MHA(device) test_EncoderBlock(device) diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py index c48ef7d7..57760370 100644 --- a/tests/Mixtral_8x7B/test_attention.py +++ b/tests/Mixtral_8x7B/test_attention.py @@ -1,7 +1,5 @@ import copy import torch -import torch._dynamo -import 
torch.utils.cpp_extension from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, RMSNorm, precompute_freqs_cis, sample def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): @@ -159,10 +157,6 @@ def test_rmsnorm(device, seq=32): test_result("RMSNorm", res, cpu_res) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") #test_rmsnorm(device, seq=1) #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2) diff --git a/tests/test_activation.py b/tests/test_activation.py index 20cfeed4..dacc102e 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -79,10 +79,7 @@ def test_SwiGLU(device, size=(128, 128)): test_result("SwiGLU", y, cpu_y) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, default="(512,768)") diff --git a/tests/test_add.py b/tests/test_add.py index a9d37d5e..7a0d23d9 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -48,10 +48,7 @@ def vectoradd(a, b): test_result("VectorTensorAdd", res, out) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, default="(512,768)") diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py index 19b9f29f..065c0870 100644 --- a/tests/test_batchnorm.py +++ b/tests/test_batchnorm.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ 
-33,10 +31,6 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)): test_result("BatchNorm Forward", y, cpu_y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_BatchNorm(device) test_BatchNorm(device, size=(1,64, 32, 32)) diff --git a/tests/test_bmm.py b/tests/test_bmm.py index 65e5e64b..02a6460e 100644 --- a/tests/test_bmm.py +++ b/tests/test_bmm.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -42,10 +40,6 @@ def bmm(a, b, bias): test_result("BMM Forward", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_BMM(device) test_BMM(device, 2, 256, 128, 256) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index ecc452fe..e6b01bbd 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -49,9 +47,5 @@ def test_CNN(device): print("Max diff > ", torch.max(torch.abs(y.cpu() - cpu_y))) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_CNN(device) diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 4d989a0f..533a04db 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -1,6 +1,5 @@ import torch import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -36,10 +35,6 @@ def custom_conv2d(a, b, bias): 
print("Max diff > ", torch.max(torch.abs(res.cpu() - out))) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") torch._dynamo.config.cache_size_limit = 64 with torch.no_grad(): diff --git a/tests/test_eager.py b/tests/test_eager.py new file mode 100644 index 00000000..7a2df6e2 --- /dev/null +++ b/tests/test_eager.py @@ -0,0 +1,8 @@ +import torch + +if __name__ == "__main__": + device = torch.device("npu:0") + x = torch.zeros(10, 10).to(device) + y = torch.zeros(10, 10).to(device) + z = x + y + print(z.cpu()) \ No newline at end of file diff --git a/tests/test_exponent.py b/tests/test_exponent.py index a3a706a9..20f0a143 100644 --- a/tests/test_exponent.py +++ b/tests/test_exponent.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -27,9 +25,5 @@ def exponent(a): test_result("exponent", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_exponent(device, size=(32, 32)) diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py index dbb5f2d6..95167d1e 100644 --- a/tests/test_indirect_access.py +++ b/tests/test_indirect_access.py @@ -1,7 +1,5 @@ import torch import copy -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -79,10 +77,6 @@ def vectoradd(a, idx, b): test_result("Indirect VectorAdd", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_scatter_full(device) test_scatter_full(device, 
size=(2048, 2048)) diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py index 5c15ad12..3db27dc5 100644 --- a/tests/test_layernorm.py +++ b/tests/test_layernorm.py @@ -31,10 +31,7 @@ def test_LayerNorm(device, size=(64, 64)): test_result("LayerNorm Forward", y, cpu_y) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)") diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 0e04738d..a5bdf422 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -90,10 +88,6 @@ def custom_linear(a, b, bias): test_result("Linear Forward", res, y) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_matmul(device, 32, 32, 32) test_matmul(device, 128, 128, 128) diff --git a/tests/test_mlp.py b/tests/test_mlp.py index b6b70c02..e3f79561 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -1,7 +1,5 @@ import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -105,10 +103,6 @@ def test_optimizer(device): test_result("Optimizer", model.linear1.weight, cpu_model.linear1.weight) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_mlp(device) test_mlp_inf(device, 
batch_size=1, input_size=256, hidden_size=512, output_size=256) diff --git a/tests/test_pool.py b/tests/test_pool.py index 37248164..2848e04b 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -43,10 +41,6 @@ def avgpool(a): test_result("Avgpool Forward", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") #test_maxpool(device, b=1, c=8, h=16, w=16) #test_maxpool(device, b=1, c=8, h=112, w=112) diff --git a/tests/test_reduce.py b/tests/test_reduce.py index 93caba7f..07f8fef2 100644 --- a/tests/test_reduce.py +++ b/tests/test_reduce.py @@ -37,10 +37,7 @@ def reduce_sum(a, dim, keepdim): test_result("ReduceMax", res, out) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, default="(128,768)") diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index 7475e1fe..7d3401a3 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -1,7 +1,5 @@ import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -78,9 +76,5 @@ def weight_update(a, b, lr): # plt.savefig('result.png') if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_single_perceptron(device) diff --git a/tests/test_softmax.py 
b/tests/test_softmax.py index 82218518..2dca97b7 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -57,10 +57,7 @@ def forward(self, x): test_result("Softmax", y, cpu_y) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)") diff --git a/tests/test_topk.py b/tests/test_topk.py index 0d5c08ec..c8565310 100644 --- a/tests/test_topk.py +++ b/tests/test_topk.py @@ -38,10 +38,7 @@ def topk_fn(a): test_result("TopK/indices", res_indices, ref_indices) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, default="(512,768)") diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py index b930a3f5..34546539 100644 --- a/tests/test_transcendental.py +++ b/tests/test_transcendental.py @@ -63,10 +63,7 @@ def cos(a): test_result("Cos", res, out) if __name__ == "__main__": - import os - import sys import argparse - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") parser.add_argument('--shape', type=str, default="(512,768)") diff --git a/tests/test_transformer.py b/tests/test_transformer.py index bfc31233..2b7f308c 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -1,8 +1,6 @@ import math import copy import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): 
@@ -115,10 +113,6 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): test_result("MHA Forward", res, cpu_res) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_EncoderBlock(device) # test_Attention(device, head=16, seq=512, d_k=64) diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py index 60a19ed8..4e9807ce 100644 --- a/tests/test_transpose2D.py +++ b/tests/test_transpose2D.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -42,10 +40,6 @@ def transpose(a, b): test_result("Transpose2 Forward", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_Transpose2D(device, [64, 156]) test_Transpose2D_2(device, [16, 64]) diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py index 67d4d88a..e4d4e952 100644 --- a/tests/test_transpose3D.py +++ b/tests/test_transpose3D.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -57,10 +55,6 @@ def transpose(a, b): test_result("Transpose 3D Forward", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_Transpose3D_1(device, [62, 34, 44]) test_Transpose3D_1(device, [62, 134, 144]) diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py index ede70e0e..90e9c0f5 100644 --- a/tests/test_vectorops.py +++ b/tests/test_vectorops.py @@ -1,11 +1,6 @@ import torch -import torch._dynamo -import 
torch.utils.cpp_extension if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) device = torch.device("npu:0") # Target shape diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py index ae8a67c9..cc7b5e41 100644 --- a/tests/test_view3D_2D.py +++ b/tests/test_view3D_2D.py @@ -1,6 +1,4 @@ import torch -import torch._dynamo -import torch.utils.cpp_extension def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): @@ -40,10 +38,6 @@ def view2D_3D(a): test_result("view 2D->3D", res, out) if __name__ == "__main__": - import os - import sys - sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) - device = torch.device("npu:0") test_view3D_2D(device) test_view3D_2D(device, [12, 512, 64]) From 3f8b866ff6885f56cdebacaabac501e4ecc962cd Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 23 Jan 2026 10:54:08 +0000 Subject: [PATCH 089/194] [Cleanup] Remove built libraries --- .../torch_openreg/lib/libopenreg.so | Bin 59728 -> 0 bytes .../torch_openreg/lib/libtorch_bindings.so | Bin 166144 -> 0 bytes .../torch_openreg/lib/libtorch_openreg.so | Bin 569736 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libopenreg.so delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so diff --git a/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so b/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so deleted file mode 100644 index 272fb567b8daf1c45b8dc0f7b3a557257a8b68c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 59728 zcmeIb31C~rwLg5lC~+1ng{7D+2yog2h_SL^0RbzvlB+~cOq@UP6ezTWmeSC&C?vsQHvt}@Kr#Cj1`g%ybl>o_ zn=zv(U4%rJ;Ik07tlsY!R0-WbJhc!4E}53_tU^4}XHb;qrdEL`)7*NB9Fd%~T-1{( 
zwL#ZteCR%3{|JXJi7(TJHzUn-pUa=jTd5K*EBmV`lFxckZ{R9kkefD(e#=z0cRK1x z5et%En}}gh7K`@s^{-FJNv5PN)b$omF`JpC{N|LEWdgC&Npvw z4HO=?_(S#V?_L%;*p4D)5;?<&gx=Ian-_6S1lf> zElAxqruG=ZWfhUqbrMgMlaW^A;}H2;k)Rr4&0U(g?VZZe zTlaWIpS`B~2y5@c8@wl5k1Ow5Kk=3|g}=S(wcpIFEWG3W3g_!rt$)4hvKy99yLilL zzk2o3BgXA_+;bm4IPQ(xjw#!EK*_}oe?IK42hJ+|!eVcGv-72^JRSFab(nJA`n&Ia zH#YBsGY(vI_1uql4-LGa4Hh17c;OpQK0N!`tIsd7x79?iT>Of?>HXQie*E*!r)U23 zH=k5|`q(SC9b>uspkPUD;Ynf zN=FJ}nSc+DrFIWi26Nz*fbS1FgAtC{Cg@)+`B}~3djvd)ai@0s zF5&QV1pIFpF9yGm!!O{8LY^+s57DBDYrT-ek%E4GD~De!;1_}(;a7@<7|pTkaKYyQ zDH1NTX!kOKUoG&9gkQNsz^$}sq2suc9CnV|2=lE|yzI6RUz%4!4qW;qi<2HVth{!@O45ysq;DDaly}tkPpeNehr703;gTExLds(U={FN z1^sS8e};fh%%cy}gq#Qb9C5qACx3?cPYFk9qJRfLhx9}he)nVn-zfMQY~q0Z1iX>i z0qAXj0~QN-3HT)X^V= z`^(^q=5@Uo7dh`X%)XT>R-ymj6@2~ybbbjs8?NF2sh?*F{M0HAUm@@nk7reT zFyQHkc*7BoNAb9p)OnhHVc#l$N5mIiQdiR!4EUCKSGM_h*~lfHrY^6i#UJpt`Ooty z!SFm^q}JEzZ}Qayqk)JKfCeLN9iCObh{xO997a)t#8o_JH@DSywj&L!X;(s#*09gp z>}iiid|jTlVAE<8G(>Dwp2aIYk+9DPKAXb6c3&V;KMWiQHv2p)y&b+B643x7((Y?- z3awSzL*Zb=*A!tyr&WTj)^Ko*r(-Sp-R|*)!@;m$X>a$2l=f(Vl2EY2-{nDVbaGWd zX#occu$zK`7WAUd+Xm|B9Z6zU)EjR0c$?0R`ooYLgL%;lKUy;&gney3Vl$^({=heM zi}0t}bust@9X<@6H_+?}`&N3}yn!a)$~+=zsPrsuz%bK*H?8(GwXXKGc>Qevtm^9W zgnZ$SAPr(_hHg89VC$cu=^Y{ZTK_~b+dtpPw9)zSlEp6UaP|EtSFXRpTJiabpQ1X*_)sk9zt|2Cf4RVot+TGq9vZ%(4r-k9_P}wPN&D|T--3r<6OuxGdy#bE_8Ns zwV3H)l|4j;&qCI;&F*OkdD^|JeV&e1G~4WncvsEzGn<+m%= zS)MviI2w>dM;gf)YrOsl^ebYsMFakGqdsO!Y0#KuB{9OBkjgLvxDqB;P9~;xq==B` zNTh9*fndbnvepCTlq}3@0+~Q?y622(wrRqeHj&|-9tf@>Bu|@vWs|4FA6V7qqc&mV zea)De&Q4hJS|1cMxK?IQkNU!T#@Xf#b@-Z>_?a!@Gau8tKJ167ju0A$NgChGHkLM> zZO+MzAHZxd81>j-Vx>u5FbtkuYDkE7*qe4Nf)QV*&W!8d)sVl7w(<0<)qD2@cg4RTrd!Zf1J`g1XXPf46V`My` zV4J^bt#g{w3CjnC;_DK`AV#GDwwOkhFYauhYA|jQ-s;8sj{G`OL(B*|yb@P9 zeZ_6_1y)5``HEY=ph=fpwgo9H-iWkEGr=i$Tlmm9F*21x!$XlUnJ8ftsANX7uf-c} zi@<4U^F<(L8$?=97MZaFGm*8k&{1aDu|{;Q)tv|ep@i0V9~Q+XZ=|VJ*er~=1Myp#_`T1n?e&7Ot3}9=BZZ}?CrFQnM%#FWsW&6kF8>Qg`J}>TL#KADy9w3 z+H3~YVTh8Ts5f0O#k0dOrwP^srp>ODFIw!H=UR9II(L$|rwvjeo--8Vp9!B)cxF?A 
z{22T#W*^Fr!To3k8>@_C<>LX_51&$e2-nQ&jbVQYzXW%qu|pe!e+7u`QJ(Jpr~mN{ zAGKl{&!4ea0jmG*T10#%DAj<`Mde2r$`A1Su~oMys$OnV4iM#y!2iRQKfw;$tSlC= z0~vm)vO?tVX656RRU&^4%a2pSA|E}}kDYjlav}0`6+jjrgCn_bUHkN%$nB@Z1^(JS z6u}y+{6yrvEI&rMS>&I&b2WC}qm{cwzL}MeQXUlfYL+)CPl|kG%2kMP6e}-_e3+FN zDQS`SvwWel2YI^Wb4XNT*9NhV?-KDK2~UZKN!>Z{kihTDfme!n$#psKZV}g6p9Aj` z@LO}>iikIC$bqjH_#1QJ8w7k?4qWNu{VNlE$$nWxTw!t!yi&lca^Q^uKDdd~lXPT0 zCEBHM%7OUEahLJ6T)0Kh&xKp_z;_tv*KrouUm3?FIutXL?d~z)8{u+%a`80-{^OAK%kDq`?z1N>A2oq7X&wgJA(06)b5Z#2MH7~ri2c%uQ{{UFas zxyYhE0mr1rOwC@`8Q>Hfl-GI#or5G0>8%1j0ry7u1_Pay2KKhW`OT9z-JlYdkpXs4RCD}&yZeCHNeYKJcFb7f;4=111$_p1DFMWFU5eij9kYN@vOdK#1UJr&TMcm1X?bl(@jJrbUjmVCOz{lC z58$`#Zz{zz1pf}dWq-G&c!uBy^IP_JAjLBTpTlq2-yJEQA-Ies6J#(&f4Q6w;kWFs zmf{)0Ka}6Hzq?XAL-1;T%l__3@eIKa%u1^b@JR-I)d2sl0bXx_f6oA4W`IvNz#9#4@o0~g zwi@8)81O>|xO^5t#a#yYaR&Tu1H9P)zsvwX-T?12z{L?RR(hQQUSYsrZ-C2ZQ&fDb z0bXgq-(Y}GGr%_*;L{E8lmRZEO;PbS1AL|df4~55GQf8j;0p}!eYg80uulT}B(P5c z`y{YW0{bNJ|2GMIXrB0)8hf`yO_*N&v7)HGsYqdFK#grF+043+6_ev_W34dm|hU3#82j=R&p)bq4q+?~Es&yQmHOY}T# z3U;T@)$_E`+nx66c?ultPM@ylX=ArL?a=eIq1&Cd>3Q17?M@%1=V=4CJ3T?qm$H0; zo;S1n2Vcql(nbya*YmVN+ns(|&(p?icY2earwtkUujeT+h5qY#3Rs~3dY(3FyVIBG zdD@_%|9YM_VCcV|r;Qi-ujgsQh5qY#+EAhYdY(2?=)az)4HWvX=V{}#J6)jXX~VQT z{lS;A|5lcNRnH&A@=xn|+8Ck#dj1%eze~@Tv-~ghJZ+57e?3ndBJ^L+(?$sW*YmUi zLjUzVZDi1YJx>7_^k2`@h6erD^R!_>|MfftO3;5jPa72UU(cVw@*jL5`%fDZjK7|z zfDOi9&(p>P{nzufA;I|TdD@6z{PjF-K+u0ZKa1rr(eo7GLI3r970Y{N9+ZTa`JEHty;bD9kyjJ70Uufi@BoRO~AwjB#TUB zl$scwsV`Au_uoo|=H9ztA*quWaTc_7NYf$oj2bx$trO3*zLC5vS#i4K@1q)|1u0nw zr%vEHky70+S+k6%&uC9lf6>fu07k>^ z_*>cn(c;y{miS=r#oe8v`YlL_JCCu>Ufk*SzQw#Vr6v!Wa2e#8ocSmv{idgo(1kvz z4#;qd=`KLhZ6MuJVLq~l6}I%DI1Lq0lcv`wvG0i-1Xu9MUfe?|$7BZ0M|KnHW|0E_ zUWPyO50p&#N{#PQH@$zFx@k|5TJW&?^jDDsLEuI~pd>Tc!uXW+Nkh74_oA*6J#MKQ zn>`koQ2RF{XA5C7TEw!1q zw~>6(%`MD7rSc;i)cEH#pb7Mx=D{-%L_&!d*;31iz7ao`DOhu414lncpx*@aco`v& zKtD0soq&#am8eBsm5z@$6-1Cbcwxdl77D4Bsfl{4ng~sD4}2B{CtIC^TH0H+~XXE9GuEx1`~HBTURmsYCrdVGq8t}rm%=Qz`G 
zhGVH?33!gZJ6VmxD1kq7&-IWun|$s>sI0E{V{^|cK?nDpWiFQ@Zsi-$i;ODcRB7JG56esa#!LkAio%yq9)ES zbH`Dn{Q%0K#$Qnr%PkM6B%~MiVD?ZgAV!}@4Gwq5pVA&c5mooQ;x7#S(Vg(|BAT+l zMiDi5ygUA8ItrS6oWX~zzEitc;5=LhoU^H2SgaD(s+tH|G!G;HUSzr&Z!0fJ9{~&( z86<{*2~E8()9i!=F!)PFegVsxE@OxuiI>s7!w9_?DRISLNt0=ICB839KhgGl!h3_Vk5I*w$&Ja%$n^o%~HQQEqX zu)fi__o?w`sPr;5QG@<9pnpy1pDVsYjc*=$v!xHDi=tsRf9MrZqQ}Wc|3vjR5Y^ig z&N7;EFAzeiAnq(1TBO?pnt5-5sccPi&l*x~)IJ8aIW==H2`Bx16mU7P77-F)^&&q9 zwz&s~Z4}fn_xu@fcYHUNKU%cY{~&7bMaEJS=AM60CBmZl%)0BeWwdL6l-`c@POdYk zU@`LaI;0^5hqj(YmZr2!yQ4J825LHcCdOG!4UolZ0+f!ifd72P3hEPQaQl5IV+U^g*A@#WTU?A9D z#2~Pm_lTNKvQOsT5727*BmCiVb=kL>d;S0vrs*(Zw)aSJE!qL-0;wysfJ~dv*_az5 zONW@B5Jl02CCq`*ZU#0@q-GFC!)X&zHPHc#^94psU@RdFbh8vpN(|d}tyvT=7sVD< z%$q>5Qxw|)Ci>)ca(%xU6>RUN$AG1HzjotpbSx`9u^v;rDgKqy_6TFvmT_!E^~}8# zHqg!oW!rW&wkM;SYagOTZL@&@vp?z4plVqE<4?y2Vy_m#2st;!A4=Z_!RzC}{DnV% zFbqFE8W@K94^q4%{Th%sf6~4!RpTFtrMaqEO%zX>3_LPalWi&UJ*HP-6?!nOY*2d) zrQ<%N3M?RL<29+0hb~)EUJ^NhEdAtW^*CPJ+`FY1*s(2DwedlNMb!CqCuhC@y0mS$ z3%#T<{3&WM7PZsBB4mS|KzV;{c{vG9jc-zu&W!rW2WqmlKsDc))?B+X8OYWkR9i;4 zweM?(qw6l1M#$IPLoY3|#ds8MK$tck{hD92TR`?UG3i)hCpr5tBv>EKJ%1x`{8J%6 zS7J%Ig%-K8I(u8WwZqKl{T`CQ!Z!43pNdiG`4zJ6_^VhihMp4j+{sD5ypW)8Yd_=d z^=u>^O`84yu<&S4!)?FmM*O8qO@{WsftwG<1>Wo*0Y!2j5_r3pNSwIoQi8-yn0i#u zEeo=Av{nd+8XnwT;sFt7V7+Ix_B6X!GJR;w$XQa2JCBz0H3 zda|p(nmw3CN)DCc=&!D(6lBajQ=toWCqp3SOSc1NUUxYFZu_M7E>IM5G2Z5r`U-Hs zCbiq%YBjI>9z^SkT~J=8(8|B%VJwLSjH_|L6*Z^n z43w-Ne@IQthu=D<%w>N9%kLaZwfU-tp^ecupm)_hADerx209AYv;rVpvVQG$^eO!~ zZvy^wUHpr55h-HgAe-LKwr^^uKif{m+rc-@ z2}V-2AN1sT=t$3%sHeuCr!j@2f_bG)1amZ?(os>ZDhwPgIN+~ zsP+}sHM3$DPAwFqFYi!~nT9QC+72+cZ?iAPTGwy79{{?rgx$#y8a}g-uh0RNyw>`7wMo?S?e3%;s!j(rnOVtK7RVnppKFAx{pM)gAwe*}d#~ z=duGV*1NCJf?>VejVzpB<9cWQ;WSXxt0H@Qw=YumIYZl+GOxS4X@Y0&os^t#-6QIOe*Gelh7f)ex6LoBtcq0 z<1kqZM~zzu%xkb@o?j<6mO#Ihrg2UrNQs?tO4XjR|{c zSFIEXC^5%=N;)SOpK9~9Tc!RJ(_^Z8un8h+BQ36vz6F{w(@AKWFRtkt)@3n&VzaF% zQ83By7gqW62okJLW74kzfc+cnahR=3s{I4GE)8EV{z^6NADDkg`za_*`Z?HJ=_lDI 
z-3yB5o(70abk;(<*b?pZw|Qqv__hF@HLoMh;~s>$hn{ajDVd>T6ByRCowy^+*y+49 z{W%{$=(-(cwEWcqqOZ4X340!pbT#u2pr14%Z2fu$V7`8pG8S*6e(1{zMu8Tw674MT z$-EM=+YUh*tni`qC@=w;^>=WYnTF2&C*;*LfOcK>r=wTb#UD@8@(L0?g2dRN>D)j0 z8o8%qL;!Nmpi5ZKhW zXSm}3jmTL13C9v{2xIS-Va5Avm|MLV)QcNBijJ{3i3y>x=tOQ|u_Y*jbsoMYsD)UP zvmXHk?ItirM(#2)h>0Vv1x!us_Y+EJ4ZV>9Cdrwdl#jox#y=S<#Sl{cz2YMm?iC-2 zoJ2kxwq)bWwL8#rH8H-N95;mKD6;W<_)ue1iz=bt;45I0i4X&QCy@wEu=w7(@6jeyV)NiyEzE} zx`f@_j`a}+^JGAg)Dn??(-?^pH;pAo+%%3Cl*ocC9c|a^Xb26rB2DjNbDTO^6g`i& zj6J8KgDm2~I%)2uy*2A!C2m;%zK0tT%XyEGc`RoE`B{v@2jJE~q309onT+EIs@DN0 z{dW`y{h*ew67?d#V(smrEJbKeKqE4skfUPT-yeWB=kZv;0w3q<~UB0rIk`{s;=-o>_4tfQGV?V@` zBZMN+%}3}NQo@L-KiJEMKD?qW}WtG6_!-U6v);yuw$}9F)mk_<$cq5i73auqg#{-87 zx3!qtHc>-XZ3<-Jh;Jd+)QX)*&XZaK!`s#4I~4kWF|Rhncj);N;yQP=(QNueyX{pr zc0GsEh$Kx90W9p!BHZ?yeuuww3A=MP)*l$0&440#90}~sl@cdzx{4ri)78A7PZnh9 zXkD++9Qs7t^>?W1O&CA$!*}%QQ^A9_4dt}1(Pq01%+Wn<=Pnlac-n(deFFyT{ngag zhcrfy5r3AdgK9_u4{!#u@sNbGnq*B%1e*!MacA{VDr-^H0IWJfy3$k>yYU%+R z-uNQM<&PK_+8VRXGGA!4(;y3y=zib`9Z8>rRQMU_N%~_{!Jz#N)%Epdr1840b=Y{l zfGi2{E;LWnDL#S;^eccdURNXe#__s{kl6DW?M1ZyPmS09K>Vl1i#^4-1A@jXCOdt5ubSqjMwo>A_0Vq=x7#%;QoX!DUpk0s7!AiENi6Rys#?c9P zvT^dzf!zOK@!GewnXJ_!^F7tedQy>hSo}D846RjV`dWp2Y>Sn49ERt?YJxZPtX2p{ zhn~=GM*_B=1wpnu)*i)Oc7E#5O8!@#2bc`?(jEcBIp+ZgPUiuBPhx_DY&Z`N~Zdz{xoez(~+}fJKb_2~n=4J~d=RiM7lJ)SHu=9tZh+Oo5MF?~bxDZP+*{`JO zm)|EzkEXLMP(isrW9L?M7z|&1^dh41MJ8h;ON6TLfOz?Fka%Em;OB8Ft#N_ z*R=E*^5cgd;QJX?hnSv#bBWxfB3^C2CsmNzOyiHUF6{UV4#8|H*=Ft~*Nq*Dq7zNx zs1#H+fF-4rrVkFt9kB$;&t}c%z==6a5L#S`@@W(z>ggu}i6bh3Q-_#!2}67hKqpNv zN3y^qKFO zOQP)g6C*dbm3c_VIa)td&Bvx}sSnNHHyv^Ug&8M|PX7crdgsxn^a)Ie-QX?T-uN_` zG`&62&;C(_fuoVaUNPN_6+$xlgPSjw!g1RGeZzT&opiEKo4%dC*K+TI5T```)P#;DiP2l{9zzB?3>6Afb9paKzt_1e=>?ju% z;1nWMhDud;(vN*VosWsbq}#R5*>yO1>`MNEe)oZ`3pj*`qizs5eL+1Q>fqF_E7osi zUVUc3Wq&35vMaW=#BD#me9dNx$Kc80_;S7%Rq;UI5}C^5s&w9r=o|w&?BK*se#RoU z)r#}RI16Ekz6g1{?TDjBCb<%GIrHuv4?@~J{d^S7hS^ln#cO$N}PkP zC0k2W`^%k2%lQe1@B!+K8rxi=?R*LfYkzsoJ5-Y@A>ttpSwD*+%msc}yPxo4TX1Y; 
zulO=`9=$gAz7FM*JvaYwD{?HVrJ_I78$50yy;6C%Bii0whHy%vucYajG5Sp%W7Ar$ z>w=EO+)Fbj{RaNfJmL06O%^X!Hw_lM3i|6XKy}F=cJXNzCgS6?dC1J?IgWYKxv9v3 zbPz850fn18ooXD1qc;z|jfeX%#4h`yGIMY2%S?tQ*7-`xl9corK%&!i!Aq5B_Lo=#Si!FoDMj9C)_W$-qO z1cZYgs=cNXV2AnIE!q^+b{4!rPlE^b_y~^&Nf_FA9ptn2B62Vmxyb(gqe)!06t^g3 zPx$y{_XCU{{e19V+Zj3n`7}tl6G7Tw(NSMIZ$9@-EG<=vx#vBUIcP~xFxfbpOLylJ?WMpp;Rm_9K3PC;gR9QBd#<{#{4J$bi8yA5^jEu|dAa7+(D z8P(*47I)GIn~JlxNz=uPsnCgm{!qOgWmTF94dbTmVP|VWzq5^)de|Al4&B)WLW=g> zBNQKgvmkl`_tu?gqs3(oY@t}{6L8B}ICTPvk{w>BQwS^`1S$nXk7f0T>l1Z}HbT?U zrM>-a+sA3*hskf4zF}``IHiY%=%`#V!p6mn$q<;rfwsG#Y-&@ayg0jZ;Q`{+fV$&@ z>Xd=(8usv-u8uQcn>xuhxs%5i*4f{UINkC0vi!1B%CJXAvZK@QeJ+J3h6N7$9q%-HIOt!ITLdG44(+|8bdw%&8!h?XuLqOx1fXv&VSIm0_ zh*?VQ0t_tqBo9^tL%cUcdV|n;2_T5235W2&DZMoy2%w&vd-C#GPzL zSRZ1*>jXHY;gMYFG20?`Eb=lqTaZR#1s zfcImDN~C=w<49*&xpBm~b(~`rN~^_C2d3$chgotn5^eofZk%W)--5o#S>0~9zg^rT zE}XvG09T-}68{arrT;@PdKv*A)w1_&r%8>-X+L-YtHm}tzoq>Qyb$L!-2~^Oz`13` zX6;E7?QLH$iy{|0mbjCv>g5q&xKi3tOyYQ0n^;hdWff19P9Ie51Cev+y#waA_gAw? zgEwmM?hMX8E``sxJ#rwuUPBNIS6?U87w@*!R9cDZ^$?PpJikjT#7%a;FV@vQ=4ZX4 zZQX)k+yLb37BZ#hGZ~EOF#9*MJva+`iW3R6OYNy+I(sDER)J}#S;ijVtOd-SnEKEn zEEsfL7U%x~p+(rXeVs*sdIX5s0_r7ADArqjW|^lHmoxroDsN_nUs`>>={?AUCb~~T z1K$QZ_V- z`%N@iXkru#ui-h=p~;~?1jJg`1DIUj&6|qY*BP|*#nbCb(zh^YxSPuPemDjI_jh@o z-gYqNJN3NI3uiZS8h1E2tLEN=$n@##vhy~a4)-gFk5NF$u4fVscjE!(pOf+2W!~f* zpC&u1-LN@3FMo{TkaP6UxF>w3k4v~z@Ot07doRhDT*uP{hVK6u#F#@Be_wlD+-v`} zi32gvS_Qiwwo;PO!AY4xiWzF9sI>R|b)1RQOs|tVXCFlGJvCtR4~GsPPEywnLPPWd z=WGxpOMMv>6uS8HPr!?`)aSanh5Z?Enp}c~B5oneyH>z2VLdo#C9*Kr=TnlLy##Ck zPb+_j6Q*ZqRHPO+c!twRoS3o^vo{%7W0p3fp=|sqlm1}jp#SM4iMk$ zvcDgh*aH7`F3i#=w7pd0HG8zj8N$P=eQOk_#WRbNkx6iq-qYSl!61#p`=i(_Q`P{v7$#?C>qusB8BruNNZT5|*|rYuw55;RyCL1au{>=JdwT)nr5EE)>Wj|A$oB7W5Y#T=9NZz=Aj{e#&np*dcya)Z(OATa^eV%s0Q5v@m`xj%1s{KoY4fo_N^y+2yw zp;#i5)Wk*DOV5059%O9)Byv#qX~#y&v>KrIrf9C?g$w5(wl=ia z?_)20(5;A5u!$4vuSy>SCSgBVK4H3$h{aNH07s|Kx(|#Nu zWB=@Z12k4mjWHGy^UJ86O1xA8LAc9R@;MHJH+M8EhIy+tPe2jOTeW#gb9rTpIS6$8 
zb(dphFb7`1E!GJ-_iAMv6EFpTX0v$;4ttxYETM`hM5GH(1+XIV;{|v*&Ve8zW3k4H zoux&MX$Zshs|b#nd#)q?QKJ-iB5=uZ`mHS3p~la*L|at6NMwbP8HEQrnKu z^w5upud$3Mn4>GDi&xmY>7>NkY&1JO z9^gu}v!^u^U39=NwgrOOxm=y{KKvWUiu8VrYs7`9L~OOP?1=eFVSE=8A{}M%@-s2(n74k^-Z5helSeSZ3?cf83^rS$kGUyE5jmRTX(PD$2vl1c zM)Va|;#3y(ph*qqMa7%)&?_D_KMekfF<$;493c@eKh>4!#3qaad+MgQisYN|=^KDT zc3h8()A@LKRIG>QAJX@sq?uDq;qJ(#vTu_{?A!35dGtluzmjNwx)vJF+m~&#co^v< z8CJ5C7^x$UpTokHk$H)E6se4#$pV#QxH`a|g8{K!!85Oj&FK9{|0W7?P^*kr$P7yV zeqPDYJ;VL`p~v<0i=3gq(g6NNJlQF9oax#qAM-fQKwO%X4Nel@A2U`j#v_;+cnX2{ z)>sUBmpbJ`bqd0GFRPnAD|T)Ave;$5bD++Ar{f2w)ZQt$!CN;@+sCftffLnun)BmM&ME1iQ^MPP2>i?$qH^xK+mK;h{5G{H`{k!Uw=E7SW_i^LMJTWtqRjoxqzH6Xswsp1;c- zcVK+g$>z`rIz9)+FcArp0V?P(Y zi-~V+(Kh}GDqZj%`KLI!g0GHPAm!;U*hNgU!>k93V{cFs(IdkJ9)}t z7a~aHwx3R9nF`>@uPgp=T>&0=>`K!M;jRZ^2xEIiTIdLD_M0bpk< zf4QvA{#LZ}L1uXGMr-V?*$-;tFv*)K5U1WM}3%dBgz2QO3uu9a>3Fm%{!dXekobIPp_deL(&dM@p2{2%U< z*OWy}YGUHh2JsyPXJyN2{%0S{())C5as^d2v1T&aboID-Rj&AZ2vVC|Q$E5|GIi6Z zMQZFzJTZAxjcw1Un_k1Tz`}c`RXy%Z3+A4QvRFshRfLEYzVLyxKXE2H`#F8VOk7i^ zU?NfEyT~;q<)V`@>#l+jSOiz=`Xdc2o{Mm|2agCS`h^G=pX`RH7ouGUf9Nm=4=k7A zE@%;r&jc(#4d?^ZH}(*l82EN2_(CG~Yi>hWx2BSTc+9A#I!E<{{M^ZN3l}7dXVC-Z zAJY53n8Ykrrkb4(xsw+Z7P{kj3ay=jZ&pC*{NyQLBEk1d(HJK>CS#Kb1oxC(>eHG# z8QBUk)TqoKvG0zIQ>WldD?ZWIpu>QppLnD;gElXU79#@my7c+5bEDA7i$*O-9$(>( z6V26XHi*50`FC()$ohV0z6g4z2e=1Owgo+{O8*M8g6Y+i-L!>PpMJ-kobfzaM6h80 zxbt8Yo9dqsGw-0`q06QJ4KSW*Vklk73ro}~&$|mg(Y662y$Ue+1FlUwXGcuh(z~!% zY1{yo@qJC>!TuK9Gk=HF2}(=hf_>LM3G9==J_+oTz&;7=lfXU+?32Jg3G9==|6LL| z_H5gXVA$=&uidvU@Pz}uHdoz3{EYKF_o6utx5u;4W3x@?zj0sZ4R}}i!p^yN{Q7b@ z91MFR^_}$#I@@j2*pJ>fh1M?c2b!Jp>+xI2iyNlnGt;xUo&6BK6TdfHum6;L&X3hA zoxY|>FnrQU_?cq)tIFO`2tSIfC{5k~eiGb@U(3d?LtF79vnQRze%9LB<_&`xYoyg1 zu(o@<@LRg)`6f>p37=EXSK%9m`(ON7VFmbh`ibmz?`of?qtzS6&tylutMaz;@B9X_ zk=VER+rEq}^kKUAac%xf&d#N6old-%Aug1kD1X~o|IOwc9EnRQeo%Tk$wU5RIDSam z;}3+|yiGor%_drMwTIg1SIH|q&aeM9aU*4MUmn48PJA_s{2W$vax)$WEkM5hMm$8o z{jT+y%&kb3UuH5Xq`=v!{hWa#^Ys*#r9`AP`sWTYFAo`SUT 
zHt>bC^7c&TampjrkgmTolbKPB-_%3uM!E}WBT@_Y?h&L`r1bqCl}K+uT8(r7X&(Y_ zwdhCa&za0MNE`7mt_1xFAw3*vH=Y-_BCW@Xf@i6pTQV8TXtakj8*Zf4+cKHU5$fE4 zbR*Ib&Re~Uv>N9nd&VG-!qaLBv+4afuhZ$P>pX$tAxNC%K^Lpq4`C8U*aW-?zOtww4=zZ#L^ol2z- zDSZP#qn623p?|xOhQMzMCy;g^tscr`=y!TIyqC$`0X|bWrgRPXqU%7OC0oDhM{e)DkN^yR@WsGvnbVfGUTTompBk zZ!=-C-N5p83gx8ci__o7}ZsrrIvMtj?%JN(Y#Wte{5-4O{v9ES~7RsLc{%h z{eC`VrlJ}hjNAbl57VTaQ)>Bf;hfU4D~jfnT9d_dN+(}#a+FrC8|5giijA%<$yyy*hR1}0J{vZgAFhX=KD1{c;$fg z<-n={BYHxO*f@TNtb2@8#)3cD$$$ zi$?4N2FwatRp9GxF?WjJg3-z81C7&7m_PORWXFzlW+Py3z=|MCE=MqWMUj(q(@DCS z<)OB;J}@E+^T2*Bs?%3xTJU_Bbf@?S;32P^YSEof$Cg&sGSTQdF8WdoS?|GGRSlZO zHr|&}>r9syCyTB?ky3?E2y%|@1+ckwBRSu<~(1NSIr_oD6{ zsJjD4C2kRQlQfo>7h#ar(OAYz^GYiVujC^Ms(k)A$FX)b)O9}yTlO#3r8!j&8Iuh! z$34v{SSB{7P652+ z1?}rlemv^HT(gVl+yU5Rz~p=-yO{!P24GVN%T0D}nQm9EfsUvs+XCSi{|=UDq;Z9w z#75PY_7xPJQ;3m;93fAAWF1fuWioxR-rfQE`KX<&J;k_%F;EU>PZtBZ62(hsZ3gZ= zgqs~>(2W(>mR1)=$Cg?#KB$DzT`-Q>+RISqg4BO*YiVwfZ(w}|-+Cn4PZnZs81)2w zCd)!IzEv2w`QgXwN#yH_E-z%d`7Y}2L0v1= z#dQk(fm#%A$z*;?6SPUEC&sF>@Mnghn$IX!3W{9c$?s@}|0VUb6=cRk{*8b=3H>Wx zfXt$NYnsaKd}Gmb1+Y*xd#!5o#uaw6jy&z@b1vbV)QoG>@8>{?Xsntsey!m7V&YHX z{Ox4}VeJba2hjCA|FE51ABp`vR>)>GFwE9kX>-tpiKuS6-u&w*Z!&q%t` z2b{;Tk8Pti@8xZdpwEp(-FfhYFr4{L@%v z!&sOnNIMV0hD~q#`q>53XPv4jzbG(mGbx)2OwXDMHWiu% zP0G84rgu%sWyPlZOv+2erVS?L7bd#@4K;=yvnzA`Aq5lc0Gy3av&moZs{+%hq7-e6 z`$>W6rUD{9th?iXTwv-(3k9Zs6f;&%;N{<&h4iNdrca8L-xr#Oij*%4O|fF-$s&S0 zN51@9MFb}9?Qrj?g-+98vGNS~&QcP*?MwDaV4no`NnoD@_DNu$1peQZz=o4}dnr3l zLn1G^sJx%wb+6%RpGcK1o?kEW8zfv-?2|X&PHLg!uZF8KC*PQpH=us)t@MvRhM!26 z%$u-3Fy;@)$sZx|GCnp<iY+T$*bcDkFu{oEHjC|3fgpo` z1VdBccC%}lAbg<6V~fqMgGH(x#cw3MTL?n(*(%z{){R|o3fc8lhVx%7I+hW6fv)Tl z`8FY#PeoqJU6A=dCJBl8mfNAqN)F+_afSG-$Tx~KB+_n?_K9@8NH>TyCDH+r4vKV_ zNEI7z$0AazNGnBJEz)|CHi|SP(r%IViFCb4H;6PP(gBeUigcGqm1&}Vky=GsDbi|@ z){C@Jq#==Zi?mOq>qWXjq$!aOh;&e-yF{vphi?{Q({7+!ODj_g*k z@n5HkKL?=#AT~IPUR^az@8=I z_O}AAU~5VjdoGCdsXXw1NIC+aJu3vh4Th%x(LL)!L?vtA0s>ww;LCWiavR{pPveOk zAo={Wgj0|Q7kgfY^br9M<>0?4;N1dVO_gxH4|oAZoBMdamnX_*7`Tz-+zEq4{8*NA 
zM4!N?pBE%}w}7)}2`Kvx>?y&k&*6wtp4e@3I^XAT7H7xp`+(E9ScQF%{oM}+p3xCU zLL~kgzzN@ygMSr+AFiZ?JShf5*Utrfy?{&myiw3k3HeaWm97B+e=bL!?@Ih1aJ=_< zqLe^xG`<@JeQDnh0G#-LQ_!K99$nJ~oc-hg^0L3@3iwl13{9!yiP9tBhYLUEbOFCn zz}p2};?ubVYFGL_GXC=<;56>i-;i=h3%K+<#B!z3w=RvO4+j8F^pzZc>L3BH%z;l9 z@Ld1vRKQ7YijW)4bGqg;_;+-_fI}7KTtP?rbrOCd!#@aTP$Yz>Hvm2o|F-}>NJ{fOfPT>80~ZtS`Q@B-y9CD(3WA@HT&>k{}G0k>Sl z0W9{9+eL7+h>j-gKf2GZ)qoc&W<~mg0<4_N@DEm`|1I@!3&Sr}2Ip{0)_vT*B;e9N zm;EBYw2-{MN=ccgRe%@3>&x{w91MP#vP&zjC4M*0v*#GN{fB_d{fd;&65Q+$ z`t?gVKAjz)Yo&mP8d+BPJx`Q#8C+xx$<2Tt4LT{IpY-gFuBQN>z;B2B{fN;iRR+Qw zOfhx3#+Zqpy{|I?C;FByUMA&yOCJ2qdEoS_C-rww*q;i~6rGT_ zd?dMr^1yFlboQ3#Cx9OX`=g3=bF!d67sLs^N7yAfAFln5~PcrA3-{h2miA?@cj2FLt5lUHJAXK;Ie9{gQ^GyS=c<6j`!nh6JHB>MC6 zz#H?xw=y{F;c8Bpo@LTCm_U_3@Klh)>G?EWw=g)r zH~zgZ4}1sUq_>S-9GlK~(KQuI`$+S`!QiEeRjfzf<>*Rd9{lqG|0VQc!&Mwn>f3Jw zodG|GZx?j#&qL?sJn)b6z^5KIGXM1q4m-ArH@HHy`+dNz;@+5?mxs=kdEmdw1K*Sf z{yyL&|4JeMDWdUH59j(>J)Z;UoG4uj0JpNc;eXD~1CIbs^mF~9p9}gHp+6fWxprTB zFub;+DHKvXXX9-O+lijJco`<*4+cDVx1_nv7k16<>~#K*yeA{xk!d)Qy(d%UaXK4j zYDK|{puaRk4}7WK39EbLu5wC>kqpY;GLD6*QI98X$v;3 z#z;oc=LTOJ-fgmVL|k|m3ZNNnzK#y|q6@w60+18k_Mp2RK4(2@Kp}V=o2gHV0bPDq5^quo43vH?Er+AtHF+l#KC)O=!c_YZ9r6m zJ~jNW(Wm^DW~4W;{{6|H?|rRrqG-@}{mnhR*++BN)$9vI{E@XTi8)si44So zws&}77x98*r@zSub$51R3>u)?t>It*FF#79XYd;KzT}zyNbB4-@2U=7M$~NnfIs5L zlsV7m3Hw(0LE7)bZ^5H%)+$d6kU_eEwU?z)ANB{s7;Z^s;fXNO0aUlm^fosy@pi0U z66AH9*_S*UB2_J7A!Q4wa~OmC4+7a=64>k zQc>QaZwf~`B2j#oaFf#PnT;7V_srx`D+c6hweE~P1m zHy3>oUo+X~kuW?hkhjMh4tv*nFkIoaN=w+=?(;N9+uPToiXn%U-5(i-%7t(k7&S;N zm(#dmLBb5oETnw3^UPbg)Z3oB55AwILtWttI#$hzqM%?%UeVcBS^rimVuH|IAjQN;f17=zPRK!<$$SK7eSK$p`?oL6+DcLbnx~4V z_N;}DI#&&*-IBT*ebVWS%xaGY+Pxw8P>c`BUsZc39E`vvqyiEKwDsGR1zU}p{T-no z97^U6%9dxe6FbA0nb86t3XFXFZ1CAB%=DtUa~qsXJWCvN+<1|kb@@MWEo`%(cMy-_CxEvFxbp5S?R=A4}AOGe*lz*35d64t7-}cn&H}D+urF7`@JjMd^6T~ z{ShZNn>HI)6c6pps8Qx!4HwJwN{D5iPb{Z$)SPYJP=~L1iNAd$mx~DvhR}nK2G>9q z%^qfda`}T~XBv^DU*y5=rEM5;g>43i($WnzZ^(Xa0*Tr64d0F6@icXLJuBhV1wc5` 
z)LIP{a?EjgY!%bFkiJQU*whL|J6h3|rq$%(xaKq6a(32+*vw%~z>YHql8?A&TE(=A znY@AFdPgH?)QSqI9-ICn4VJAE-}B*bS?htjl~c`T8%F&<)hE8rFF7MSPyR!S@V&68 zeT(Fz0a0Ma{aqex$06@kfno9vhUaB{3|xk;ii6nhk6{^cl$2c~#UKCKJ@= z!wfJgG^{3Od`k}h#?A}7$~x~icD;fr)G%A-58xYS+Tl&-VboZiM*EC~u=$t_UEY=c zP8*qkrZ#U!he!9UoJL-0P;9UOK^Rds=gB%5NfJjPZY}xF4=Wqls$sojT7g*Q(tz+G z_+VGb!Di-%?KA?uH6A|nIrC2}CI9jWvjt8J0v0$PAJUm4o5c(ROmmk%qQi5|0z^=U z-I^B*H}5NpkqOg`FNUGDK`vPk*bMnyK-_obfrWaC#SRl4ThH?CSaqqA=yk@XlaURq zX*bdVq`3`FnPJ>2NL<6zq}lhcsR`Nf5x)+I0%2Ium_pE|8_Q~a*cZar1$lg3zNRRx zV>L5eXx&b01k)VXV(R;Rr?V5&Cg&@ZoLJ_tEn3Ky%^9A#OBXUfgo_0hLhj%(P#6H% z_NLa=o))jaO_#|GEO{I~*DEkA`PpfL@UsC_{~IPQTZ;b`e$Y(A zx4IM<;yL;d=>ZWPBHc^)piS)|z2Ev29BIj|!Ah7rBvJ`@@JGz+u#Y)7hN+>?E9OZt zziL>UG8zm1bf2mwh(o6UunwIb^&uF-2Tu)l5FF83Shsq-b@aU}aN z)6?LI_~8bV+sHz41})H|dLX{{HrqHzsNB}Uw*Lszg>~5pZ*U})(7iAioxPRADAXB0 zfcvpGiqs!Iz8=>S3KGb88g|MQOSc9*N^DOptMB5?Lni_0(DizwM>YOt5 zWlIb91H4V=M*ZPoen4J_lATheF>TgyULaVo< zRjFuR8vrq$M#8+L6QLb?K*!KT#)C5W2wtin?m}%5rGmvvDsWq|Du^uIboiQ-3Rrk# zSx~4V9Au9%D}1dYB-+{xYBGmrnwqdMHu*wu9hml1L{@fmaN0a($|=cvqDIVsFk{@? 
z?njGgpPC1Ro|#rK3JBqn-zOgj|EDP)Z!0*ui!+4$8WcA&wF710YzW#!Y zIQJpX*~n`0oUF_{aPu+VS&{9__jF|1EzVQOdb#b>ITbpSBJ0cdc=S{p)SzP@L`v4j zu@iPR;4iT%>&y3oWGdeS;@F&qyr*Mez-4{;UXe`YdquK+Szq@55>bDSASmBclBs;JiR$Ahhu*%dp8$+l z#IbgMRSr){zl@jUL&DbK##rAf>dSP!&?AX2&!@1Kr>1jEnM%BT_5UlU{(z`2 zQ{#8q5e-RCroYaqFWE&*!mAzy=3Aqh7M*ihFXPL_LXc9kHes_xca{T4{7&0Bu z3pfRb%mJii~hgICJM&&|ILG-6SzS#JH@@sj+@^h4A&8|usV5W7YD5)UZs zl6jfZ3rW;P*}gp2f0?MS@oM^CSx*ASqcY7sSzn%45B-i~(D(e&C62x6^H=tZhRfJ~ z_k#>w=_bX()y@89|H$@bdJrlb>nkaNyMd>JRA{(lJ(*6-sW0Cn7`%&Dl6*=MvYyPB zqdvi8|K)oiyF`6WB8t?Slb<5mugrnV`cre@vOC5!XNs^I)<5uygQ5dRBD)YD$-lIp nv^?p&^D6xBW}ZBcb700xwku&1a5GZ<=U?F%({d{0WR?F1mZhy& diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so deleted file mode 100644 index 144e6dc6d88bfba08b0424d1a4b975ae430924ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 166144 zcmeFa33yaR);Hb_1cCw`6g1;T7~`0TOBx6y0+NsfZc7IPArhC^grtFJHj-`#Dq}DS za;NQ?;JDzp!MF^L3PL{Ucs6u51a%@J?`zTY`@@9oN^`*ZYt|IhRP zo@d&J^!=UMPMtb+>eQ*a?Oi$U@!jn9?l%3?-FA)5ZUB5|7|#p48Bd_4+0Mk@1lu6a z+rvyb{bBYEf@XZ%;smBPV(4)?{lRBC|24lKyH;>ve9Psyy_GtbmhVI3MLov1T&^dQ zmVGD6EjvT>$M{a`E6Nq>-$psgMdb>A5akN{h;qhvhJZHXJ62RE*27iF!HXo7$jH{$5|{&uK4DFBiW_^w0YJxvm+;x6$5Ylq0$PryqUTZBz92 zqK(6JRtl(r{;iKP%aZCK@`OnRHi)N%crg^0o(Jfd(zg%Xf3T0z@J!uP3+DCmH`=d_ z`~06sw)MxqPw|h$+YA4eBrTiKcJGr3wv{&9AL-y!^?&td#q{#n}M-<TQm2MFF3X5!mO(nojL0MU+uYW!;tMq-(22r*SRmQy+PLkuDZ*g zcYq9;$_WS2!Lww7{``Ctde-@o>-iT18>#;HFh3&IAA`)1^56W6$oXetU?b(zpk4Spi!e;3LYiO+vWsYi>_?qp1&NcGH(LcbgR zjg)_6lyN+7VC4KmqU0ZXcI5o+$R9}#-J{51E;7?xvwo_fSCQ&@2Qx2H{_c=Nr2He# zi%fqo%DDG~;fPev-YEQ!jKb%>DE%4?dZhaMK%SBM^=zNW`3Id7S^f*6@To=N^ScWo zxBCp{Vm(c%6d^SeW^FvYWk1tBQqoeSD zBkGUTZqF$7w?^TAZj^bt_sqz0$c&;_=R(gT@$-DY$n~s`BDZ@`PbB$QqF<5b#e^vB z9)dh0@$*;|{(DE!t2d(9|MxGn*?ws?s=&8Kk#i9`6^Z|+Fkd3k$4AlU=c4GtrBTM~ za`2Q8fzN?a=INX$c4=Z1`P>V?2R#x$*G8F-|BNF45cDdNK71Qxo$VWCovJ=J@;rJ!O1sZT;s5g}^N9Ky 
zNv}SO(%hEV!EI~(kP>Bl--DbZ^|v#M9`1;u=Wj)kvp0%942{yS!YJ#` zq9}5!isGNVhE7G&!<;B`n2B+Zl)owpKmI8Cwgml(v|cnsnJ@Mza#$IK&tp;O=S1P> z?kMB(rzrD!U=%-jR+RZTGfKbqN0~2yDC^zzQRdzE7?eo***%K>H%D2oAC98uKSY_Q ztE2GwW)%B#985;y(-y_f*I=GT(zlW*`?%gw_K8cPjPH&p^W`2i8mZk8s6P_DFp7Ro zi!$$Si=zK~qUhWGQRMc!D0cP1D0ZnJN3v}QG8ynboxXRCjS$+qjomu(cO==G=Q^(%UQ74oF~wgDpJ2*H-E8F4D2Tk{`Rc0dOh1%&(ZEZf&ek{XSU9NhtB^5ou9X0mzHCOO>Yx~&3gVq%y*Jco-QA#2Y)Vt zFe!gVd)NHqbUs(UDDo}P^T+A(*`@dQVV!gqfo)K*arkmsh@JU5n>=gqA^ z=A60`&lG>9udKqGQ&Uq_L!_D-Prlce@2e@RoQ15D@_o)>H6oA4OPR8V=FGKu^5#uWxo$sc2o#BRIKnQmU`2rewBrMBe;q#Wj_vUMa3DE%(-B zIjKS;J=O0kE3b8qDD}=QEAe{Di*KLrDXT2=0}%q{65+Z@#;#a+YU&O;rUwgqt~?jcjIv#-rpmPs!}tJTr^S${|4#u(z_* zQ&Q|JnXTg)7^O;saw9}E(rDRRSK_Vql~q+rT^gG1t0}ImEid+YJ!ZPcSLLam4`Eg2 zOqobhPW4EM)xH|%uy9^L?(L6fbEbH5JvH8PZ*eX4U$9?;L8+;lXY&+P&MQM#rg&@p z70?cg_=XCxc}l7<-d%-PP+40xtI}IaLKK2zxfGX|S3xdPxAYc8d%E;l0rKXLtEwtD z6oW@4vzERxD@(7N>#dntP7GQ&8V(JeU0gZKn=J;k*vJ9(@L_nI!&#@YW_wF+8xNe5 zH?bstsOMJv({=f}5+0dSDtMYIL-SA|Vv^8?8>hNXE2Q^uW{p?wc7?a1qe=uIf<*0G)i?&r*r6x;@Ywjs31)~ zR|%Cs_iH>*pE6&qt0X7C7=LooF~lV`UdTOX3fF;*=^hWu+*4jP3svw;rM{uAl8S2Z zTwXQxiQ(GT2F*rXqKq<9ebGBh{(C@v!IXQFlN^YI%$)S8@voM~bw-)OP zm1J}9Q*|Oo*Z1*WvQom1PpYb^;B^7BNEjtbFrLZGPpv5~@y>w3vnFH}`)g@A>5@CE zs?z7J^Xaw2h+y(zVA&FsdMhwbX|4J>8m(Kc>T9lu^tmw{YoXACpE&UWPxVlRmT^m! 
zEzu1B%;+2oci_gU*UPN`=f!i-@YL&Jz`{km#T8Lnq|9h$1T8w{Ufo3hymtR*g;UsL z>DuRXA+joAUGNwEh%Pqn3VSre#o9*pYEp59cU&s31v!)QVLQS}IWws5LyNH_6ql4> zeXl9T>~jvE<@IHTvB(8mR902%0YBa>cGg(JI)6xs#1)UjR9Tikqd)n+A8BMbdvbY^ zha0-xWqftN5Sd4KJS0{K6s}+`+}nsfn1Y3Kf(h&ALfBK$A;Pvj;^}w zL;i?E@iT3*7&4>op%|>Jg6vEMPIE155Dlo2nvT@GnzFfA)(dLA&d8alirlF#xr(1| zBgua#t|f;KYfb)ep8FCbPP9>&R}h&2O3cm6kDjQw^8PQ0U8&6li*Izs`Rs44o|F-9Vt&4_T^-CU3tic zEk;gGcJ*~rX{S+8Syob2>h)yS7EB$Vh8<15*o}{ar=M9<^t^$oGBHs3$PNBeVZ zip(6DwfSPxEfb}BYEgTa9JL-jmG~_&!d6Zt@mFdoZ&Vm&ST8;S ztBTK8V`g_FCLvm5+5c4MbVS%a2Up{3u*`YTL>5?*vAe9&JIP-$!&?J|`Ei%})(9@O zRQot`YkwBIWFKKYXnz&)PfH$rqD&eDhOT%#m$b%BXJ3e^trSmI`E3joODXygr7#AFl9UJe+g+at1 zVI$x%w&~C}#ESH+PT>yohEvaXeQ~*84=CxGhRUw9p37BP>aFWi)lh7d&0V3v>Wt!2 zkJw2ICg5#X)lir!%)*GW%DLF*AYaWa#5F3-ZVWB0swhUFMlc6m_!%3DktJ0X)x|Yl zjuhn0pWt%mlV*5wDr^1N{eUU7P1-!pUsmpedje8+8K$5f_0VTj7Wy#WUs*z-A&@d_ zW>H4NZcu#6oNQbg6}qq>O)K-w7Jbc|pH*C4i%kW!l~+}Utr$&llb zk=LYne6_WvC`BeFiBLkZf=56Cbrf-}lb5x^;kB+T!U+Qo8=gA=M3)CEGC zl+UZJ(J_gaSri25+J%D35(@8lOGRH;+?geF{AD#>Q4o!rQb3-mv&(8F5fGn}K)Q6r zRcnMTEx8P9DwoWLfPLO95n)7rW+cCOuE7N@;OQ8Y3It<)UJ5-LqbpVgb9|+74DYBV zBJBdGepvNsHCS^BDiQvv6@g%~ir6eu7|BNz7w}KD#_&CcR~MJnxM=T}HC-0C7?yO(mT&1YzjqV$((k{Lu)^WX{DaS1Q%BekK%tOTI@N5XfsK) z6bFhT(GikF@l1Im3pBbsDJ8NvJwGl@mT6fX!U`jJCjc91hO#2p>l}^+veJu%#TW#H zN)W*lfkuj#&+-@7U=YMC)gwfLriZ4x(28eN)%?uZAG1>9<-n1r*k6YQ8v(gmUri~W zO6Znninp3`DKOMa(&Z>nq0+-?h51jYp@$tI9NQ@on%zJm}pN?@WkJ zmjc^GN(2@0OOp<>ORH8Md!2gT30XL9pa3HU7b!+YOJJ4m31O3@FY8929WkIVQ8$0% z$?v9xh?hj&+vVr1Dg@hgIt(){m8N~=3}RfS`m3>E(z6In`&@{{ddWp#Uzmc_KW~6U zm=VF|pI1D!Mha76fnqlG{s{nyQm2eQP7-lfV;CSw@rZQ2u$~|MVE!5rj^QqZB#rYm zD_uxloa11tT*huwKCGnElgoQ&3Lv@M@?K~t4*YOBrd#WwIUF^jBa)w34Z}NoKhOx>@V-kw;NhIkSo@CSFE(@~g#x6wJ*}VCd(R>&g`!R@9b7VvJ_AWivZ` z-t1yJG84h@YIug8xn(szteVh}sjjZw#V%kbjzZBPKcC0t*l;+5ePzjPkA9MQWo=cM zCQb>vkV>lkI>zy*{#5HLEg3Qd;K~tHc2)_Fdk8rRm6&wV&J36rEYp<1HX$p^GwjL~ z+XT03T$X3(mCoVj$p!;HMb8t7ykN zady1hM(Kh|PvSqy-yNmIf3bRccUx~RBTB?@9{PI*@=;ygY-i$c_`hy=3y%<`_R*ih 
zsr1wX*MY(m%6+EZLO+RaC@aHFtytR%9sAEK z!wr}owwHkELnAdwPa{10_(Qn)(%-gLr`2qE0qw`x-qEpoS2EK3**?;-pVQB@X*w?E z@@Lq-(Qz%O_qO#IB>Gg%>Ah_Ibv&Qbd)g9oyzPbCa04aAc7=|8$@k%AOLtqEj^}Xs zZnkV-`WWx?B9r*ki?0>>`@RM};Vwa6X`#31^cD+!rGD>blZ9TrRMgX9p~vYLYj#-Z z^(RDr%|hR#*MG=Dum3^hKW?EX^wcln=;e%lwLB>D$64sRbR2J?cj);OEOgsLqMjrR zeV3j;#X^tM^QT$pDY|^BE%bzqg8#)9dWKGKw$Nwmc$tMhUC+P5LO1fSve29L{2dnh zN}aA*=q)~eb)0O_s zi$b0m7J9vojc=o#o-gV0x8zT_SB(2I3%!q?e}#n}7ZmwhEcA=@{F^NFxO$O)mxX?n zo?o-jTbe|E+qqr)J6rE>tc9MiP~=ar&`0U@Ct2vrbbhKWbYpz$EcBK7_|{wKOLcmN z{!P-o0{?JblRt_?Kd95_D^H@g0vq{__NgA3Zs?blZnSTu8~j@7o1(P4D+;|(q8}#4 z_0M#P{;5I4cb!DPL88Yy#5iJ+vdQ!oDL<|jnx7p}=q>%i^~m`*>ElBEmFb71dOFS8 zZMJxQ-cf$t^mD2bg`ObMub1jsD)B>g8J}em9alch&kBitj!CuIR!VeSZ8krvB>I;o z)n?ly(FaNN4vC&9(RWDnUrF?3lH6qaE-8PKlz*v|U#4qPe&Y@)6+0x+he`Dum+0dq zx=qp(@@tGwtVEw;5b+%+(F-N|Dv3{-o*?D_Ldu^c(TgN{ibVHF^fZZnn?zqBwJXyz zr2OSlenp~JOY}U6?vv<+65TJ+r%Uv?5`DHrzg?nNOY}P=dYweSQ=&IZ{gUbRQvQIH zf2EXPrZ1NA>q`!oYL@8tN%bt1=(HCW-!vl)powzb4UlNc1%leV0UkQ=)4U{T~wjkVM}i(T_{?of3VqBp;bR zTcTG<`8y=~pCo#GoUp^V=8nBl_>&;f@rsH0Ns{PzRmJ?INOU;-<|j>})0$y?G9)^+ zZG04oZumNsmM77%;W0mj5*@E}n4jqq{Y;aJ{g6cOBhjlRIyPA5r%s~d>b?1?m+1RV zs?D}oq8q*dB{xg-{!;#>68$WRzD%Oy6&&-kLZai<5c9KAq8slhP~s|yey*7tKfWW; z<0bkgiT(?T-XYQPii!EzA<^*)g8A7c(Xn+jKbk~OFsWEyB>G^9eq5seQli`XhxPVi zi5@G_FOleR68%z%9xu_8C3=EHzf7VhN%SESJw>8lF45B@`jrwrL!y5r(G`iFBGL0C zx>KSTO7x);eY!+9-|ay9Y>A#K<*%0LyCr&^L?0p1>m~X~iN08(r%Ci?iJmUemrC>l z5`CFOzgnWNkm#c%`bvpDTB5I#=wl>$i$uRhqHmJuV)o-O4+F41!&y6vp6{!ftTu@YU8=y4L=CDG$0`W}g%AknXt=t&ZNqC`)T z=x&LgCed>xdWJ-wB+(U#ew{?mljxHrdZ9$mm*~?a`c#QNTcQ_8^lFJdO`_LH^cy95 zy+prBqA!-{H%s(piGGVjUnm+)TM4vCwQzZHViJm6We=X57BznC>S0wsEiJm9X@0RF=61_p9PnYP8 z5`DHrZ<6TM5`B?GuaoHaNc4J%zF49!mguTPZkz68#Sn{kTMb zM55aUg!TVXi5@G_ACu^D68&+B9xu_Kkmv~#eVIg0lIVYw=qVEYDT$sY(Vv#+84~>& ziLOZW6%sv9qCYFq3nluWCHi!U{uhZpTcSTN(W@o;XA-?mqW@K**Gu%35`D2me@UV@ zOZ2}<^raI0Wr@B_qQ5H9S4i|#5`Cpae_f)llIVvddW%F~Ezvhg^fx4WheZFoMBgFN zLlS+LL|-e>HHqFL(GN-VbrSu!L|-q_ZF*e27ib$KdaOj>DAD62`sWfoUZS^2^aP3C 
zF42=D`X-5i%EYY`1^k#|vzC>Rt(La#r%Ov^^iM~Rje<;ycO7xE;`YMV3 zu|#i?=>L-FnB=EK4qVJT3FJqNJ%={PZHl?Y> z*X>k?(y$>`cuS{J_keul)Fq$c%Q47C`FWpyB((d~sZ-62?dZ{w&=wQZ*5XL0)x_Ny zzi8qfjF+1@hVc>;_hh`##N?zM3C%HaEaPGm_hvlV#Ah(hG%*G6kA$2i?!$PniTg6{ zZ{mK8?Iw0GK77*XZye*@ChpI8i;2%-+-l;p8NX=a0gRWMcp&2?CO(JpLKC0Mc#etV z85f)Q7mO#H_&mm$CO)6B)5I4r9&F+Z8TU8wMU3qx9>nK730G{82wLSyxYW1##>B0lyR$xhcSN9#KRdcH*qTCB_yX#8)%UH1R0LP7{x2JlMoz822~vHH_^hra<uB__U}@j??%V?4*i zg^Y_$d;{ajCccqzripK2>@@Msj0c;zh;e@t-@@2#Vh`iP-x~d&&Um+piy3b*@eIbT zCN5$8qKQixFE_E5@e&iyWW3PCvl!1YF)ctxLd7O7V?5c!w=&K&@okKqCN5_@*u)i# z`w~6O4-eTe!#;qoGq#)fF2;wC8U3$kyxYVJ8E-N1-HcmJ z+`#xn6E`wmZsI1!OH90o@j?^d!+4H~7c(w4vC4R|iGRa5)5HPBP7~kDc(93sjQgAT zw~Xy3{vG4PM~(hBGv00DC5*S2_&&z1CjLF+7fpOWDM6601A|BdmBCVrXmaudJ8c!`N$WxUY9@V?hP zN!~ZsP4c&ai=OI=3oJO-f^#f*j0LA!@D&z(i3MM1!2>P0uLZ|g@Ci$QzOmr1EO@U4 ze`3MgE%+S^-e|#AdA?#vf8K(hw&2Gs_yG$JT5yvEFR)<01y@+G*Mf^IxWIyQEjY)5 z$5?Qx1z%yomss$H7Cg{``&w{}1)un1SN;9Qg1@rhy%zk51#h?DcPx0L1+TH-S1kB> z3x3*yAG6>GEI4SvO%}Ysg8dd;VZmMtF0$YP3(mFR919*}!KoH}g#}+?!53QaKnw0` z!7&zmV!37fE%+-7-fO|1SnzfWe#e40TJRbRe#L^Hx8SEO_%RE9z=DGo++@KEEZA?s z6&CEZ;35kyu;5$^&avPz7MyCqS6J{R7JQ)v547OE793;2Cze^p--5rg;Jp_7i3M-B z;CC!|qXneINens7( zG-!5zhIR~|IwjnNh!K7FLgZL!7@cVI#S)AKSt~)|9Ovhuy|Ff%)`kLY*@;PZ?Ou?z z53th|fq0ZbW1`p=#9y?_k&d>smyOM+&dUIumw$oYg_qlOUcN?YYWxmFaV1{vq)I

FWR^b2ZNWCdSF;U`TQDSts#2itgn^=sI|^7#3)mF^ifpt1++=a(KY8C+H)2t>Io%qSDeyx%GXB; z_!E@mliHIaZ%(+60n z@$>~AW6`K2jMpJq8s>!{OTtjJIJ7|tONIVx3Dh@J82hyZ>d74_?Q991I#cXb6_v#> zko)&(Xevfr(=bq{31Yvt8)~JEreOt78<5AX9@Bmx9LUlX_2$HCw_4t#8YfjF1>KpIyGV&vqQ9NLR04M<<|(7@Oib|gQv%tE)0JSgT`N5-cz>JgyE8F`awLfy z8JuGra!||HvKpCnYz^0ufE=CcNCK}4J<(1cGUo9Hw5`j~rM{>6N%W{}3{|Fn>Q;BL zLZxVLqXZ>PMQ7>zU&K$?RD6PpwKNol2)AHnXn)6u30cXef!fz7UP;V6j4Wcn9aL#B zby+HnE2i59t%y1m7GrF2elulO6W+2ZVP{aQfbpr15KF6`@>#fxT=-( zPyO@V>Su2Cly(_Pgyxb3)l+=^l|W-+J!QaB; z5iH2lCZbxmx=*))kC8Y71FwVu#rVxyA|~uKR6+%LEjXyH(wS)1$zAIEo+pw~Uy-P9 zCsh-A-TOm+R64ANw6H&1`mbdXMod*4`;d2{HoEAYQYr-#p>44G4r z@f(~bCzUFJ9+!}%Qq=6kcqK3a>tJ?bER{QhdLOF||B5=M)8;9oa%}!SBIP60)ag}% z<%yWt+4Se2){}DMdnZmN42whfucU$)mROsAU8mC!iEff8L(t4%G<*jrG-4$*5y3!w zsWL?Hicz|SMyX*#5_+_LrWtQ5P=Z%Hzf_3+0?0-QjPW-UJWzy1?+;jLLK{&V=U0Mb z7E;z=%W0lt9-4%PA zvgM?204i9ZR}gz@m*C4NPkpW*(-&oJ{vicQ!;@L4MgX6841wyk)D1}{wqQJ)?MI~GD zN0LJz2RA7Oo(j^?cQ~3Tkgkjx_yn+{kxmJepf9lx*lW*D>_gqtz9hb}Mm8Kj_Bnrmc>qb4byG4fP-?V@~0(*3r`r3}Q|f&#Fqok+ zSf|f0B|HXmFb0MC7-W(fWY~_@Q*hkw>#K~4IestLY`Ou;palD#d8OU9+DWYkQ!m9c z6gBNCXwyv?W$2H#jhbv~aWstpR}gZarWXG>N_#s>pVHbwu$w-5>Vxc|Yjwef=IIam zus;OYJTpAlsFzGlzM_7u1heAAy_6A3V62kf=7UMdjZ*^ou}b6`tfs}yyVqE3uc+Oyz|*=%jSU@$hs-c+Q|XD%m~QM-eg&JIK0l=M%0qqvo(t-iCB zhVSgY3$zKWNN72*08^_VwynpxXna)XA7(Z?Ud@VI+=~k!v2(Yg9_wsq4tGh|;>F-Z zbE1}+j)N`ux7oS1e)^3kmG+DjViMnq{+&nPGu6Dp_6*oM8~o@ViH`m@NB@jC{7Z23 zSJLo38~^GskeEsq`H`@&;Pg})uCn{Ot#1x}zn^4m&L`sMJu=e}^7jyMTo-+g#Uw|+ zdn-*x9gX$WeDarWH8IiE@PlJsHoA-lyMLJD)g0U6UhY6*B1mqvBz9a|k4qsDnDel* zC zx>v~3v}jmk!22QRR^24Ty0Nsl1DCpv9m)+}{$6e%7Y%K8G?Khsfn;>irEber{{c;b z+Pv$#ghvqaxm5{V7wg>WQom5tO(^Y3-{!xb-0t*u{U0N@EjN9OFJEce>^nyZWKvGf zxsm2hA% z!=T3L9li@(?K$8dYJqk-9~Ac9t#)+o#QY%L?%Ww3Pu*VgfHr)Y*JU)>I+c=j+OPFs zYrxj2SY_+vzY~@z2G73Q)2tHPZ^J6l>8hCEj;1GJu7u^e82xjs{a$+uBclW^ycrdt z9i^eWT~jdoX>$xyM#g6VdjkGK2qMqa2ifk_FDAP~ixBLN4a2?q66$?0^@U`%J7(Rri@6*t~cc7*Oh(Q$VnW94ZF$uKpP-aE#PA;6O|NGsfj|a35NPwWa0~dETYn%511;>1m1Svd^ 
z6v<%D+jM%9~)=rcFYk+OD<{EB<@nj2(oKL~eMQDN3;B2QEf` zQZyTFLW6%v+=Zg1_o?j{x?c?-e6C$2;RY7aXeN3y=V zX=urCB$pFgVna&_w27hbW%$+{=q`;4_;$mQsH1LS891s08xuQ_ftbNp9R5)f@_|_b zNZ5pL`m~9xHti)=vZ;w_v{6YR8&MCvU!8zFn0kNtmDnnl(_Db~rz5y99a8gQZ;h%G z=uaMPoR(33xHEJoiF{l~BPK0$Ot79ygcKsAw>a*(6c0+^-bA{;;N0pwiVeOid6PDq zIf*H5WC0CNR04k&Y_mXoVS&(|Jx*)~sk^)#4D4pcE9jj11BLK-yYPw2nqz#P@}P5V9Ol@hq0XNqam98GjpLs__i z=9QTHnYUzmr1`IPqWaD@bN*}3?E!w(#7gjG?kmMQcA<&T*Vt{s&LMsl+JZ-I2nL{Y zk(pon9{ml~!uElL`vBVcp8=PIaVHib-WzL6kdv$-OibuS_#Vdd8~QUMP@%Kos0)j^ zG4zQ$2wlE*D!e-TYPZaE}YN+Sz^0Li^4F@z>5O;4#K9!^4YpHxNo|~HrveEm z(79N9co!K3ZHnG~a1~O_uC<5AnUfZYQb5U~QcC7DvY0OQ8}9hRFXLgN+<{9TUT86z zv7lpOKMOi-S2MV)rt!r5Gpy%bcb{%OPq&=fbnx#A6v54mSwnTEuXi-jRTa4AMZk_m zx`#t+^=P>Gm~wQD)296a18(jh`a*VCc1P0=LW5<~zJwH@0Oq(j8ZSc@A;(=D?jbok z)_#2|4|b zqVE=bYl~>4U=`t79dietVuMDkl3}CZ4eaqDJQ9!iio-wC9`7Q61P$Nxfjy?IHtm<- za`jbsg7tlTFk3V2094r>D2a#n>}Z^f?vk1(?<9k@kF1@#PJ0VkXp~yGWRV@dLGq;GSnxw&3tt_`FVF?;1JbIG7K=uOM8~po&tDo3g8_J58=H%nyH3B%y=pY zwJ_SyHi99yx_k9G+(PQ2K}ZjdaX@U^jnLq*|D_Ef`Oqq~7g5Ym3RVTyQ?jpTLo-Qq zqLDfc>Jpfj(D1HZd;cTkeFo;YIT!wdFh~crJWL1}r0X*suVVZ#4H{eg7l^6!0Cuvx zl_8Uq31L$*jM70Zfc8W8qHSTX$=>e*(=?EWFp#0A$;gv@F!`SNmoQDC61WVKje1mv zWXboLN$W!=q~})YxhHx_#VNm0mds!>=#s6YeOcx-nhe52)FXWn!yJl=;3nx+2AIc0 zfImfhpB_p>FFQv)f<^Jdr|$-0pePtmIM zHiQO!iD@qEyRRTm1n$UGn9=}thM0Lu`uqO(C@u&G<%l*G7iFmwGW)kF0pbCfP)UUi z9%m{V(MIDgw419zN?fjb$UOu(T!^MQ8lMK|3TACmjuV50&>!)ET4Jvaj6P05O3{IO zM`J%WG;Z>sjyt!y1Fj6$vBTK4fw>+XE_Ew;^~p*AR%Z_OmB$>7bc0k0VBfr%+><%j zFk+jEGOqNkzMV?og*;T~3Qn*qD)zcfUpbm8iJ-PO%N*vWA9vhy8!07+G#!oAyuf7h zE~*FbqTp~ajY7+qNa*^8!kdZ^Hl%F_S>ezfJJ^2!v+?dg89IP{O!`LOSly3t1ujo? 
zK|ZlqXt2V3<7gx!fCI12empNQ%h01X-ymgFy#I)+VPmX2{qjWrCRandJ#-y-)9a!0 zn551#nib?w(hvCNVZ0#$gicblLa@MXKuVvw(tU|>jz&L8G&ml4(d6YU-s_Jm>N!gA z`r|Gh#PsbD9X4Rev;|UxB6vQ6An-(!BvO9o; zKtY@0tr(Q{TAALR9lE^;oKKe zPvrM^PzJ7(_gR&KzJiZ5)Z>lMOQ(IJ3FH*-}6DGH~L!zeTZcBvsgqj^lfEuQCTt;@h+v zz#j#R>%pIhhfmdmhp>OYj)l824HFt>(AOV_dZ^9US9=;O8J5Z(?qCTEoPy4DrlR); z$0HMKgaq>4fmx7iFA5M|ggwJ;=u-VCoA3HevdqD}Q<*B}1_Cg?0XKmw@E=;Wjm(hH z4_5$T5_Lbuh6*{UP#?WeABx(%WoC%)d=qk51+_hODAUo3$~&K(*(~8co$pHgJvS#Z ze{aE(QB${^#@_`bybHh^@wqmhMib0-Ze@FHvcY_&pT_5UGedmmYgRs&hc&DVzis;b zX~Nk7Em`_O1p??$6y4tl^+48aeF_O}hgRWC{1tBCqH4pVfJVu`VDpt>qUY3c>;T0~Cq&Yg+9V zG$#`MpTXwMCU!1R0(J3Pl3)OoQ@%5JGHS0-?!bb0BfgqMS0I*wxRye@BG!SWnt}q3**IMxPzqo zMHo)3<;QBF<~_EfX|xT^QMz|`2hws$?sRwG6}Spfbyonf;@dGN=tQDM_yreYR4^kP zjZYz)_PY;hRydmIeU98fFE?HD5Vg7BOQa_`8uw8Pa3b6kB6bIE!{S0=z-q9WS$_?& z>9V)c?kN#$@(cwaMqdUdPX(*}v2R@I@A+^h0T=4>4yswrwXr_p4vuzH2aSV>OY7lE zefL(6wlUtk0Qq)fLj?xzKv(KWM(6_>2pELXiQ1odbZ8aDTHU6laoetA$Iz~Sg)7k0 zjY?eD_oJu_)hOVfy`+8^CO4uEbfS)QJT`MI9UY!#bU0FbBlY%7lhnq9T=}>Vsws``}hjin#jwzR^lM^$?tC)GFoN14fK_0@sDc!j5$UuI*;uCPf|o zLFYO{uPN@nc4o7P4e+TBpQ5PSFxQgYfq8L(3Gu2sv4?xqyaY$%6&QCVSZ9Zem#ZGh zd;`$tTEoplAr$qy9QAm^ew^}t*E2WR=Y2T0O7amUIJ^U4^W-+fq<@_y_{t$?fZjjK zgm>UpKgw2Hm2G=S@rc-P5VRfYx5~C%u8Yrd1$z6gQ+jW5sW9~i?Edwus9E(xq++M2 zoQaV=wpF>fz*c`}BEr{>>~#v-yXoacZq9>PG-6q_@i%i8PjkgdTYlc|2>3i<|OOT5+5NDjJN{Q$~> z)27#nfzESb4};g*bJe$kl2|KH*HB8#K(OX_LhoJ{P6@ zFP9^G8ysdAIE^DGIXHZaJNbQ56>lb2)HwkBHLSNc93Sf&-_XK3;B%?1ZgpKQdK_>- zanui8+xC$5yLxY7WjNP8cnh@M<;dBhsOL6(Ki0p7TzconsKvipX=umfXJvYgc7Sm( zeh5Y!^&!^c6Pxwqqgw3SBtY73*$?KbmnY`ZUeSfDaUjN%2C$QuNrf$=->d|GjTS<^ zG4c?B5DfPwbd)^P;FuNtv8R9LU1;2!$5}y6IYNVwr0b0_ANhPIEgMbXmI+P(U#8&; zc;eJ;fobtB^&NOP*xq8YC_{R=)D9)sXB~EY2|!BguI@_mCTx-%_xu*6CItsv1`Tip z$BlJ4Tt2p>O3|+h0WnY)suxG3*8S3=G{8(4;_6&D$(s7!R zA7-v4m2tg^KlG`0Cu0s`OfhWhl;n45*m7w9&GY+7>|DkCJ`$eaMCAE>NT1(lV}3gt zZvt7G<8PASs*; zH+{dO5&zp48?F;H<)KQq`kgy@4-_rdhQ$hmwIpnKRxDDj;4NBhk7kpJ^bLGH16@+u 
zdB$cZ!ysv{-n{}jSWKp&Z98T@JyBDL+tGTv%QjOn|7KVb%vBmrg@$vJ()ulI+VFoWu=}mwQim|6ePS~kc0-1nX-r!&rUx*l z@3_)WIvU?bc!11TIlk2`N^oARK2Eg6z&<4(;W1VMIA%LRp@iOOuotGDKHj8E(9>A0 z1ig2|3c(2gjZW2@{>GsPx_6Ni7bu7)l@1PA=T2USI|IM2;1jvd^YnQMJCxaQVyw@V z*{~6o=nK2govnrtPI0N<=fd;awu?;BXRbh>L|5+?h{knsPxs&tg(+&m?wU-|YFEPs zm?Emk{~9eIInJ-BF8|B4=T6AR-aQF3$#)hUjs*0WVYJ=q0$Y0y8y`2; zbrqRFMo#dLUAhX5?W%4wo4c%cj>gBaujMtb2oJ7?PAqTVh1Wbmq&4rjSo5YKmsq#a zJH)!e!v9VZK6M)BBfVQ)DtfQ}JJgI~#yUuRLn$&44S?g86BpV?b(%XigFiZ*{x_YV z=@ay2^x2r82pplz>64VCJh)dVUohE7lfssnzSGf2zbv7&ucH~yrLvO`D=>H4H1Zwx zD1$qolddy+VZm@+e6G+RI1euDZ%3je$6?rm`A*o6rj>{+5O1b3odfQ=DpTX%kf$q) zAvDM=_6$9UL?Oq_^ev9Y0HA)gNat@eG>Q1zqHNo*493uczn;wBITC+8L-d;SX6e=znM;dhcv;ZVmmKq!Ao5@k}f{S8T%bDr4a>Oh{-IPM`#Tw4c{c*Bic^ zEuE?3L=+1QwhcGa!O+bJU3Uiu6$SeoDHZq5cAqh zGy@BXylL7VHka@bl=OolFIvUT>o3+*UWh|FLt03!H*A>A{m{2-LZ0yU{PWP)kOSSL z{U^LMnn<__utgVD3J2ODGLlCIoKOTkw5MpG&DeL9ejw;-oO2(`YosSwZ zuFbtMt_7$|tEOHA$1Ldu!c;1cd-^LmK%;K#>`e~{pm#zrB#5p87)YIhfG;GTkZ$et zH<(NqW9`I7dGew^tS@xmmoK@|4=**G?Cn^D`@Imz$r8sRyeN(1wUhg?&Nvo5CxGKp z0{0VmbK(t)(%#=#FVgniM&tXBE!73e4d3Zq`IH(oM{(>a}II10>cx73VJ!_P$}N) zShfC}q?>=Y?wcfiyr7n?Uvxvgu4B7gsJda#53bfwH@E#CSa|Ovt!>@aO~D$W?r^Jb zWvZLA9j{KEW`|l&8^3r!zwDsvR70rSI6Nw2sGswmSH@81=Z$VykD&jq#R%$ROON0Y z_cK@>V`21mj3V)Zc0O?D zhh}@)nS@uHnDxzwfwmCEi}k*F{d3J$mrjmH$@|xuRQpvO>YsVLfP_T1Vm-H=!Y?xfIT1prW1;q5{x7TVjD0I8` z4dO^xw#$`3ZTwB3hW-hvJ`P$wo;%x{wTbk-(zIVNb*Ar2{IWg6`nko8Lm@1qN?;1K zw;!B;1XgfP*8?l@VG)&e2c{uR;dG~W%snSJI2pR~zN3-u)8qySbj+Qo?Svw@f(WR% z)B%a)8j+iXo4;k`nUiPa(p@Apg{=^IMY=;oo)Q)oI@4geh|3cH)E>oHMyS#MS2!X{ zT&ANxT&~W)hWo_+3CJ+8oex2OGFLmmVJMr22lCc&u7SD#;%*Z4UOk2=kXhn?kF2`i0j=6t&g%t)>#|WIs9we(}0_Mfx&B z^CiMC;)wSriI2`_!tz7ZY0O!tuwZ@=)_WdLzMrcPXD+;5h>7m!Z_};;pP5(;NZ0ne zVCIE09;l`5Laqv@$KIwTldUj!1g;^-g<#gaWRA}Bx$iM!tqBYxr&I&H3CgRle(S;3 z9Vfw72G}Bguu8db^o)3duOG$v5VCEvYrjMjydEQ$MZQzvj1v37)K^dsIT&#ErJL3Y z47_-EXbcL6d8OS8e1PM{jz!xrcB^OL4=P!TN~om_VjIgL?(vG<3zl@Td*NG``lbcK za?>fgzI%cA63|H^CP}KTolT1qyCA`svD4@UAXqZiQNhQm 
zSEKXsDAH&QpF)L(J*0`lM}7(do_ImXHz+(lw1NlD=d)#PI@R0svA+`m92m_7Hs?{q zdePb>sP0NE8d$BOVUE5U|-en--M&DFJsV zUZ4vt(1jN=Q;6vut7 zZXEVdM2+>Bj#TIr(fP3x8<}3Yc;zN|=eePUv#Hf_D@ zQmp-VjYGu8e;MvHV)0Uoa9h2AcGdE!UNf9(Z3ntY+0TXm;XWR7BW6F9y1j8G(#f-? z(~=p`DAea$*|~%Km^Y4#?%qWjgw;{2LAny4mf98j`p^_4Q$A4n<51E!!BK2Qz?ZeD zvmLtc{2sa=f~uJEWqZf>Ys~{$VeIH-BzWsMJDiRKfz0~RiQ=?#18MsRj3saMF|XUS zO+fmgYyju5qdWK@mVWe&PA_xuIuP9?p|eLaFXHggH-|1J^w#>oSmNNhj*i>u1RD32 zqMUPLd!ns{lz0chwzN%~#4Ynl=llNATt}e5mJ4&k2fukR%Nz8wU;poDD4-aI2Zd|l zkQcb?+=p-Y_53~$$wT;4W65E1!Z%7F{eJl+wA@bnNn5*klMQdQ@gA%_OYFefvl1w^ ztAix`qyNYCH3@yU>g!W4TJ`lMsMY^XU)Pbf`-%Fx@Hq4}9fRIQU&A`y<+O=TeKGkj zb>osDLgDnnZu%xiBi=xv$nO%u}C!6A;-PO`qW5bz0Y* zAJe_WKsT&RBi-qHaUGH-Cf?15ZGp-N`${VypCaPeK)v?3lCJr`bOqfbaog=1-?^@2 z>$U# z=fGzBUF`zhZ-X166;N9gguyNshbLqqaDovVx&pQ#EIzA=_&ZEQzhe-28%7)=hrdhn zAza}9DR~bjWAPu#d;4GiXY$_rx1THT$G?HRug7c(%Uej<9juBoR&Kn1yYS1!(8(@t zT%EI}Z2|(sfh+M^lhQsxIHS1D+IS`DJT23Kdsd(n{T?Uod{X?H-WYnSJu8xT<9BuLWtfT227D(zf*J0~J_k-aB!oNn1G~-RVh+plv z0b_|(nPv^H>|nXl9?C>BaD4M1EOLbJnLrq42#Shx|J1^N10r0M0VSer@K6VqGi@j3 z{*cIGO}~iLBp7p@o8t5cm*B{ToZAG`xlQ1%PWTITm7?58%rFx(IPo$)F^>|bn~Bpo zu`d$oI#LsD!GZ%%zE5ANmrfL~FT4OwjB_n8*8CiWqQtO&K>`A;5BK3_R7@j}{*%N(jmxfIwTc}Lz>WcdcFC4 zdaYZ3A79<6mBIG0a`C2zR7Skt4Sqr&;FO5%67iaV@e};}?jW3^%U`r(@{VgIXj{8a z#Huh7+NB&^L48m$N7KWoGxSHUIK!shhDt+U0}1{RxK5l-FO);si66d3qqRYDY#H%` zE%=#?F-H$Womy!xMWY&NNPT}6i7AnWbef(B#!ny8t0__aghurRiUq+noCY-#JHiw4 zjxdgM9@OH|1;O3M&K<@>yUwrpDVN}wT4K3hc!$~fxRId-R>7Pb`(w2hh%lA>S=pnBj;CNdr zLYKmuC-`a1kN!Rw1<&av+QqE9xWp}4$Hv4uP$`f(A&!kCW}0Ck>5?q?NKk@}i7hY$ z1gaIhF0_&W-UV8PZ~EZ9krw)fCy0O~%&T^S+(2ZXP1(upu+4m=7gYiWJ7xi-N`Z`E zOd~(FX=h?EDDzUvj67sxALx(g;QfhYo0Z_piOc94?b0`W=$X>!qx~6EntzSr1eFTt zx$)+sDBYMy?=sMfT+{K57ToIJnMCiP*^g+Cyh%EPIj~bp;VrjFb*G;aFK%)vvaehJ z1V&K}F{EwK|mdiY8x;kNH9r6UEIGfQR!{% z-j3gA-ithGD~Nq6FQ1m1`$ zAxRVU*Rz#C-$gDM>^F^*Hr{8eU+Cv-IJ~0^OnlC!;hYVFPv>k%*3a2A+^fWu6`ZrV zC(}9GuH>`1#@r)a-#QKYzzAiM1wFrWp`Hf;rs=KYCfRDF))}_WH1A| zx*;%V!y)Dlq~`FQfEe 
z%oM!jhATfTZLSK|FJ=;mRBfT7=?DfXbXgCaw2~icP8XVHKQd$LXdLPHTdq2joR1=G zJmEhFZ?h}unDvgv51^ecyhWmX6n(1E?Dj+R~EOLYH=l*%a+~0#$=!>)+<~<0nkDDfDC0(Xn-2YR<2>X(Ai*>mLG$`SmXXtBhI6$JKjOtzl!AY4#dU!JrnqP$^Bb4t$%_ zY&wmoLi`h1yWtsP{AC6WU*USz$&z1tp!0*r+ikdCX3R1^s>Vt7u|0?q)AUkW4|Y@R z--+oZ9b@B%BGiMJUD;xGWiRe=O>n`fhOfFU9&n95x1J4VVQzgd=GN;uJhyJp=hh9F zTM*;)%vBeHJH#=9eZTC)!%=NIt*4lFHd15`c|OYTh8j`N+n`>Gdaq8SYa929yoi0$ z^KlY}bUs1hPk7q=!A+4yk)h#uH^(9`8BE9xrq;Haq70bfjo?Cin|OeSl_40omu-^4 zzz;ZNq=tB=@x76Ig`pz`pgeWdF>DC$O@ullC)Qwk!AW9C>+?sd|8&zZmD|-#L-;lS z=`0FN;-{8VIblyDP4Y%PojOF0JkJ3kNR*zbr;2&?~7axPljO zj8zDp6qs2wOq$8>tEdVVBQ|r@l_*D7sc;LX?;#BXGzwEQv<1Pr%vA#^jekagQa6W? zdLdF>4GZWJ=bbTVF+fWyUX2!R08!v92<-p2J8(O0B;h4a7rn$8Onn|Tpd~E{m2gPy zZD@w>zf%Y6VdwBocOwU40l<21YL{yWBFc1WY0=lvk=58Hkw&T5Yf^>th^xT(Y7#^q zYX+-{a5_i|>{`%qGZu8M>@Xw`Vg}{JycH$L^kYWz2qAGgSHnw3o1^hpFx%$xp}s6~ zA-AYPXd62v_v05bA*Ow_enFee@fP~P*=b1`@dPnk!G_!&^cseI5Sk%Fj+YG__uP$6 zw)IF#B14|5dU1RFUv1e4{-~d1wthM2*e4F96RWFZ(YJ8c8cx7OeusyJC&a~`xv{vO z`NhH$!bIKz#Lh_P;D+yza~|Zr@H>(74t^H-?!xnFmeJ~ftf4{2%!7zCgTpV-t)f_~ zwOlB0j^owO;dZ#YKgFU2Z*#}P9R3--pV_Q_fwv>dwhn6QoD6^ClG~1yCLdA{sN2<# z>@9eo0loqLjZV*8JbFMdpFE1p#XSZrJW2e`TMv(d912|e=zBA2vws!L;S!mPv$6cV z+9Mg6;h~4eLmyr3dN$F~Xa6&gVZ5x$1~Yaa1ZSA9qj-0Ql|B%xuN z1HX1b(t)uu)JoSV@fImvVf`s;^n~oBklC=P(ZNTFmDBuzO=!mS2k_fQUHk!d1zZhR zV!5^G()Ba9=7i)I^P-T!k+e47$Ce~u?m@NPW+%4Z}|^ukI-@tl?+X%J!LbM zKmG{1jdP#PO2gm9Uc+~zITf)jVA~W-?Y$Q><56h7F+zrvNUATRTGPic=AU7=rh^Bz zk0+@m%qy-D0;v`PX<>7Kd9{<%#a_wWkU5%GlhH(2h5M^X0dwO2n3jf8#_yuY{oP57 zJ@=&E-`8l5-;Dj-y}EH|b0pu(79f;?B;Af7T{j+35Bz+Quo0mTzSsBp!NQo(8<4CK z?+h093_aHc_X<5CVR2kPl-g#$QQ(hJUFIq}$q(D-v7bY{3wZgHjB?X$-$7)?xHuZu zqNcTUmxuZ`=nv@l6i~DrngCdqXH)FB@jNOMfIg5a^IOAMhGK}Q{!=@H3LXCz0K>!v zj@C`uS^s`_?5qmWIg`}ex?0U>S9;Fau!VA=27%38AiPR|9k)P2DArnRQ4|o z2L$fdbPTj(H)H~Ra`0%~O#8Wk4>AqM1)pjwn9Q%D^SZoKo4J&_cDhph6@H0n1u(z$ zA7cv7|A$0+9H{})ty{63dOCYYP#3ar=7x>|t-h=DTpeUKP>a=DTdJpqG(??o@`8R;dLLz^G2lNC_fKZhwo=DLl6C% zDTSas%1&>n=Qnr-lW*%>jNG`GiqoFZEHs8Yru29-9(h#{-3n!YgM6aUbUfk8DJ72~ 
zmLUU{m6)PWoQ)1+in94c0OcTuUUjDzSrcS&G2s0#x^ud-xPB)lcfZHc(}xbAbxavs zXwNY`VHVgzJMg4)+!yPF&T%(>WbTf^##s!A-NM~uojV=|dG1$Xf_E8={SSbR8$jhI zzYcv2j#vs`hdTHX^7=Zo5s!S^mbZ$b3n5g#WV?nX*N-6azJ@z+3Qi(lyj{aRJJc1X zgL^6SWdx}fQPuebY2-yieGDiD>l$K%GK&$riXb&91eZXNniLZ452z(!01%WNiL2Mz z52TyHF+=G2ka!+M&!6Mj2zahVaq+ydbE-i+Y7*Cna6?_sus(!MnUOg}Gej>KI!}6@ zBt7HuGklonuGCHU)8ItkNR86dd5WI4k-CslRa_tuTkc4+^Sj@R2F%%Maf&{obSOE; z{ci#9Q#=kfkkD~(qp@GwWlePY?G@I{I+J}UHXT{O9hm^iy1LAPvC$i0Rv%Lx~;6p8I z_S1qzhKYPiZ1{VAgc3+dro|e^fGMY4le>#2l6bcV(IV-Z9DbJrcT@J8x8&dqCm3-W z3fd%T>)8dMEBjq9r=8zS0*rh~?$qz_BjZ_V_Rt{A2H}@U_u=Y6)P6v5F(Ml~2k1ki zzz3ve{Fh?ZN2~iu)J?s8jweMn^%mnE1hsQ>_{4_qYl(}br@!P-e^vwbF)Snz!YogWBcvD2Yywiq>mq(@emn#$f8g9Xt+* ztZ26ob6|2V{3NY^xU11R_3m#){`a9phPOC0Cf>F7LLA3`K5-OlZlC<+;7Gso zwcbV7Eb*%{^rI*TwSNZrEJk~cWGDV76GW~fBaZU`1Z>WjJC^TP5g&n=hhXy1p=|09 z-srNiwx@g=eEUa$pTme^dIP#M+mGrn9H{7l{wJXd}s2s-B81L(~cfk;T zwuajYq-Ica?X3_bgw~Dh$p#Av?GS$W}ywD2?ftVxs6>{Qk2GT-B zq*JkEu+gvPu)srfp-%IN=8~wLK&tDs<_1}s8x^6sAzV03`}%P-cL2OA-&NgYV=|>s$C;$?ImjMz%`DpwVq^N!d*EpLapCXDU8R~$587#u!P(d z`hU}Cp9Yxe6`Uj0c^cDCSgL;mRahlOze7*e*HZPBs9sZ1t2mUPUyK`gEoFet=;FZt z@8&YRNRn--db(V7eCQHX#CjUxr0Ho<2ZeRUD;G4^`GVmMo_42G4|@@xF$2l4sV zqu{d*tBkh$H{AO_b3!Wjqf4#JEw!#eNo(&P(`)rpt@KteGkrcWeYdE7BIPjq{zQ?V)T1BM-*W$;85KCb|cjo9-_uztf?H?@Hf&>b;#Pa(T6hqqp+X622 z*8@fXne~^EFCf=1?kA1IK?uiZ5En!^UJwy;qS~&9?C4DGDP~Rd%QWb?7L5G{;$hT% zDHtZsB?Aic-K4-)D8a_WdSs#(_i&*AN3Zm!8W;TVYuc!qO#AmF3T6a!5CWoaEoX)nOF?6hcK{$IoU_{0o{+&*05s0W7oF9zoGN~ zAfm+d*6S&44$lU@nt_{GBrKdD)KiZj6b#!wZ7iEhN&%%h^d4-S%uh3&4w2XdKQ5i0 zhZl$W@iHyU&lM&=Svo&~j4pc9v5!LO)j4B`?UcK0Ag#Vrsw z>e>n@3aC+j@6V~a-8~2<-{0$b{&?o4^u2X!Id$sPIj2sYI@Kc8)Ez0E1XdJ2UuB25>By2B>=yj9VA42-xLqNsRSoUmsiVH#;DGy?V0<+szm;3gqh7(R|J z;4NO&7T~zUvJeri{tF!-9@5+Ezog1UUVf8)SGg}`&E@w~e%0?w!NjJ4+$0=_7vR_g zCiV&+3*Aek&xHwSs`xH;Awx8hq`1Ww!b4OY68*9>S^}x*Q~quJ8$A+Ug~|QnB9&G( z$ZV`%;Rnk%a$_5tlREdenZdl6@A~KmGtsVXfoNVy3vaO`Nn*ugk2{(aL|gRzkfHc6 zq|lE=c9Tu%`tFC&>7>nzD)ZvhfONwk-};!ja3KuXs 
z_^c_O+G;|-G;fJpdm>$C44gCAskSfS*<+ZfJi)WCr!YLbyLQ!xa7zx++c*yJeut@i z33~)Fd1L2=@_s(SW28xIn{W9e*DW^fFJXoh(+GHIbpTzM9{KI!hr{ECDvkZdy*;1WmA=W|j?hQOW#ki{ zS&U%nsJ0TX`|%&Jw(G2eJ*gqdmj8u~ORdumZ_HNfL;>;s({{;3i5|1VmSi12UsY`6 zE?qd&>$2jnj3u)hFC?KmIViKh^%HD(Xg5)o_+N&4^FvDCpv^)>GexvC>GY0|+FYGz zM^XUU!0Y!e!$g9-?%UyZ7tb=q2k$^BgP-C<6TG1wrlQ7N-QG~3Sm@rg7<(CZmkSbV zJ;=Ec+SKMN%qfw+h$5z4<#4Z%O}FrJvjcZQ!b| zk^D1Z3OV#EdeN_pmHO=;e#h~v*yG7XM^^BN9)F~)YWmeDekaW2Cp^01V>L;ZnwcFPwabNE)J+<(ihEOo2|EQp#@GjR6;J64Vr@;Aw49XWNuN$=C znp!G}`|=a|Vy-~%uq4spL^=Cvb<)}+O{z;Mo9rdgG zb5gO3vFM)JD}XeL)UY$D5;3O zt%9@6{*5y1X4n&0VirLjjhSh)MIPB(_2X7jK_H*`OpVO44OfYJVlKE`(ubi`u4Z_K3VPIA1PmlQihf|L!gZQkXHNB{y&eJkr$#zzdk^2ZRvPZG6<0c>AwC zX5J2Ly3*ZYOMao`dV4B=phkztZ|wjUL||0&xnECa%P*u$RQ35ns=^@jo7qg%DgiRu zI0fXlz`Y~^lk2tePWr=nnJUdf?hj|Y3U+_azmG!>yn^Cre1C(h)%&IKJNMn_{ZRkz zj$-^ZCE69d(ba{w^gsCQTi1Lt;$PrS_N|Zht>3kiZ@rAxrz97(YJ~I`?eTAvj=ZJ; z^`?&m6~3nF#hSM9HSOnXIwREd5ZDd<(CsL#R3>uk_(Z*(Q8Ufn-=gEUED*Y}m;YL@ zt#XGWY{UAr1djWGqu@_blK!_Zf$x|YzQY+o4_`$QzVi|H;CbR5YPs%-9eq!CD#GWs zipyV69+!Vo$P=4b;J)q45B23Q59MD0+*rSRMso~3=tAH26~(e2;l@MT-|}UT3T1Dk ztXn&lLfP_RY~lje(z#@WQbx>*$7*P?mb-l|^W~0{tNXH-@4-nzSZ?5c^BC)g!y1oS zSKNI&PUyT<1c$U}a_?&W%TsUnx~+s`ox@ZRHw=78@i@W|x6BQrb}53UN1z|78nFj= zOISu%xv4zm?+KamdEfafg-egPPvB;Bej5+=&;a%=P;6YrG+4p>ojB#BWcD#54y}SE z`wNBxd(m3v<`#kSSWTl2ZWBASWL9*u>&4Ps1`%#HL8WD{_YCZa(hWFr3Lm31^T<6g#xwiR28_8 z0v=_jiF`cD=2RHUUYn-6>$cy)kMy~aN{k>BH5SdEhSzAXVzbe};&Z-tod0|3`wh@i z)P>J_!BIWX!HZ_%5p6uhQxtvaGn^MPG`R-OT7wdiukpbPjbgSU3+cwmFcCqg0uO&S#ZRY_IA7jG# z1eSi$w(3xB;A2GKb=R^A@V)p@^Twn5c;C{wzNI_2^S$^tWE11}nSaN{W)+L=>WlUB z#m*1K7L*jbsR&>+IXv6=0M4ZyfKvfDJU=DS{*7|UbGx2b^BlK*iwFNrr7tZ>Zzv4N z-bI)uNlN8Z zWz7WuJJtAo)o9b}Qq}m{7lmxmbl;ER>PMyT>xKBgxNiRWXL?@Gx+f;t(h*%Qvao9R zQ?)}F4!^%Uy9}Wlw$}-<%7~yIWrkI_!#`39Q|QIL&j15N91?pT=C=Elz6}t!xVmHV zQ59U?v|Zbz$rR-TmIX19{laZ!ta(7^FU3s^cX6T4mKn+H-xdRjd&SE9&6XCrPk7;? 
zR@JLNmxpw@Bp8?D0r@M!{Sx6kp~WP!%hg@o7%ff~x=U9}EuSq#RIv@pF{iW;FRT1r zCW4t7Fdgf9kL##9m|X^)!F4Hx^|Zy|Tjv(gD!pD{udCcFdu^G8Uo?hnU9f{LM>Xgj zUGfhVOMCc2Jc_pa3^~nPmxh>dH-g62i?{63vb$TnN?~5(Wl~q#SV*XfQwehc9_K+T z`8RC5RG+&8R@{0q7qyowLl?I_;BH;$iyQi&e);PqHruRxsXLbT{$v`)!{L-jFL?mj$Kv zI#w}LAM%RaL-cRc`#P*^7%J~*Qry?Eu(G*}QS!sq4j`hUHy}qeWG5m=PlS5ZhmFm> zGtT$vScbHYi7lH=Hey%u(fm!mqK95@EZEN)^6lJ2 z7bi$(dM!Nw%&t)1jzF9z!vMu$ONZ-BwM3kv6OATw*zW`Jc|-pE^VH8$;{6$~7Wc># z@7cU!xQI#noW~ms)vMfZt)v<&jJ~^Qv@JKCg)(O3OHM!6LDKy#$dT=*x!0Kxp{15< z@;A6={JWZJxjKKndzAP5AKYNTHza1@DJ$H*@M4kH?wS+@=EJ;65s8?)$Etq`%%4>}QLVl#8dAwm7q_MIWqbW<5ox*Ivm)NyHs?`c zW2Olb!kjUJ|3_95>uE@(dZIQlSmbx|04~b!;UDJZ_jez_SOuuIDg^j>kc1l+UcN)! z?prV_z8F4JnK#^_>hQC1osXCHZm=v^@GH#z#<4K+oMz%+h)hb&-Hjv9^O{DRdZBJE za8fc6&t=!@r|wYe$tw4UfKfqSh;j*cUd3HG9_Cu)qF|m)&X~Vg`#Od$W&&TudYwDU zyno=Tz?bAE5~A@{xU1-vfZByUf3S9x@j)&t;WD1*+B%hnL=(W+g2%mso&N}D5gy!t?<_3en?=%u$(I6Vd)t^>?3TyY_i_R z!13U^s|-i%Pf)k@w%B<$mT&pLd*Hs1c<{u~SBEHA)qOEpJKZY=Rb#Te?09$cEWwJUQ$(K1>}G*V&ipR} zm81f1@xjl%e&&wZM2#)?a;%L_Qmn!Jo4*%1+$pZf&8|Jhhr$2n0d|HA3iRaZq3;q?;1z_Ji2j6UMYy4ZZ z?e%yy)5iVrr=mxHYa2R(U8ZRCt-=R(F8s6#r?Twkoj}Nm#mzlai!hB!BJQ8RH<@`d zkMq=-mHEiZd-%~ObXP@|Xex+WN9JU0FI|JsIAs+&)B@1dv>(M7vLxpDp7Se7rT z?1OfeH09j@5^QN7L6_11zj-U@e##2`Dh_(i{wpQ7`kE8M6;TB059=a*xlJuwj1M4CPT@2T3y z&{Dc2zV)CJH#MH@fc!7V6Tg8?jHhh{yaWi}Jf6kE|LgJeE|kRc&~>4@{~G-x+b|9( zz$Iv@b*ptvL-yp+M`w3-v}}{FjrCKNuV3B#LbCq!N{#=b(e5q{2c-?uR1ox3smv`M`In2r zMs@?EN7EJ_R-Pcx>^n2r~pax5{IXh2`M_`B#HOCEP+-%oXA>{BEMZI*_n>^AhN-1szfTtjC zOV*&$<<3$xVyrDI+}zLf&b`3e^v%^>a*tYC=Pss3WotrhxZKiosgK*?CnU264Y{78 z`g7&K{YtZv?y~GbBa)wDyc!;DtKspdlaCAy`Z%KSafE$5D||$*$}=)9i9&_tV;CQ$ zSU#b}Os-e|jDG5V@+A$D8Udk(MGKWPJOrC)eT#2u{k9=-(Lo3Av+WI%}qWs z;QsCPA8{Ut-g_1MU(lw%bTG5V2k^A32miojS8Efqxc+-dqJ2A`Ezv$$5Aho0kx}Ax zi@MeD1IInAMlKE*B~HOTX^^c_c=<^+|NI;a$e?OvOXa$KpG(6VNbI?2P;Vvisn&io zqZ|0dYya??s!sGA zg^gtmF*9I%!!w@*zPM&k;+Lo>`q^4AzYfVK_+KIP@sv|Kq@0Sosdr8sk>NBsLLJwj zTuFVdR{Og1&-$Sh>xfdU3rcBF9!e!uY6O*>Q_VlWtkpN`58sIT*GLFFQmjX$5D$;Y 
ze>^m$pz=~LwR7l-(vl_KY-fhFYNc_=kBO(ax~H8g(Z#xjWPfR{K2@dK=8xiva|}2M`r7(cy zydU;=VbBps0ItA1AHiRj@TPz1w8n!L}xGS+^QF6nu=Az(Q#~A;9Z8AK-PZJL^YM zT7IcLcN7h{xjP+&V^SDTf=o$E>FklF~e}KZn676_`T zSGn6y7n^xsZVR^T{S_E$%$-!LAlb1kYKU&e8IDDHZSz6N%tr`B#9MYyZF1cjChb4i zv=w!6bL>vZ3?h^EpsTele85)6(Xz(+SDLp}#B}%0NIIDLF*4$3G8qsvU02V{n?g>E z8TH8wHz!iKt-ee1*2iSFo{!3ypNxK@-&TGbbLYS>hS^-=>Os5lL-H~?CqJ;c+`Wr_ z=sIAe!Ur94XVG3t3y;i@-fn+{iT|*`y$)9uS3eUfr(VO~$#$`Bd{MHwv}(29JDI%* zhsMK8B(Rj!4mS)`4c8=+hOj{4u+>tw+hXwhlJG0lfNcvR0WVHwk3rl#N8~=GXLr@Y z8&n-r8?*~94PM;*&nLtiT!Id8i*jRb9H%=}4+!PrO(J>chE{StLn6HmvpShM0jPcm z+D?GVe+c{Gf=<@_7I8VXXlR8B4q7<05}p7XA{#sm@rTL7|b|!jdwu3Xs6;;|eELg#NI@zN$CTuI&lNwlr+= z6aknVZ~`ZUyP$A_Osx)dGaiJpr1@djC}sng6=3Y{=#9cWvIV20;G>W!;z~Cio5tZ0 z9R0x5SM((q@lY81QB~1u3ez6_N)eWG;fwsh(@&|0?L;lChZD2P$Fg6S?r8ol z8(Gaf3U4x_#dyPCQ+O-A_p`9RYRv6hPZ43_U!V#sJ@SM^gnQej>u4rLLj~o%E1`_j zN~t6P8QM^#-$f18x`dlo8-9Kpl$_|VORtga_%eD@T6?~M}~5GOO=ZpV#@DcIne;abtZ z8KEF=z0c+zbfQSMi4fgDA>6;1=BvZOtZwaT+aR%x`JH>5nlvx4lOCkPRA}9Vpxn>BDaFH zG(D~%5Et2oDm69>R06-LZ2~#jh0DtxuOM4vChy)mfz^4sG5Z~V5ydOK!h)kQP@odk zfSVfT_m;QdUfOb1GxRdhpR@^Y~6|e*g|usAjLGOoCn*~P3o^%7x3a< z$2!uTM@QgH>$HR~-YxL5x=Z;@-YxJ|^H|+2u#KB(1e!=+M{eVY;vY9!(aqd3fXVSx z8L2i`*$P{Hx#g#xuWs&!fs9Rd@!EP#sc|J!>fdVB2deILf=oTYWYDr8Uaz^W zmIym`2C(TC9;t21I6~ZA+@-m7m-xOm(8oFjF|`7%g|(o+k6t&>T+1906)ZE$B6^`; z;!L~E6UlW9q-1~pL9_P;E;0U9#;o0C>U{1rEq-*v%V}tJ=J?+{SgHdr-s!)I2Y)vu ze13#JjVw7J<|?rL#QoteY4Jff^?1#NaQ9qmL2!bj#k`fgz@#o~sNU*L13nXv_3E67(Z0wi8v4p)V;vK2qc?w3US z=`Rflbr*D7BNbOeHUN#dc6tet>X_FYOc%slQ}m@cKlbbWJPI?g+JwrLjp$ zjkZg3{)4UjQN-Qh@NOx9O{N0d(ir`vpX;uI7<9x|RB}{uz=PIABHevgC9&dOSdqjN zRXI`;U&H-w#cAv{Gbp1|^>4Nu6Wt4>K`;CZ_U+Ot#}%s_)$?mrvO$INGmneZ-b|!N zQA@b@@DO+IM)h#6CkCxyQ}y7`ZP$z?(wK;{Z>+f9J1?ka&>jLsKr(8L$G8EU)aD7*Z}qm=`WEE@}}aSO@Q)KI3u$f zh>a4R9{6ip{?(R?btdT)^6VkhD2u2G70gF!O~1~PYZuOwqN7mOFOmyOXXEd!UtF9> zU)~LY{l%q;^d9AU*f-IBnf#F;T=bG8BiG0nu#TusW_QDxo?0*3i|=6wMf^8;M9`yW zKBMCr_y1AIQ!Jdh@+)sv=DNxHVJa;(@OcKx#R3`7l?FbNnHhGbMZKtU4G1@@>hnGL 
z%3%0hAvY5#Y;@esExd6h3Y6_BVWBOER8JMYFo02qXb5dgw`OlLgQ~2VKr&NFKsTFT z{kgd@0pDKjTh|xmwBKUu`_OtLs6g`oinvY-Z*}YKYIsHno6>9QO_I*A{#EXeBA-C| zI=xElg`YN(l%4A77T(Zg^*vUzT)_O%q*j;`#aZCbPTG_ZQLZ{hRXi5jOZ$jAn4Zx8 z!;5k~6>@oi(>@?poU77WCD$Dx*G9d)v(h8Oa10zV?*>QF#yi_PFqUvNT%!Gzw zPqza_X!i@tAk+3{DGfk5MJ2K`3Y;NAQoOS_hXpM5Qg_A z&vB8Sy*qwYr~0f+)h|p;S_l=`F~aSMP$oO0__4S? zOD5_{l`Txwf0#%=roelBKJ2?BmY>MX#$M>gTUYXQVlP$}Cstx5mYdoJo4;|v_VFrYV)-lucXF)p=fjOr0Vz*=*M+UbGC>fOLuwr)gdcBGbDF57wjNnDL{k9%{2GNN0C8lSnIKjqH zal+}UjGJ#z?>D$!A^^z46LGIumK>j3PP#uCePBRxkD-xkU2PQ6`o#hMMv*~lVQUR7 zY_QR<(~|?b)g;pI1H6s}flv)?pT(FH%k9&|{ky7PEQ#kVFHdIsb;Tx-NN=Y^fYrEb z%FUI`o$ZHBC+X8Q;-q#gf8xK1)fflS-SZDV{~>4{#yAbM=mZE9{Q?b5{$)h`Ed?ga zIyL~rl|&Qmynp7Iw&EeJlgLmAJXy1dE!}d?K z?aYQ40_3NAGH@9y=nhkX8r&u*-2nJ*Pcz<_E z?DK(t17-B^j1c?$jLF>x&E5j+g%DP6UhSvn*=Q#*#rZS9f!4w?;O{X+3M+2YHv}FZ zZ9P@ch{Bvw7O!Sin#*A=9oc7KDGIbw*(MZ=mT8>_bd!KUFJ$M02SI$HZ;Qo^paAhdNuFI4&> zcYzJW6*a9Yi&j^q`8Y;->Yq^rs4KkOrOk2VfWxuUko+bn;O=XGuQLC12QU=&K>cHe!a8iGlPV^x!mYFw+&Fpk=?pyI-(rTV6k0K<`8$fX+K zlT^MG!6UrCmH<%1phZNfm9EndUH%5 zXqo>d2Y6SqYqK2S`-A+~0tfIKYhm{9#CHv(M({>lw{#k{lOz{`$X-2ZS813*LkX98 z{eq^4+wFVn-09y~SQ$qZ~lH3Q?4Qf3cLi;06tpktELlM}o?Bz(k#IPsY_faAS5xzH| zc8R>ZM0=9x`HYvTlBeliwBnKmf`DaDE788C63{1>TZMI7qf@*2$YNyQ&@xkCGV>DY z%htnEFa1PfS8`DrSJ&pPyi82@(%54;*M5u_@K8+A0JS3%R3vnItD3&aq+cTfTLc%A zR6&2qEVUpTwr>0cwjlA`0cbwX#v1sTkF)U+-Av5dVzOV-H)hr=<0xv3`+|RH?}AW~ z&~4(Q`>kmin-eZ?94-ihjuLA;G9NUr(T5Td^8h)c(@8aaXdncn+BULeg|_&g!fPlJ z*9kAGh;CX{%8DOx|=yxksYb(_OzI^Tp?6gn#Iy#!o*Mj{$?rVrJKggUo z-1o%S0d_exMBA+^$#1dCrHq|9(8wIMah)4`1lcz`NA}mkyz?9xO8u<^JEU(xKAyrGcQX3MET3PEqr4p!?`3PXi`mvN%7P@I>nUjTskLviA$ zZC-^NEHJQ#Q3!PD6VA$;`=CWv}d3HR-XU!eH590M#sVd0-rDf^3=p~VU^1tCFFXJ7NvkW z=q+L#XTuGmxk@V>F(}a27IZaQS)%8B4V4E;X^IgTV-N46R>loMvIhI?fk#bW z8K+mevo%W*=t|L=hH>h!LNPDk2o?96HNbTka{ELUwL%^%>TKPaZvrY;O}0Nu zpQ3U;7~fzO??aZ47I4pkP>)~In+Sq*r!DfLhvB_G9Up-i-lhrtaX-Qg(|Mx-`B0zn zi)3zU%`^aYD@E1xTrbfnF?AtlZH``gp>O=NIl zJy4IejaMKWI#y;*$kq1fWj#qYxV 
za@C)!J2k@(P1gfi^eeC?{Ts%@RaYz*;P=s1##ma(PX2=Y+LLmwn+ko&a2Dw4F>fg( zUx)Hzxmm2=5(A?<;(4KqZysaXB*V^1xLFQFP;Q!))7Vl-FZy!_XjRKs*{_xeL8FFD}xx^oK3GN4Jsc=Pe)3#egucj1P%46pCfx z@lV+qp1{btizK1cmZCJGad8audCglJ6gZe#FoQ;8{rXgHc$t|rMw94G?e#AtCLPkX zOBX3T)snEM#I|k54;L<%~0K4%Leea_p-vRH1(es-l8lEs38orT? zX>mL?l%g6fdn=5R22VbS{9J)3C`*TO;oyq0F3}x`Skq%nkDMB4c%@q-@?~65=BX^m zM^tZSzEMpw8Wwq+CB4!pnGa8JpCh{@46%2aknC)4OS_<*}Xj6)0kYivEc z%6&$Hr-VfM0p?s_oQJ?TaQR$V3J?6pU7aMl}Ip{tQjxF ztaFV3Ax3sHpTgSs*AN23zk(b>#xMM9KD?gFD=@SGLo~_?IkU5|9Lct}3JN@IJ`&6q3;pw_7xDave~$EguYbOkzPQ`{^9xG9hG(-2nYJFS z5_ifCoTX0}^GW8dT-}vGP3rrk;stX@Z734mXVYci4b9~KF@U27Ld*H~Ld z$B^H*w5S*}C*cmJlzW(wiw3-_3O6Ssb}WS;W;v|l+s9F_MTrOTKhdb|hB^~>3EJ|m z3&}n!Wr;;ju|It<4Y8Z@BFI;RCb!h7jciVLSiV(mSAjnRGK*`qv%~l|<^Q#Lc+Xbh zZ>W4|_+Aa+YYZzB={__ejM}7?!m~GdBHeo6>HCg;g4^i30)Z{YD**+a@BAZuA-HGV zanF&UY2*F~_4#t=7RxQ3qjJ4;GZ2~gw#=*CZ^-(!zTq0P$}N^E5VvEq5%Eyvw#3L8 z{YHX>JTu4><#_Q`gox#G_}MQ;kE$50j>h_;y{L;ve49!@lc9lJ*7rD90TDZgTwPQL zxhhNRvKQ60<{$yI%$+;f*J%{^IK27mTjYI;WU*KcDpopLJZwx|pm1Oz&3~9fe?4N_40sG?diD zXv4q~>DNA2LLfYi_=^m6G@sPCqq5h;wS6akz*_Gs**wwymWq_?n~C8I|1r#vN0 zU<$97Ueaf_YeG%mML#F68h%wk>IIrOSR|}`&zjlP!T{PXiP_yG26p@x48oAgH z!;aEzF=|Hh0eDH;A-)SawD)N&*iQ#l$&H|GeMQSdjoAZI1J3W#a<48l#SEW46Y+IgP4%GG z!)gvlOzlXO&A0gsO6@d(<|F>JyFniEXMM<-aOmcb2cT$C8o`j-V$yOxG-fM@bEYaS zx*MLweHu@6Tr1H3e?<>*0eFLXgp+L}1(8ih@aB$$pc=_kDP8*8qzumWxejPiG zTc@%Y;pFj^+jW2RsNRAZZFE~+Qf>FG=e2Cm+IFpl*i$76BL3-%MLo?ZliZoV)Co`o1e@;I{ zpas|7BQ&H*ozg)Wa1YVVB(k>BMpnZ*q_-c|wT8Gv`vw{_O`VbBc#7go$w}C}dVZmA znNOm__%)0Orn622#K^w9#YegH2oW5QD2$K|wyc}(v#WbVasTeqf#A3PX3Av7w7 zt+@uUsE+pMCI2_EcGl++J_IJB;M7rCBo5p=k|P?x2Qi5)_3~5ojxYaa;zU0QMWmMQ zqSW`4dV&$~I`^txexsU-MQH>#y%*$*1tne^bgg;>H~W)hFxcCE9zNQSLyu>$4>AMcX2K3X1Hx=0S=qy4ZYigWD<&`D^=pZi$07v=fu<%6(C-It{Eu z`>`Ni$CI&#E$BllA+Qem>}n66EZUo=uv?{uSkxB;;N3x|2CZPW28 z`ZYXzur@-El+ZAF`|V!zC8_g9-9#Y!(35z7q7&9yj)cCGMQqLU$C0iSb$M$Klx_5` zlVZ}B;ZkS0Tfr&Tl2eCg4$fy?U*3%Twnh*Cv13RF@RollrWc!EA^z9@?!4(C#yyJX zO}hg7e|g@t;(veMw0RHp;s4rs(-z3fjet7~AJT9>*522hXw9jF%&r0 
z8oJATH2ekqm-r1PQa7Rh-t6^X4!`RF=yJknz%oq#Ej?n&^>4kCnOBn8;T&9iHj!Rw z?7`-`tS7^L1Pc{L8+h{xX>IOq@tEAjBkcfEAa?jLvE?`*+Idp*UTv%=dsLIAZFMqx z!U%X%gZrTrqMQyOWL{3<4=q20f$yw9?X;K%BN_jwUm`lWj+hz?YP>K1kmx z)$U%EVUOv_yx!*gN5c6F+=g+W!Mx!u->prj9D$!5+Yj4Jz@B$@X2j|6kjr@laNO5m z8}9edKu|D{YKuyLH&uggmJK6@XYXs}O>?*$ZD0es?HN_Wvtz4?1JIJlA7J*-Hk}Rm zXEPTV`4clMEmWLykM0U|{$tDA)HX`h>?Xsp8_irwWiKa%qYpk#&6ngR2?=2;^Vr_f zO&y7kPX+?1BCqozrnT*G4ghM}c{yT{+L)l)A-GPn+VD+N=PGj#iw4(xQi|oMx0$q5 zx`6KtjJoI7Dv3$2sJ8^bL0K$4XZnDWJD)S{+mU}&ipER3Fv)LoUuot` z2Ha0pZv}XiPKI@B^?f|%deslpPu;JGV!X8qmm*B4jLx`--d<2HVq=ybz>TmnS)+P=8j6*5#8s>@16oEOsrD3$3p;A~Pb|~HRs1Th6QL6E-?4?TPHo3sO|%aqgOX7kd|#97OQ^34IbiOtm#HbE=y@%D@LZ>_ z%{9rEcm1t5?FyBuaejLz#NJ1|6#D~vqd$wApH9I#^V@nmKzf)))RjvRI;8P zeXTp3oMmst4_nw*Gz$x}+50+hS)wqIYlyp~(Hq7@*1=909&-H{!e)H*2_@9mXjGU& z`uBH_$|?gQ*{!jtPsb)Xk6*zjeFvN-MPE2g_4u1kh@rtI^%jYC^;HJh_jjb@@9<*% z)3#Yvbl*mJ_@Uw9k2a*8@$mbA2Y-KgKt+dWx!$8>6WFkAGzcu9B;HQ)8)0In{{bsk z)U>LB0%F2BO>omsm_U=ZfBL&yF+C||l(5V2qvi3%a05~L4Xc(HWEb-l_VC3GLp>7h z4}b;tOfS$geItCFZ1}ieG>ZT{wjxicFRwqx^nP^#FCD-d@UoOgwX38};$@UJ!;5&C zru^(e*FxVL;bq}Q)G`!mxuV7j7+$u9V6O1qD)mod8UPFAuz&iyN$Z4{wBM`tWOIa2 zQ)Q@OQ_uUk$R{Fr;Sl}IYKT$u7ole64xr{Pp5SXRY)H)0$%A5wbmh2``9H0I4XUh7dZ z+v>NOG*~F9Hk4EgN|p2|F(5ojs{MR|S{@ZjZr&c0+;1oe^iYDA`-ILZYCx#zJdds_ z;73Ra`01if(z22^i64Wu6MmMfdhpZmpYX$_EWq)gcm68Dk5dIP$*?#X%$aZSbK<8S zKdY^Nn@xtFTEkDRpj1hZpXJKr@q;;4{098!0Is`qJMg0exP|#*`0?|}@Z-Dm4fqjK z0)FtvF#ITK6Zk=dO6BRVZW`R6Uq#$l_(F8Jbge1U8C*|Zw}y`z9S^OF=A&j{c&mzD z7tE(Hrm?c{{rDkP5t2E-A>giJ4W$viSZw(&y587>ZDJIrq~AG?f0El z7FgCxRu=udbsCP`X=U+TB7G871}nm5R~A1&uxxT=G3r08EMn)Gu(my?G2H%o%{wX`7Os9#y2VavlpM5RA>sTL||HXXe9qkOsDx4ah( zAm^hu`qc%iii3V3-Qz1P5?X46^~Ii+I124-WUV{o2~}0}5s?D;IQDZ@SiX~gncvmO;thdl_ zaCofs--2y4T9K_U8V%E{MYv*)Lq#xv^ z?Ozp%_7{1Oi%o4=%Z`azN7i<@t{m+n`VG4?p;`u3WjCC_Obxg~mWjzGW{rURV4Vz` z>Fnn$exJUkmjHNnYm~+mtHiCJs~~T(%!5Q+DhmhJ=QCk`slLqlYztUgVgg_w|S+Joi0ikFQ$Wt!vkoJv`8f_8*hX;T=I84dL12#l4;L6KCAGRHmab zY)LhYPO>GXttjRU0iOW=SeD|?H77$HNuUmKB;zfqV2?`7YMTN?iyh4nGvJ6sdV>JW 
z4Ls&2knMYW#O*4YB6Wv2lhK=OUwOS$aV8fw&u$!WHgP6XxpAZ2pcxxBthHJhj~Ub! z_oxl777y6kBWt@R+RwC<Sf4Mb&KhgEGQbcBzgRN@u!jng|&+--Vu;AA!lZ5g6Ocy zGzznnZ|;_A>%l7EWTIdVixuWX`xlTciLwkPbBCe*0B^E>ee3n#4Z^e(Loat!oxH6Uve$O=r1%zsYUpx``yT`xFNq ze&iNpg<_)(iO`W+%d%zJi9Y&4AKJee>51g&M0#vDp~*tJa)hiiq5LzRkF1bm4bimz z;FHWa1hrYwTfk1Th@E&jaiUfUrW+rRKC}Z!(jk3%2B`*>`MeK>DG7bAGMiplP?=es zX*r>KFeg}5Cf*^TKZ=Ahueck(1q!e8D1?4I{=9rgFZ|j%YFf~HxOz8WaGYl4$mSoq z@xSea!*QF!Ars7SIIe)hXNow4!)%PhDM6tr#o=i2iJ1_WXt{vH(VM}cPF&J)rpn0$ z9F8jBkmUZZ_Xi$_m^DmM8q-O)W;r4ps-BIt>M5Yj{b)-NIFY9?uT5~YZ;eMvv?u=r zexcZpl~GTYww^AqL{&xqE`Be?d#hRlQz%v2g!KtQWtic@{Yrft7Zj?=!UE_Y*L;TS znkhlQUn^~D1c?8~2u$^UuEy;kskvM)wVJKpixf4Yv z+K;j(!bb+Ax;P+HYg(-zfv)+|-%Wkrk4ckwfUN^dG<#*&m`n}Yxsi6z>>+L4B_`7D z=)oA&+Q^`1;4i5q&+*0@FnQpay9vF%xDFI<1DdZh@H|eNf9$TC)Ctr2x+zTe3z)Xj zF{ZD8?FCHlunA1_O=bhs6f2e6=7T?pcfaMDndi}`)|Xb=BcI8Y-F*Dw=zOXT<0QGl zSzX!hBwJRRY`xI-8wQn}VMr^avd8C{!8dfcM(Pt|OUR#yuIjnMc zjxc_iC8_N8gcOhWcS_@m_aLr#BK;IF4F{55@L@A9(&Umjw0MukOqsbcxTCw%p|-oF zx|b2FZBXl3HMOe<4nkDtgIOKchC{jWXFWo_5dChG^NB3-8 z?mU)OCFO2bxskTq@_gnI9nC+d`8@_ZxL;vKwQDH%-`c^u zM|&xI=ILl>;A_lSH>2HocK0%d@#1jj_@P?5+MQb~>jw7$iz#iwzhjH4Rqi}d`$nsn z^+SA~j*urw9#uw1wI)3uwNF*{4ar)72B+Hv`NNXA;4U{orFoHLbuHm#x?VF8=T?ElgR%%L=e75*!2s~FpT&)Z_%*@))g@h3h+(RZ0oa9 zi>~ksxtxeb51*8ov`v|@pR~wvzHcT}ei@(i$XSPu) z6f}Q|dj{2+Ant|v#RrhZw`hD>xUVl>pJV^ed_i2Y)pG zybT)xXkqhjI8sELf5YoPZP_mW?O4BRTR_Un=H2lu!zXD&b7B7v*RGWR__h5%NO$A? 
zKjr?~{vV0O{Xbsh=2Ow7d3ym~VWDgb^rL;i-Vl)X0jDs#Z43;rpU@qVaeOnB@i=C( z*%NZ?2lm}pI3-%MHiG&*>6iF0!-*i&!YZXNr;64yYriB1xmv3FeyBC%&bI$aq?xu3 zjC)#OpSKNx3s$$}ler_kpMVA7&RtSo=Kc*)6N^u7TZ#0i!f>veBcZDGm|6z8vulc0 z`OK73M++S5^7u9J_d%|ED#dK4Qq{&309x1dQi z_bRzOC*IGPhjHTUANZQ{*4seAKg&xz{*)8rRqkNM_lH+Y$t82gc_buI8=KcAvO z2p%XM#a-dR&qtwSwFRYYi>`;tpuF0TOy!2KAK4>1gzWB62GF~UOw;&edzIZrFSB*W z>xTtEwtUJob(?*#Ob<7si)T~u=4H3zq5d8T3ev~znNs4cuaF6Tw{!&G^4k{qT4D2Y zSj^WB3J=K~GfA{jl|L*bsiJ$0yED_=C$dCAURCaWvq`OTzfvQbDe>-QSwoy(oF2o(u%Coz1 z1f|#znB8^u0cLmK|AY~Xhw$*j#r@91;mC80(>2DT3PbXIkW|Vs{;Ux!yz2OTBjZK#aFmgH7_lI zk$D<@x{Odo3MM$`a}0Sej_KBzJ!omNez%&IR^c#NVp6)m2mvM!pBdT7sFW5I^!JBV ziCH`B0+`W>a2#_($(>eH>jjTIxZ}W3J7`Ki3p|{0El2xpRrw@CR4w1Xd2AX$%=2$j zH*`*p0Gh|@S+aYgaRTLx!|2wUliJG9s7*}Xmj2yp!sZATGzgNSxW4lripHFJB1o=4JBgcdhIbV(Eq`nqUvTL zBUJb|CDlF>*MC>6ua)vI?VltwpQ5F`C@l>gr*klEMp`$Yq~+Dr$khOi^@k+VTkzr* zj*Yn({uYf`*jc#dVqgDPG$SifLi#7;^9#%|`4}|Tf0Rgzl?C@=uBzI!)7DxB6W{qx z3Y=+)qc^*?LVHbW7zOTR6u7Ys?l$9xMFr0InL;)470}S=l4T!BK1705c z5M5J){#%iaI%06?0KbT82bxZ0AD8lzlJe7UH@!>w>A7}@Ox*LBTh)st{#<`H$Mz1g$1|UsUuLDhJ~2URMk4(-|FUYUf3f9N?XAq!KhLK! 
z2XQ&fd3eb)eYr%0an0}N^$flwb3-aq@x-<}2n0&=LCOsropqynAeU~094%p6Q^X?k z&liQTb%72heHWUK6Zo>T*1wq0op0!pT!m7<=pyWHbVjMei>YJZZ{DNtlKhfleaaP~ z!NluS>;yzOZ&!p&3w{^kHWa$yJ87GzI=mrO>a&+Lu~thx8rx7Lb8rzoVEWWBbKV-B z`DN0$ZK*yrwxKa|(90O|QMtWuGdqNQn|XWQ$1t* z(YmiTxW8h`is!O?!fqbMESzaGewBMngboL2C1;!|+^f#QYK5EprDsG{?s-$dIKQm5 z4JD4H1huV7Qrl0dEiV0f{IG3!Fj~*|>_oNqf6w!~Ux&)ds4VD5sm!{*YYUe|5UXFa z3u8RT7_!~=)sjdy>2ahYOzOb^9*U@snBVP;5yG|QGHvQDCa3<)IL7-@|Jo(q;N!YF zHaM>P7WHMzcfN+7T;1_0z4doT3eRy^4h&RktwfMk?yGEwD5B=`rGzFuy=CUXQL5sG zV?auG8U=6v0whNPN&DH_@X4M=C|p_d0Jx(B;F&xpvu&;Vp=gt}!;0XP#6!oNo_Ofo z(=fSeo!RkzkLeAG^xnwJ*3a=RaLKmCgsznWY)q>jKh!b35M$NJ1G%_EQE0XKw|Waz z4@lNFTbfOpalkN&U;XKMbjkC$lIPa&OqR(d$y18S*OesCjGq}qEnlstAx#LGrZgD` zXXs(hyLe`C-Uae#WE=#Jmd1Gdgzw2nB8;7fx-F1{I`#`2hH7|T7divHBYgZlAcCm^QrWZA9 zL1N-IdTCp5L~_oD-D5B>=8$V+_Rbo0y^*$55YfYituTOTxv|g3!|Ok3-c@e*|9(4*D@Er!VcZ5+a2Tckh>vz%eF>KXqpRm6sj>6vv3IV9$)qL;JFG24$280ci zw`Jzw_@V*JL}mkSs)vTW0*>=k?60UQI zO@io`BxNTvn88IzfN8YviP~pL+pYA0K-ZM5})V0XnlI;rky3X;d_6Jy`syj_IGsAmmP?eZYBJC9R zu>R9Td&`ai%0uK%g6wYMI}bj{Xw;twl-{0nOm^@J*-OB5r(rA%-!s82L=I?F)Dr;MTv1BuY`*Skd;3}kSad0y)*zjstVT@aKPp(^%+-Jvz z^oC?&FpRQ$s%?Im4Rw1j#Jh-=Q3-CUFbdn~OgwZ{6Q=<+@RM$!$BhF}vJw-H??Ioz ztL+(q*IkpjP!KL#>l`P7qT`0cmiW!Xd$Z3}Rm^o6zA8)ei86v^P`<*ocOupI5C{Cc}#mSpUr?eF$?~3*#cybfpXX#qU z(-e2N&>`ZvW-5H7GB=c2=Gk}jPd=JnpBu6vy@V;Tz-tIn4iXbzWCtvnE%JtIc#oFR%9Ld8F&js> z&#olRrWQZnlewuLyL=x-gYL{j`TpI)cP-GTaeg;!zIE9gTtENZ!{x>(^HwzJ>vw1B zeY?VY3`gIm-1Y2`corTFqV(qA=?^@wv)LHFYthz2POaH7qag{FGG8_KbkEU>zI^zp zzFa)oUGj_`nwi+!cIOP(Mg7nnKeh>koKre=6w`y`%Igm_DKojU8+S4w88W zEl=#_1O%JHmWsW=uq90jqO-trKZAP<0A=!W6Q0YIv-g>n>!O6|D^8pO|&bDIHNaSbt*H9EHuV zOtg)s-{F1%ZH~P7R%)M((dQenKX>`e)`?*0z74oqg5aGN1Tqm*!Uj zyNkIWd@S&-!2d{i&=?rr4w^ZE->FB_c-1WSGru#t6UJs+Z#+Bv8vFCiR-wJ7s2awk zgY1d)7dxrawDD)tyxSIzLKa@9G)*L)r|EgLf1a-AasGLx_?@2h7b~lYo)^rf4>*4C zA<^@)Csn{8&kS9v=jkd;!PZca$I0PQ$ttEl9@TyQDOokxb7tsr)iRTJrK`4(?vy?| zr0aV~U#;{Gr3Vo&JVg&O&~0zwUAQ%ywb4384VShg@leTV`g=^4a^fn`sKo1C?-doQ 
z`n{+Y&RfW0!O)azdsJD|uEf_yV5$wGb_6(|Y_o%8R`W^92T2YgN`kc<(Yd|P-W7L; z8u%u*Q*B*|!Sig-J&47rP!W+EI#+QG=SM7b;RV2}exl9z#>CQFwt<}=Gg;KIuOJfx z+KmU%=x8+tX=<3!q8sFj_Mj1ld@B2_*0C1^7YdDf`Os0PdmAv<5=(z-te+9I{3%Dl493>O&U&+qeb}2WNq_;Q5Iiyl-LV2UP($+c=#AJ!z@Y#@e z7nj-m#}Vs_*&td96_!$Zsfg6BP25ml8W2@|xMHKGUaF!gyN4s4s1(A_LViw~z8$H+ zD1d#@-tK`=behOWLwrPlQke?MN$7^@A%V0`!G5ga@uV=-2472l^Gwz)GAu^tB-%!z zd&eW3%yfG?Zr{sw%1>b^15+!G?E3`Pw~@UE=!+w(X}mE**HkySio9H)cOFE~5!@eT zJYin;b~o}QVHIuo1axLG(IjRyoHCt>|2G@i@}IU-M=boV!V%T@`Q`>C17&!<#ZSP* ztfyjCz>iRtjiacIT(KSyCMx=gA)3pNn8Ty!i_iZkClRC1VLu^%w%44r0P2b~_5+~s zq3PvdH#X7d9dTJJWW-I!Su)5XOjN)-Axg~05^Y~vLc=&$*tY9+zQIGM{Ld-r9U z2mXS=^c2f62Ie=<0`qjy$NGHB9R*WN2RVW%=IwuqP7bTVKDm%V4?N}{oR6?c;v<;MIz$Rx4QpJ+8J%woE43{_YO# zT50-Y%fnIAU8On0lFW|0=X(7{=!V!%Y0~0Vux1iRpsxs@8DL`qxpx)Y_}T03@027@ho$DV4Ur{#Qa$G znn06G=0^2M4wzw}CMJB&hxgC+~figkFIpzej8UMwhyd#(>NaCn`~`WJq)S| z`oBV|skVP}+tY2>0T|#~yH(*wyo6g7hUhV|KO;D?mPf3_7N|A(_%WwU?EmtD^ZVs@ z3+sFCSGcI3zgyv={^LJanTrm?t=F!4dM~rIZCI$Z@wX~gn7La$A!+$mif>#y(#op^=qt@z2Ds*e&AZ!u$=|IDpTiME^)GEb+7 z$=&CDqR^!hLtoGZ@RP0)ugmpgQ*)L3*<0XQ`#yT<|ElmDs>q%v|7}lVa+L1#X0i{y z4-2q9W73$bCMNY$O=ZR;cB7(B1Dr@7W)PK7;V9~l_ET!E?yG}+n}h$Qa!{5I*RY|g z12>X~iCH60X?wfOeF7oKnj}44L4PMsq2pS|h&NeouX5bv)i}hNa_Y}zgkC9xpKVWB zG@(*u(CW*1O)@s;b`~@^Ee4|YPC@sR zl7=@7TM@n& zM9bB2A&w2{yTBAJwt*;~vWicY^}}j|-eCOEJvv_SVK;n0QxT|}gmEx*oql;YJcAV} zHA0NA72{gZtxDvN$I15S?S@lAva+Zu{kdalEVdDPyWxB8X!xD>X`w_UDIL^2cMm=Y zSSGI9+YrF0iK801Ftfc;lO>qPB?z0MyRfDC*$p{)$0IWinF@@(mbJE;zT=fJN-LO; z>DM{Fo2wgtkjK_TVz0Tnmhd{=xUF^-Ktq3Emz}v*zx1Vd=!*<=dX{a3WPR43%(tQ+ zrKzY-)5&0Mw`xMyy6N;-^m3`0y?s72t~BNbex)4^TiPBWCY-&=ZwoOY)dJ@b1@|79 zGBt5GNL1toE~Jcm`(;0T(+voJ+uMU+E*?Tnu+agq!tg0f+&rzV+O{Y&Bd3mDN?5oP z#e2b>NL?F)f;X+4>Z>>!f1>sNoK3BA-LcSfdR5zsH)=LuCs%>Hu+cYW-cvnl(j8|c z=3)D(?Du?_sfxg}&8c8tce6gA62eWbZM~b%M4bFw$e?NE)}OYstOQF2rMa|9@M7(Z zwc;rBHXGg2Uu=2`HkqUi3eF~`;LLvr3N5gJi?rAXD1-1JM+ zh`zg1H`Dmr;!?48wTZ^ROXHgDcbhFd94*JJAoB#pL}Ts@AP@llzEe*~NE(q(`y9z1 
z5Hd|5pG_rwmpqJ36I|Eb*2k02>X!h4%r+M;MmUM4?xs$65zGhA(vvSDKcu&q#**1X zt+QVtwcLK&`^v?!-IfL+XKIy;kzB4{U8%L>a^XEBNS!$2MYvKIr509B8w66Y-ISV8 zJvqdCmFRF$hLwv6i?Hv&5D3pgglJm@+-0)+SId@++B`xuldHQSNZns)7YP>8gy=P0 zV-6%<(Paj!bl3q_o%)&>_syOpD{p+U6c1D$ro2GTkaPPPL6(L6x3H5!?_^=EM3w_=(FGA#?P4^0a)A zJW#pII^;mEW9%JrXTW+&DqakB#}Ayq?{m{?qnL1_51Ll8&qDVMQWEOH*AyP8>Lnr_r+ z99q-%Q|6;o1Ceg5qa2s3S<&~yK-+#21hducvUlZzKvEa-yO>>9ers(D%G?CL+nv(Y zI#LU1sx(oGK0xZsD2T8A@eng*OnkL3ks2fG%u079^OLRGyFFPAGKpfIP=CVH3$l9R z3nQ!b7idgYxDimc_+uqY00_H_OF-5sTu3tWqLJG0fog18j(x#rtaWd)LQv51syzMU3_7kNEU4q`PDP z{?&#~JC^c^XdeRT(dWwwj{T|3%lSRMW7xb@=B15M5KRJdjfKYB9kY|z=vUnm?Gmkw zg(z>P9s}mwUC#j#gyyMF2-_b@h=#pI2dHHld0j8r)k^b-4!*uGzEbN`vPn*u?xSqA z?iG@0IhlEr1COo`p3Rb{gMf^yd&`;!^KaMYUuaNw2Nt*f&m3`Th?{H7yp8f|BN>An zcfo*gTY|LwH7dRAS(;Y>$XwezmV5mMiy6Zl(x#vp`O_9 zu5g}f8@EgNd^?}rHsR&!Lf(1#hAVkmSgnU22V3Tt36iXc;)|89K!i%sBe%y@k zZj$y^y)d&klC*wNZ(>(c_OoZmMUTpW$at}&u(g~aI>$EE2#gwu4c`-wmFNHUj;^_Go zrt{+b4$3OQ=kD79;Igs?tUv@qqBo1ycBVh>PN@!SnM1xHQA%{)E>y$9ba9>oznfR( z-hCQ~vJw-X3Dmk10ISh!QyPT9dWA6^#&4=^C5{9WZ5_SJ-Jqg`8?4rBS>>`I$n7X- z#>u{#`BLXb!W4??nC>1xKzgx|yoZ=WZr~KEa9Q*#p3{21oac}|x&#Eo&)3s!|2!|5 zc_~t4OaF=igk?l!v};M`D@&fIQIN`yQ{{UXDi>3TPQ~aM*RL?X;WF$EDN9zmVt2c$|k_)!Y0YS|kdqo3pzo*w@6TeukD4{k1TC zVjK!>F%xxzR1>g>)x@N0OgkEitlI>B66veeB+xd+gLf%yN0)D$dX7?$Lomi1qR{4Z zw({Oon@doy!VpE1sNS=udnB7YC)@5+)z|C5HP+j+&IXZ8&&QyiP)t(?I*#wZ@FihB zZt0IyGS*&uMIU=PS)rA2fJ&LFSm z#MKh9&l1yhQubwE%EFc%{$FKkZY*^YX#b z-R9MKs*v_)G8f!iTEOr!T`w<%7cDB>ND(YtLo%SCr}z1ajh{sNHrjLhFQDaXL(Ar- z{E}~nXOy4lAXCEDrBGonxgFhTH{LzHTxzn=w8pna8ohiWTVAfN-&vksuAuDzy=wZb zfJXEl=~eMTAbk``x?Q28=tlulkBg|#_j^Yj|6hCW0^UY(^^K1%>r0$C20}zMdXx?>vavuO}36_+njsy}@w8 zk%%~g>-^oma92>!3i%eSjl@>R!6mrB+us(6`MX;~y}DCD>t2wcOKKz<499|93;gkz zf5Eaypsy!b;*Unbs>2r#`PFzL24NO1RF|p#NEqMx{0T?6ucyZmjwBphBB>TffD7zM z^hbmIqA%?5`TfD3V9bZQaMxEH3nJ0P0`Lm>7W9NV7DW3K-H~uC=`-ILIJLZMvn6k;v+J6C{mwq7%5B z^+L?Wnv^U<`6X%{Q$yiU!lX?an(Tp5-DtKr7b>80^iE$m!PcPdk=O|xkx0);ywgJA 
zcrcbweSPc3U+WC@^q^iM$m(rh6lM`%3mq*HN2=Q+F?4ZfC>#vT$|8sA<3ZWIY)*Wk zLsg@ZxTAP&cgWxE2*n+9&hDG*ILVRd4#nr66c3#pES{Tnm9dp!Uq?@ntQs|9Oe7qB zAIvOe?%0>Xk7J0$%dN%{={O4wb`&2i1ao*#Th!E2?{ca2p0?ViW>3B9S+QdI3U!&M zwY7G!r`3Tr_#?g15Go3V(VULHE=O-95Oj3%)+KzQp7>m;DXza*5JE)!nwlWN%CIo8 zU?5%HqSM+ut!izn+S<0Fsbw)**9lI2u^_BoPv5_&KM|~r#eDs0ZM>BatqfXfpiU3I zctUF_x+&oc`-6^1rx;<3)KGWaapK!CCm-`~*8Nhk;%s;Z`)}ODcVVF!i3ty!U^E$V0^@feLap~EEb6^bVU0*Lg9eR zwQyk|NOrYwA-|w@1{3hRzHk5vLBo468Wt{$1>+dEG%U#IVOl;%JV;K`kqGq$OB~Br zG%apwscm*}dq!=H)%p6mx)bE*`ogQjk+oq*7)($Uel8eIgd*4(Kb}T+5j;*jE}5)2 zHkllqn@rN1=`O5XYaRLsFf&(8i5FchOEC=oafyCDX^f_2MIY{u0X!ySX7(Z? zFyfu6QNS4?xIhtcbxoPt<@c)yr13-maa;ATTj%Qtxty*FRTcf9l}g(oMU6BI6zYsc zdR4zK8b|bp3DN-e)Qi%Lf?Jw;A>3zQ549-ai1#7p#p9hw2Kv#Zp+txqGuL$j-YOAv zw6(T6RtNiM5wE&bLGDrMvS^z0;+zO{y}s2!q#eElvX5vC`AIC%uToM`j8P`+M=~If zK^H{HeU_8%n4CV6B9u{A1bu=2;<>X#eOeyTi=pa=Gl?5kmbywc^7;(R?V~cSoCsVh za;)I~+NPIRvsE;MS8v1yRq(^O-$lB`^&3<%3KQx6)(T`RKV_L}M_;H1emV|YM9_q> zz{XOgs3+3jgk-7jn)CDZAfN{NDTy%&bPRnYL!At8F=Uy2!G&pjw=eGKMb;OJ!m-4H zoxxZTkrDaEzc~%Su<_C=!+WJt&Ye3UN9q4z~cLQljO2*xs4Iog`j!$PMbEl$vg zBW1^%vY{1dk7lxmVgi9t&U?mP|kD2iUvmFWxQC}>s4P!?<)D`w6kTp4qU30VI z>3-g&fpOOoSh!GR!;T(E7>vO$Ay)SK)~O-^m0}@A@^TR=KB7t}&XbXF zBs^bhiI!;|#n@4fwV^~e0(ANsQZj_6IE5{SR+}uQ2&j5i-9-Ph-F=xe41dS zCZ7W?s5=r6B6LI`hwzk00|My*B8cLYc?(C)EjSoB-tq4lA-zadkUaBjndw{hh1!`K z-?A;zRd}?t#Sx4n79kU$utZZ1n&Ih#easg|!iV&MJAQ)!M!rG}QxPz@Dh)KgHt1WO zb|5kUX$~zjUh2jpAAlJ?NGmcQGiewP{87L`u02i_YZq1NtOH_3K}OIP3yFkB;>d9O zXgaPj(NsmEHPp7%HZOFfh7rxnh%W5!clf)5{?#%#5Q$hR@b?6LF*S-visb@CG@g-Z zX%f?TJG5vT38y%OgKN{)L`@Ax99lcF8_69KS~{o1?R3UVG7?WMd6KNqp zMwEY3gy0q+?H`UuTNjDMm^OwPbuZQ~FxQ0oXE__3VW5K!Z3fM{2@h_VjC}Uv_bWq5lo!da<7$Iu|nv= z$_$)`464BZCQtI1Frgl+Tm6wfifia_nh3NXE!Te>CurukAfD(c={~9bWXJND4ojNu z39iGHc@xsKT6g%U$UHrHcPtW)a3rQuf}RkPsrczaeF-DiZgG+8gEOLrv3_H_U>|?i5pF3vw5KY3bhq86I!4O1`b$= zblt8>nSjz#o|xi7Ufp%4A1fv*mn_|o^mVs0l`LYmGoDhkJx%Yb6php+tzheN%N&|t zWDM=A*0(Q!6;FhKrUO$NwA@;%dRl!=YdiBq 
z&T?mkv(j1Rtj7G%<#M}9U1hFvSB0z6RpqL7JKZj~+g<7|bC0k;Cy0Os94mfWE*b`{_>JnG>X9!pjM1^}-DJokaIBs~Ih8(<<2Pmeq}mRvQ7 zu^B%E9`Js^;zGtsA0A77h94s72K*U3@^--8fO(IMCD&mCZW-XYc*Y^`QP3aB*j~Vk zQ13N9>8sY1AyBBuL9f& zxD#+UApMTlUckM8dA~+Ikh2PKHsC72Dv}=%zqHGC1I~k7(?-UUKLT6@_!i(^z#_=G z4I9v|B6+d(?e(HV~bb!V0q2B?w zVRynFz^cDO9_+2idw(q14!CL$>I3Zl0Q~^C9dH`9N919vQ~+?@hhxcMz}^4Ab^89v zSkjAU*YZAvod8xHl1%Od+>Y&-4E?_D&}6b0aMcVvqYO9=8~pbI?mar0tim&PTisYY z1YA{`Ozr^O3-}peV_7okz;5PlzT4geM4`5YQGWk3DUY$%99SS~xHGstn zlgTJx;|a;+Fks$E*mnas?c`)~8th;fU_D@QE%@W-UUmaE1MXdvOl}0+UWamk>*`UC zzBfSrY0%e_WO5zg?xo4(cEIA6WHJvMNw#{E$yI=jt&j_F)j6nV2I~DzGT9Bd{k&xI zCBSJHK)%CqeG}G10UIyDySxFneHU`f1m9be$<2U`*n_kKFc15dr(hReF`yH0_YWW+ zVB>HyxeJgEG@l=qvsW<7x+2Tr6La$h;Q(d>3lk5cJhpc z$Cl2f&%34HRf;{&SU2jMQlX9{o*j=?imfH&gf z08Yy*2sez+OyDqV_`zkwndNM6GjBoe4WBPz>~O6eP&04KUW{R?&mJK~7Z$Y?=J7hn z*?op>WX~4}POmkt;CTkn0 z8FS(j#kG03)<)MF3X3*d>kFrCvMnxjL?#qYt1T>ARG8-}Xw*(w8OYwkryIU=6QKQv zsDYQOrE;}}jt%zu!glM5355=D5)5h!c;C@0PiyXiPoQ$wqFilZ`v&_a8@)q`_@034 zL*RR^cHMw)qXFLp$|W?u=VbEb?Fau+TqnCgokg{UdEkut@RhdW-Ge%PfOb{1Ws7xD z;k3=vmQ83&@dl;7aKK{y$%Mk<`a%bqxkze-`g#w_?F8?`sC*sDgLH$k$-ddP#Y%OL zpza-ak0p=6In}+%>f^dE#@F|7Z3r>vnbb9CY=doSp~JdZC?Aq8E?~rG8e(7lLy(hb zA;4y<^;^VJLD62ldl$;Vd;7B=7pdsC6i-&7{W>lsg9f`v%VKPorF6R5Mv&KagSK zb7M)eZMK|rGR}6K)=x53Qv3AJpZ{MVZ^p-o^6#O13;N4`CHO;g${ncxGvpa(Bi}KsKiw9n-1c)b zZZ^NwP=Y$tL$+89{tv-^U&J}pBlQlQZL&*Stg-sCY;kD;s{#4#ka5>bW67gQ=SO4I zz!voVCd&_6Y)*(*+@P~&j7H(BwxRx?;W3~;Q@?Xt;$s)~Z2LM2$$`^QaBk|nw!i^W z2g=cji<0^gVnlZLSbHQ zK?D9T5pDG%Z`%yIZq*~`G zNO=oBF7i+&fZsW|3-#JRMKt0%M#z?amFsvhEMOj~Snn`u3&^MC!MDAH4Wx*U{HVeI znZS(#cPzC9W9>2;CtGNoY^FZnM7F&l$ChK^3{wMx54T^Thy$M?$o2edW62oK?dL!) 
zp|^TyYM$}UUzpVGCTY;N`P1>L0y6A7~r6|Mgf%LK+*FVJd_44`_tB2YO zd)Q>Ru7aAxAf;i#{rPUt*VArtqDM^LU|q|7Gx>Tu5{z}Yc8a{V$qHtmC!1*p{1M{S z@$eh=r;v1l@8uM;FSDh`czTR2E*!SoR@n@P5=Mj2$wS~d`;M{XHB|Taz!QZw-He%lMHql%Nnza8+#e6g~9Wxq5%r8P)z6IV_Q@L}9cPixb_Be5U zCgx9XPPqz&xf|{?c7f+xpnnA4?W&Md+8<=tjF<`gYa;uzsxWAbjkNs{ z52|w}>ZE7v&n3C$3LcxSEh4@}t-DdH2yYm>H1255G>ivAF2p(6=_cyu4HOHl0nh@2 z{Gh1-NRiZ#UmQRk6EXjK2IuxSF=^3!lg!PmeJHt@2aLr9-`4yTQb^FBi~5F8b}@}F z-Ve*o?5e)dYrWdgs$k*GD4qvix8q?X`U!KpD&)}jYdWs+ScJG{eZjC-kw4Iwm^Kmq z1oO!{oQpBLo{txjyA0R1;@ZC|H}M((uN(2q+gYRoZ>BBs_?aGCl;*x+sDq{gu1}h? z?!cV)U*$#PeieAVg~voMq;YZxyj0dWX-LI(K2DYt?y%Tq7>^TJml*dwSW9rcgI|us zcVQ=s3u~-rOeidZyMg6Xy(6f1Gic_b6YMWz@|;!gxbeBeW?MDMuqXLgB408a^>$m5 z$v(1;jiN1^$zMtvMOX;y(|ouRoG{qMk{#OY2wbu1j&3lWX4#)JcDH32)agF zGWi1OYPsNV5Z`Og8W(x2mt>)k`6AcAXBX9tbtZ}<_WQw;$3etAnk&^31kMh;mm^IU_sbxoJPzpm*!d zJa3})nh#NSIhExx+l%nKw7`%5)JYDY&jo*nBbme#o6LS5d=95u4vPAM26&5kzz>0D zCTJMho=@|SseXh{lRg-coosak^zVWG4x%s5pyzRKap91~`fa_nHWg4klt=`QN+u7X za+5O3k+0h>bc3`V^*BLa1^PCuAH9Hc`}_E^jPfil7ySJOJDE6ne3XEdi+IulGBz7) zI8^2j_yV2k?R+JQh_s`Y%XA}1yr92~=${t!vJT!BXeyPrlHENIl3~zKA-lUx(CcF! 
zw>$EJf6p>INJzSyh-MFdJ(;B6C$kS`)W_RIMIY7Y42a;QyLyzHhP`%0RPMTrax~U% zf|z1z2Tw+0vkakhFX*4Zx$t8=PBuc%h{du$rB7}L?Pl!f`&VN{H>!FMa(a;`CdkkH z1BSwLF+R>=z~kmy^@W&G{D6EiVhC~<BR!lzi$2)1nASrbjmss-7knu_K*!WK-Jrb+v=g9Xp)+o4+zzb0DMM&1u%1^q z4R!^)BkvgBcer<+aV4#2u+G%Cv%}GQ&1jj!1N9;B#OK}m($vboa3kG z_Y%K9 zBG5tK`^?plTTC$)5ZM}R3!gkpp02|BH7(TGf2i@~V}q}2WaUzbUN+m@DJe1aW22>j zPpRdo(Ki5z0mV1G%!XC@lF9q1zfZ>4$dW%@7tb|mspq^@Ogbd;CGq{ zzjVI6xNy*Beb7+Riwj7X=YrqB8jLfTfZb=pFD)NDh}AY^H)PmQV0$+QS1_tEy7e|9 z=VsfG54Yj|2j5l<`;c)IzAf!M7ZtQ=(T9(-nJ6=WG7nI^_=bsm`t}Z3YXk2dw!5&=OPW6@Uw>}=9&M~eJ(Np+}QSdzTGPIj`o@m08 z>sZd~(1uRp30&5Eov($E9h9LB-MDA8zzv?28c*&Yp?Mic^?KQ%SZ_AzHquEw?oI95 zI+i>R=OXT$&UHdxlSvCp-Z0&gH}8?KQ`Ua+gE9B&u+uXJ8ywOnaYuCUs6TPEQ= z&A*59pKqDT6vZ-CS*Ey^TP>DGd>ODPPg>Y^%UnHMc7nz?X_1dX?(!adS1Te6034azU@Vu z@?^f{W4m%kzGaJ2{6N0t>Kx^{e9KRAmFw~?Be@DFFUV7XeKd%;&aSF#vSrY$wqtEfW^C3|iO&7NMybC4grr z!&bJWWC}wE+@B;Qw zj`Cm$dp<|GtAyQ_*Zbjoc3Zyk?tJ!-d<8UDPEdgPX@T}{<@sXv{v_pz`Rvz) z%Kh`%p9_^;#q6n~Q^54TgPyaXwwDi4ez}1C>QJir?WxLLbJ+ujIdFdCH07qbY{T?# zs|H-kW`!1#P>(_#m)*wwT2G+i^mPQ%(bnDYskMPK$y>XjCe9 za#J;97g!Yhs?fu{H*OfVSbS~D9hT{rT9lV8?AL_dVx6_a%ATMypIK*JW{1jW?Zt4h z1M#eV*3$~RNtyLXKKo3W^==OPZO-JOe0F>8tf4&iN$#wj`RtXv!fW%{Kl2WMDxW=& zk3th!!TJg7^VjFC&t0FhURiHnZ(DC&Z&@!r73Nj*5YO>hhffZlA=)?8*(3 z*hPx+-AU|vr2+WADayqY*=0Ej7Rk2eD3?!UJ93Erw>iou1?=xR%5U@8=Q+x+^4T4^ zgnuBHIAg%z{PkSruLbP7yc6CoU~lCqpG{yN9R^Qwaz)UaYW} ztjc#4w#}vtDC`p(VLr0c|JM{jdW$WJlM5{hb~apXQJy)B{mP>J=rH!NW#+~i>~X7d z`(f-c+viq%`9+R${d9JIuJYq)?BU!>UhGpa6Blghwn}4`=Tc zD1Sec-8+#`PffhXa^DoTsZhCfCVQcf>iAQkvU4W;jGx~ynTWPeJ_P^Ynyg%X82f}{ z28)!(rn38rl%GyzKjY`G6)E8Jk0RyWDeUti<&RU?@IlH&Q`xT$Qm&ZF-s0zb4pR2c zWS1SRoOcAf{a`vDK3Ku-kzX9FTs(!1^7Bs*rdACeqC9#CyYmn_fAA2Z**FENcy@~N z;S~1z6q0|#RAT?hROM!nA4X!{HcfeU8hdOS=^!~xxqmvldb)DwboSD8y86y^<=Gi5 zIbC^T1{<28yf}kBGK27M&LF|xpK%bBb>-m-^z+c+%C5uNFAgWgZJMdvJ(JxylhpIt zOuG8#nbh>Z&m{5Do!V#OLl)%?E4$FD;Kzo6FQi>ZRmx`;b{QXLh>O!^CNunY(zmY#UH+wUa`&+Szxl%H{Y^QSlOT7@$|K&0UTY2u3A%SJnUK$bjeo$|hw{n&EMppCt6Q9iY@C#?$3pSRXuX=j&7Du){-N32co 
zA-0*nu(SVVQ+{G+SK1Y%irZ*_q3#cDmUnDhECbdT8YzyvSqmjKaqvqx&_Bjk{3A_79 z<(Vbyny)FZ*0U=d3xT=zs8&$^*a~Zp?O!*!#mCEQ+_Hw0y3bs@!!?kQzHC1!P!VMN&xZ;HQAo|6L%16uDA5S_O z6}){i728mwJg|seS3~D_)Eor-lQmTAxf)W`%QYm{o*HW3MYUAH&9x+MvX1cE>d_T% zdXz^R*ro;rTEC$|d8&au&_FUg-lV*+gpD*QP}c6IgTV6QCCcy;cFPh4J@eKQ1?szC zDOGhn{x4<2OG$)hmMVW+%3kB=SGFjxE@yYOkjQeFk@4lvK{||)BE}tZtXXp~fB&}a zPP^rNt1_gpKUrsRX)gJx7n3Xa@a4(f^Slk-8t+DMY$%Iy}-}^ ztPuILIheV>nZqyUg6{Y94=rlj1dsBgBK9|{g5~cQZL{!y#IEd^%zm%T#{c(nlt(AC zk8)?@|HgdfS4C{ogxUCiZGrN`B6jD*+4%qHB;~ou?B|7b_z&OC`Ce*S08xKqQFhH_ zH-q;P>>aCe=}fl8rkp<$(dSW+T|HsV)*}&0lubvnCnin7{|%Fs&yQe_79EEF*B-3A zkFtkcW&zkUn`WiG^lo52n2Ypmpm{$N#pj4aTuJPX%kKZ~)S<|q&6vC$mm%{=xIKff@S z$dkFpVy!oiE?!Le$365998W)H9b~%4jaT0q!RkNyXQ2{u=e{mOL!LzDjJ#Up=H)1{F7^ zBiGsQvHa=G-1GAOYI#e|eR>j%KlklINoRWE}H)O()^E$Y`^+u{=mVb zMD*a4I*B2=FwzAiq ze&DYuYkA^yc5OM)JX)@dp27ZBuH62uNq?+Re)6qJ1C`1jzA@Hpe$**5@q2+sR?1?&R;?6o{?-}f`b;|2! zvW@l1OJ}l?`eV@0n>@;UXXNf~I0iKLG%B~9k^4-ea{n2*ukrJTo0OYD{#u|LuGPnP4~ z<9VUO~JETb1ii$-S{vd46T?$F0iX zs@#X$luJ*^J#Urr0rlt^!~p&o)NiQHf4>(cgy;<>+L4^K@&d1CiopD`0V<$dd=W~ zl+P>m5S37)cgXKrD<>T(u4vz9WqmKp`d**)y(R1WH?zL4mfx|8&5u6${RsKJl{P2f zm?!`?gozWRar_v(TS$f_aefR<5cYJq{9Ys>mbm#bEUzOM>Y_lJe1dKOMB5PKPAJ25`YpvSr0c!RWv(M1Ej8OI3(e)ggYc0k#JPPJrc4GS-*r137rzwNa&TYUBak@0}>8OI3(e)ggYc0 zk#JPPJrXj%tY1QhgiZ--B=kzyE@4!{0SN~s9FlNY!W|NhNH{9t9trtF1Q4MpAmngJ z=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&p#t`z3Tp=#;QVLa&7F5=JE)kZ@4K zAqj^i+#%tJgrgGfk&t!D`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&t!C z`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&tg5fe1z2LJo(7P6=xy^h($+ zVN}8a2?r${l5kkU9TJX6I4a>D30X+CUqXk3P6=xy^h($+VN}8a2?r${l5kkU9TJX6 zI4a>D2^kh?=qQrVA)!;k8VS7;wo4e5a6rOA35O&cmT-rJBNC2ExJN>^TGlV2Lqey7 zH4=IyY?m-9;edpL5(;kV|D%Um(_akEECx6u)IT1`q(pwekAbQC=5NGHkIQ*b@O5k! 
zSiAlr&L_(2MO(y$ysYa-Yu5+Fh3xVRXJ!iC-Y`dnEl1 ziN9Imhc^m@oK~{m+XSB{q?~U^`Z0+gy))>|6q=w_*&l3WkCGk6k2+J6M z&JXZzi67m>(djTp&+E^oanem;%_`# zeAlMKO0MKDJN#{Nnaz=S_c;P_wZt!xcy0W?A^Aro{#i-C5&c2O4HDlh<^PpHvqNEr zykEX65VQxFj3Z=U`&Ibymc(o82FFSK81NQ$ID6$Vp?Fy>o0Ko~qxrK@Nk1KU zBm18XJjvtDT95d;#1CZQt0g{r{bD)rB+sradCuVYLzy<;l;W`SB_B4RVFbL4(@$aA z{8hVZCjY&h{$O_IpEM0ljBR?E-NdBoz^Jm(85?d3vexiRAbo6Wn z9rSw}ydUK{fi`d7DDj$~c1W>qk@!8i0>Sr@6Q2U&Gnr}gcg`ID4hEj|xjRcvY<1@N z(JVYYgGKU;GztE+FO-hEIQuy6iaw*UF>Fw zzgOZJ--wQN2r0ihS;|B2Ov6EY1L?R%;+@0ddrScKB=FSl+Io=I@2^PuXjZ?|^Zz6d zqaP2zLHil$Xp?yFBjWor0bvC;>h(9NzUk< z*i`R`^bcNn;U0+}{HZ`lx6bkoCjN)v`7hxgG(HCbPkh?X67tZVQaWlSe)K-^9TM;( zF7aM)qvHkspWy~1@t?uAW%c939ACt=^}eGd{cDm>c0VR1pJ*1Jxlj~u@9zYI$0eT* z;Hlo>Ec){${qQG(@W+z=`<(u8wkM1J50c(_uOOu7jp?Yuf)v&3l=i<)pxLPs?~ntR z_IT5Ap~P$Jh+xZ)+kq!JFUqR-DM?>r6&2DmGIZo4aP#(Njo%81FOuuM%LO^}NxVZ2 zEZLoG7w}Z?U{<{!N_yuqQ7=7{K*zy{iF%o2pb9iQR^lCx3WVmbwgFH4v-9zXCH>%! z1idB?{ay^U%ln$Z)3bYYL?oWwFTNiu@efLT_Bee-<8!T`*Yy06#1|C^2AZFliWGsy zwYFXkGJebjp5$ql`uv*2w{ZF*HuN*Wr&QuEFrmL&;TucKEI;r~iH~j< zbo6{H9f6sGUR#GhNub$3fhRp{@l>Lj?Fd1?<2r%izaN3q`y@X4slZ<)3t(@7uq&DP zXmXx-q`(hP6N1n)9CXAaUb{a4PsfiBHN31>x(PN73M74M^A4YESCz!K(<>Hn@E_g) zd;xfB*Y+&?{6M46iX*cflK<1hg)w@bl#V`!=-)g!@2KKf>MfP+dVQ9_ zYxhYI*7@;Ii7$Fje5Yp}>6m$xppUi)K5xo++M?m59nKNtY&Gy?Z+oRYPKm!v(mQ3` z*X;A`qh-4!pV^XrNa9BdMZNSa5*-c@P`gH@e)#V)0N(&S>2n}UpK(b)&Tngcq&;BR z@?$z0LVVU`@u}nZBBtGU`L?9rF6nn<(T{NaR5qHGC%g$f$)D|?^S&aUl-r$*xCAn`S_UG!`R9g8G>NZMhWK(o^%p8Z%L z==pIv{wnd>eaSmyy;J9kdUu>I5cDh!9reI7b`YC}Et&e`RN$$99WwvX^nA9&SIK-S zPZlaXmgr$ua(rq1zCq%PWuBvryIX)KJq%>o`41(&eT9&Ro*|;6;5cFbTeIx>7>RF} zc-dX-REh7-qR&BKB00775lznXK~M73OcnL|WWU@i@!5Irz&t6xj8mFDUjsbpO}lTZ z>FqH|KO*DcgQ8jNV~HP

bmr1^q>yIsgb&)U+_3w)6{&G^3oc%*9Mj)zU~IVB8h zC`D{k#?vxc@+9Dm__qRo6xpp5yiw8*n$X_}ypcR_nb3c1LO*eVu{<+P@biH;s&|PA zUNymAWrBYQcq9Eh13c+xcvvV#hIw|7(^#H5;Enj4Zh~KJ!sk0C^p~38Z#Tg|YJz{- z1pfyU{JSRj&w;-cEv|W181i${e#%|O`neGJB1xkMHv)f<9wE;kGvV{534Mv%xZX=m z@b8%5?MOt8?Cmh%jpUzeLf;C!QM(c*^iP`5|K5cDQ{au{IS2#a2tNmSqjpsRZzN~P zgwKEp{Y}90@$!J^KpCFdyTBXOn^$g(|GEkO8^9aMIRre_JMf^W_Y`SAubc3(Rv7E& z5a2nV#{`2q$)^T*Bl)i|!9Q%m|K}$7*MO&bN2UBS-Dh(vjpeK|!8ZeMBxlHkzR!gI z*CzNsneh401aH9v!APD{P4MTL;I{&AWY2e);2-9Eu&#BX=)gl|KR#>1e@3-2|6_nR zl7BJq+)jQd7(XKUUt>c5kO}^I;Eme#I}`eu3ytYrz#GZA1bCzI(qqEsToe37Cj6f= zp&v29{~mZu&Bh&{nb1!@-dO%}6a1;bQ#@*yd8`(XE(hL7KQ9Asr2jWe=ubR>F~>OR z89NntBmVus8}Yxy1i#G$zr%$8izf7cF~NUqf}eUK<&9IBH!II?0p3XdZ<^ryP4JhP z;BPg-KLosy-Tu~uemC$&`akF-WBk#;8}V;5p-%vBq=!u=^gl4cKWW0}1rz-1z#G}y z$0qzIoNV0Qc_#QK;El#(0C=PJMu9id+Z87Cw*YU{FFQ=|@0jpmHOBHBY=S?=1Yc=_ zUuuFs6L=%}*O=h%Ho-qSK4B&_8B^|BVUW)g<_jJSO;S_rILe&CJlXV3(HhY6p*n$Ul0LjSep#`RX3;Ol`m(%W}U@b{VU`MC-H zMc|F>fvKmvobkP`^}O;o9P>pZ>j5pVNq z#M`zL^ioolzdNuAZ>?%+RnbbifY(|Ff=#W7vZyZ{qZ9ZvlBcwANLGSsiVIhfbs@wR%@l2AP68k!hW@xr!HBHrYu3j7|Auhp}>wM@?=^`0}ATmN25x0AnaS9>v= z8pFG}V*$?^UF}O(q{>!6n}MESWgzGWWoIT?Ya-pwYP?#p8!tu+1T$z3ydijx+cNOM zYYt<5{zQ{^DH^w?O>M1GTN7?4-lvC`vWLCE5&q?d?x#ULNotS_8s>S*o&-3q*^Ev-K&5AWUG=h(^6FR5EvI!0t_oWLE7 ze`8$QRD?Dwzo<5yI&I>-rM22jo7LvpZgt2Hw)&cwJz|b#6r{E2q zcn`Sh>sv=XQ>re)dx$l2?nw8ALFqD-?hN(No12Nn_~N~poFDC&LRRmPdo(6~QhH0K8}0nSD}`8<;H!M`{d7LCnj zwSOv&Vftl{+KVkqh4WU!LA+Zxkbcu7BATlr8`&J9@YJzlX`8xYDU1)nG+jcCCStDA z)T=7#t6tIBhT(Q2x~1RyjdyZJ`|y_IpeNHIuSgFNZwpkXa}nfk%VCPT-u%B_iB#m` z=H-iOoACznz7DmCyf%!s1Q8S++nQ)b?huMl=Tkpd%C}Q9e++Lc@9XR=@iV+ZRlf9C zP4ue%o(L3;7u*ISYFAIB10Fbluob7_kGJj8i(b(+G`x%`FxtZ*)fbET`c=I5IM&ZP zWAs+@KwocfKZ@wSaf8etRn>+SwaYy4nDuxKdi|*-h~Idnao4(aY7}pIrFX!F68-8L zO0>q?j`>Sb>s;g@s1S_gD}7yPz0

^_oQ~A(G~DGfj=`31Qg-SHBGLlqX-(jW#Xe zS3GOvhT zC>X=(i%g&;q`7PN$c&?+c4IvQ{$F2l=;qAsnV9E*`uq<2L~LFM`B2a zorsXySMh$`P%vJF1WolN;0Qdty?!6w+|B!^?Ub~nhVPv{Fng8GgxFGhk<^8Om!@n$gT5)o%tSCTVnoEOQRza*_uR!2zBf zY=R$CQxP*WZqYavcAe=>JS%&skO;3Hk5gT~4Ky|~+ zR`#rDX?3fv($vHY?x6<`Ni9u(DBcB!L+_+O&9x!0B$V;q&h(ss?uIh?{<>zvfp!$Vp(y%Ls+NgwA1n#H-yi_ zpiYl$(LNORRrhdp>hL`P)a=RZLRYEwf^xMJYZM-y zjU8wz@5!)=GI~Wg9QgsSDtJUzC{n_d!85h~w3gasO?4TSWSfAioJU!teQJ*nFO5$K zH*RV&E-W(ayM)0=uNxQbTxI0HJ9{E)#oN=fdc*9+>G~01P)~ML)pf7Yo80x>_CKB~ zV@S{=F^ZlUf!kFP3S$U`0?6yS`g#$;G8FEr>I(I!aX+S0pzwdYFBA)MeHbPs6*M4I z>4+MQA|r9R^@t4SsqyFU3Hrh^mk)U0#17nuG8mU4Z$JvE^Pd@)HEoq4+od(GaGWL% z>6r);sSJAk=*sRSWDG8`;v(K}k6uaVIV6XF&6jCTmIRun43jaLyQGZXsL#@iYt*vz zoG5K$TBO-$7Mlv{6on2fm(8>~cOcU13x!j|2O?!=4_V8g|1YhoRrBcsEysC!S1j#i zDOWAlgtc|u%#PgWVi@%)%_NGYn@m^#vn@N*BDJ-bmh3#EW@rf&#?I= z;|v?@Ofzh;dW_Ca%{@3H!?{Nl<{lb746Lmwb5=<VY#lm~ja$F24 zxBk`^7TVgD)nN$JI$B07d1aL&Rx;m zsI;_6V}e^m4SivM0t?J5_)WTcDIU$fyi2_qLiW-W)4Rr_O(!v_NW~_sQu^q|4z1Qi zL_GLv6)cB)+9zl#C?&0@@0F6?^(HT8hhMF*7=mmtWx+ z*GE3w9goMz5ryI^HeUp>lB##x?r6VE_ObVXI@nba3v&IZRF58#8rF3a-GbhKH|?Wy z(ixR5L#>8i7c(`{2Zqx}x!WZ(@lCCz8O}Tqioa0ak0r=5T8|U=MD%H&Q`{Zyj754g zZV&67PC6od$#og5B=Gfsui8O~Ab~a62v+GcmTl5aGKdyrv~8_lwXk}P!yk3{uw2xG zn^7dpDVak)b5I&BFpU)kK6kLCtK&(PT(Qj1T4^P%wu&(cf6RSv{i814D?e9=q~M5JA-r|0s8@BF>IjH za$em;dSCOejHYV6XEJ|JnF-ewH9Sjw=%Q3knr#w%IgRIrUpn;SzNL#Nm((Fac_*_B zK`Vya6xdTo*V3*UF_o6uGgO(*WDExLzQSV}Et|&Z4s*hXSe$l%m`p5gS*dy&<=y$l z`W1ZS=&aIsL0R9gnJMH~elx3_6i4^5>F)AZo19an`rFI8Na#r$7nGhr|H8bC%+>9J zv%y&8k-d}RCq?udf z*l7_(RKpf;*e8vkre^(J2e;ZB@vj!!MfyE%EZlkf>qGv8idBrJM6kD&yl_{RbmjWZ z&K-T7=KGz+T+`bRc3Lh&b1hLm6v4)57=PAIX-_%#%Cv9Ntu{sC^r6A1Zy=;x)Sn2} z@;z6zaVaGh0|I?fL8rP)+$H6pqfP5-1JYYM!FRD$Oszw(O$6O=^!{}|wF8^M!U*~Z zxi!#X?V={tRZ@y(2e4Yxt3#DklP=0(9{%p~KH3}aLU^V81-DPn?nfxWhQsmu9fbnZ zTYN|_rtOgy)S8I)80iPMBCy#SYBaGH+^%-2w5tKDTG);a8>KCA?lM(PbjNVNRln;* z?d(m!#?RKvQsQ<(P0KsZ!lr4gfG29P9wim*?I+pPrg&>#2Spdw8|-Pi)@jg3VnA`jtBXXjX%01YQ##a2J+H0|BNg(m4h9xu 
zL`%9>>la*Tyz3Y7v7Aq79rohK>3IS298G6=JZrrEvoQUm#r{wicIZfeaxPnD+f6}fsFHJj>wfwk@kLo(Iv+A*!k>_Yam%-hO0UnBHlykYE}8pC$T zm0_CM)4T>)F__duJg)k=<74oZs3Jo_JZa0wY-l*37vj3bU!AnzE4A6^i|f1u@zGCU zv&8?`{TkNP%T;o2(tCIH(ZeRbjz}z_7Y{>PrztmSs;#M3Qx$iLNaIA0#rrY$=oLdQ zq!+lv#Y1VvXp`|P0v#ea;(yTBYwoFyM_kI&eDR^H?E)P)Tuq`$TuntrsDetVi^Q`~ zA`yTK)1G!n@97cw+m{%$;^vU?I7R%yZUk5fk`;vQI5I1Y=5(OtfjS+~rE9Mf$(jnM zFP}N!!|dM?6}biRC`0kA8ILaXMN`8>#GE>s8j?%a*$|Dk#wM?3LSnR~2oF4b@LO83 zV6+Eekw0qE+uJ)nk>eQ(H&dO;Ag!jK7ObTpuYvj2>5{CJfqlV8(=wmv@)*cLSww2U z==~EqT|ZS7kZAzRno7QF#ljfNjeIjkOfG=^n>-cnmyxT=uq5(Ebt~(2-iYiB z>-GHU18MHup||zn=>r-Al#YKnrO?i`U~Em$gYqe#4UUv=c@r>lZed8Y)`c=drmFUY zI`C8@Gy@r6f>2yAkO~^5ZW=ROJ|uTsKJ;-U(xXj+N=xx%XzHOEo*c`^=u$}J8EANQ z>N`z9bkPcG0)m1XjEQ#NNnxXXv7ovp6ieV~zRXdfE3AzM9#O|VN~0?V|0CmWX7ZJ3 z)n6d9>^06TznmQ-@P@j=bep@3=9$eQq(gk{Q0T%#u_^^1gdSSI!kVQ~1j5aF7?qfA zNop*mVSYec@ZkhY)gFBtX?D}pMvq=}p|W5MH`M9Lm^Pk8zzY8bWNbyKqf6NK z2{#h-t;T#_g)}RIoj|0;UfI@Am6F+m+k!Nv7m532De1Cy#p?S8%%E`f6kkuDbeY@? zs;8Tvi&hBG}@mUwyrX6?fY58(hmz;qf=@bAcQZ1Jsi%k6W=x!9E0@d7!6G)uNxaC}$v)F-LKzm?84G4kMaP*e@5C@| zA)zU|$aS*f`zr5BExrrCh`9)V1R^-;CL{mL%y$ohKZJqa>x0!PSc0$HIUK3B!itPFbaH z^oUc?2x2~Sz=TSA-$1Yz1HUJzvk_q-`Jir{fcX7@sNeT}J5 z))d#);J9IvbIjh7ouM~8Ue_B5)8r9%ddRg>?$iL|NwqiSv!oeoc9WXetM= z((O80uSk*Us7OI(`3Ao~3SCn&DoqN*Ez1Jx++Xr$l)mj^@gUQYJJGS)h#)5}x?HG$ z3|5XJZOUmfP5yEV2B=P2-HKq8)_}Ztjv2wMUke}&{836cmF7@5*wWYA5scB?)!!Y< z7&F3{Tlm9(SQtu;3vf&=0ItOCq^@uN*ho1XKKW0(*{qV>&A8NM+;JeNe}T1Jomcka zfmpoyVY!*ld&`>8UE=~6@8|=aGgbJaNQMHm zQl#%H8e}>ryh8XpVM|aA&4a}+UqGw_trE-P+D`s*6*nemV|w)$-MEaW-%y(x$b926zht~|8SD%|I+|D`lfOb|ffY&H85W}RrFhqdV; z9G)R^1JCIFacd<={M+>X@UJiKWMs*1k!pxrFe2fg>#nkTo`R6Kr*ui%NQXPy89$Uj z#x0!`3|daH;N8?#zF;NtBd$U5+;WgYiQX1!!s^rVaouSzN){)6VwC&(tKMg6& zkW8QTple^?SqnTqz@LHC_FRn%=;M5lw?7r_7Nu{JG|}%Ur8-&{Jm`ULjKeh;1S{h~ zmoA8$l9AifqD?%spOsu5C2QkaY{Xnl%fVqmx{LY?GCxM*WSsHHw6hhr@nz1LIo8wR z5+1%;OhbF%Dl}AbRniV+yUqt4x zIBjRja$@i@G(+eKW}Ba|Tc6G<{^~JI~T-{Ya!NW zMi^IBx>Dq`P$Sl;>6UML(SdIf2nW}yV#S)=2v6@ZrJ!(nDt3Qps|++BZs7aesXxAi 
zFA(=Y{FyTHy%?B8AouRYQbHn_*|phgMA|AC=3!!GmF{z+Z)p>k{yEu4+R(}pu~`sz<$9@K&X^op?z4`&7JW@ z#WhSAd{jW5MSBvggl0pm1g9lk5qT01`dJD6XcQ|!NDhHI67VH_tR&bie;ubg0B+hh zRO81|7T-C5pN^wjn4GKl;T*w3J10IQ23T^DY3QMRBta4Y!>j3t#~FS%sFy#SEq?6s zU-*AG%I4xDe>RhClP4Nd&57|Tk4GvEg-xUD*saUQ2G=XNVfK)^)@0}qoXt3*Rl28KqbF(Sr z892j>#8dKb$9og#(BNCLfF{3Ij|RIzJEki?gx~$ALxWCPp5!MvHTj8#j=`Plh+n;D#mIW#fb)s<&Sbw`RVv%m%kJ^5-|rKeza#ovM>Fi z$)Vvk;Y45F(JroNa6rnh(QChzs=+I=$`7p;lp56N%*tP%Reo5O*Py09v+}oPmDk=U zuE9vD>_EvNTb{eJ%4_cf*5D+qVhOYLcfTyJ^*u6ctA)y8y#R-+r%4_dy z)8JcVbh;zE{%3GaU!FZL>zA-yjwg+UW}ch+_x7&(Ocbn<9Yf=v4z0Y_uB(C5*T1z;6xcc@%6ALE%j(N(d?w?%zI^)q z!-CI-iz;=AJ`n6$|51b~upcMZ{7l`tI$L|{J Y53ODeNAA_A{EOca<>zG;$ok6u59JP$EdT%j diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so deleted file mode 100644 index dbfd3478e7e06650efbe50b2bd6de36f52cd3986..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 569736 zcmeF4349b)w*Mqs-sipdpVp@P z_pMXs?n~Xe)ww2n!q_e`F*jzZHuk3T_rH9z@z;)qk6Z5Z0;t>nl2365jaTE`4Z&FUo!uUhXHMB0NuzpQ#;q61*JZ-$*}E}eY6P<8%Kh}uJCri_oLt)|5;pIOgG1{xW#c@3LfurJT!=I z&pG~_8Mip*=%t>FK1R2tgO@ChS=GIpQ4n)h7h`bGSg=1f)-m)$&*qpr?;3oR5xX*W zNZm;_N5pvg+|b-@X+}(!Zds1FE~D!*3hEmM_cSt%yY?S(eN9hCW_8k~#!}-uSu_KChzh3*qw``d&ky*TUx#`Yz8fjHRTP!RK}G zz8>DoiQNF7H^O@*X*}m<_+H2KD)|08c;621JK((<-uU`GeBK4`HSqo;vAgNB9zNI7 z_j~B`&-97s-V5LFgLeb52jFuZydNRm2%nF^`*G46=o8Ovg6~hkdn3G?;k^mo_<9aL zH`Dj$;qwLhE|1E8TY$X;@0a2ID!jMC8(*)(XDht7!TT*@@4)BZ>HBv2d>20dLErJb z9rS%CeD0#}AHwHH@ctCuyNRJ~7{0fY{tQ09V8`${zV>MBOVE4ay-(v`X?j2C@8JDC zynlfAzv2BOybr)zylBko3ZLERJM!-I{RsFxlD_wV&z|t^MY=b99)14K*SCJ`J7#Xl zmf6AX4fj01{`RIhG2hIeQ(v09Hs`$L(aHZRjG6rE8}n}X{Da5##BF|gtZT%Xmk%6W zdBmyTWo2Yt@Z>4)uJ$`ij^CR(^xY5E?o6AW-S*kO@?(}YH7s9#!Q|(|@9bK&=E|c= zPaM_!b=H^@JeOYcd)HN`j;_9aXODlx4;gvmslJKLH}t&p?^Unw^u`X~vS!1QX?bf- zxu?hGn9UC@Jn9c$7u}ir_;ruUJ&qBN{r&#NlKNYG>-O!9+x*=% z>ubvj2993zmpcX|?EWMp|I#-cPWT4EXG) 
zmv+Tpo^bN}XKssIzOHR$;vZJTUO)fCfn(0zU48Z6J{>#lKuYeHkNm55!#y!w2Hkhu zl*Fl1CUu;X_x0&(HZA&c-P

optWg^|cRN^2Nhb?wGjwp`wAK)7$%;9Zb3GoULCU zbwhaQ^$VXoI$`#`4?nZv(W!qM@!%=%eEjU`uMWQMyk5@?d-c7_(7Cb350Bll=!~nD zJp99Vcb0!O_{y_p-S}+#3E$t}_mqkEzxl-tf1G|_)hnMKn3Hw?vD>G&&bZ+7W0R{N z?zt*8;mtwrkI&wE;%7h2xpZ910>`lrFYJHntoOffdUb2B>*s&_{ry|t`SZY2%2s4H zPYYdhan5m`i?X{fJm#v}^|gQNwdI=R3(uG_C$QjywFhQx{pPrY+0)*6ZtU(ypV{!m zsqcLJ!;ROz{`NI(^)F@gdgh(n{Ego=4j5E0<^DJSQul28rtFt*UDxs8zA1^1{PB!4 zW~Ak=c>m0+pa1Hn-j2_HikT5O^~>!sb$=al?8C2YK66n2+b%yY+Wtix%9X znuls{Kkdn9Hr#a4hOMVAyyccrtFC?h>({={==N6E`1qUWZd&_MTJ1Nd_nffe(VfH3 z*)U+x)m_^@e{=o^gTr?X+HuB%o4S5Guy@UuUp=^Q(6UK)jXdJ9NrR@OO?oaj@q!=P z(ms5-b5_XUG>D^yRRPbO~uf=?|!oN-+y0o$Ckd|x5SUB>UwVRvoi*Nv-|yz z?wPgZ!CB28&z>;nqtm@#+|zLKj+@qQUbA)0IXyNnoImB6rN=J3?b_EjopRYbsc(Lr zF>&FBbI%B#_tdWCMH`;Ja^V;0XZ4)$?2&DspZ(o~`;t%3d#+$d%enm_eAn1Z-YG1- z^{1}+i5IM2*!HJc#g!-in0{OB4}V?r^80_-()T{k8<{g=;_sMy!AkPTIX|RQ2s|{^R`-^?lj~+*ew8;+>DC zjeUGquXDFu5cANODS2xu@A~t4+2M5l_k=C;)&Oh1JKmZvyU?2NKh>JwJH?ux zeY!QDHO`u^pJB~s*~D}AKx_N?)2w-FqBZ{r^2{pVhNM{Y@qMlN72~aWZ-+JS2@~pm z7UgvUj44*}xhv0__rBbk|6#JV|5Y#-d8_!JVdMXG8(v^jUdKTG zTls%%l65?%Lp-hQ@43X@qXS7Xycr?@6PrXgL_9a>Sf6yl1#^hN0cbsI+H{0a%yEggo zb{}g$Z8rV?%V$~JzhYBQ`oOifRsQdW_*><3o!8p_`!lThl{Wq11Ls@Y-)A$9Y_Vyl zi)`9y!v)sDKnibFF!nO+R^@O?z2tQ@)FB#-lkl z{wtvUTBY~oTx&jTv^B4{>6d!i*!QrRZ}f$6&@w*I|6B3dd!6seVHuZDR zY1V$SPPFFzp#Qgu=Ng;#GTo-1e9&gz^`6c6cmdQCt9I(L>6doe_jic$jrQZ-f5dDj&YIspmJ_^s8HXTlZK1WNY3r)tYy)=_gm)lnbuMS=Hy$Z1U{_n|$`$q-&?m zxZH4=b-3g5t@-md^Xd|te&YihKQnFYTWr#Gp3OM^n9cmK#^(Ai&Zb@kZ0gmIW32Px zK$G)x#XPv1G-6vhlOj zCZ0#w^glP)^kc`_jK2^~!vSyob+Ju9_O?xV?XsCieUN3H-oZBQdX`N;dCCxLKe<6` z{yUp-^Pe{TfYZj$9GiS7vgrqw+wjE>>v&GKsm}qM{(ON=KQqmyUB7HI?q6Y3e=f4g z|A{vB|1q0+X-pUE^rk=~wW|Nmz&ydK{XPl#Z^a+7X%{s%<+a8p|2@zEtm1QwO+9%K z>c3Td&W7@`;`iIMyVVKS;jXjk#}?#R+dpnoPxji_A89lG{@bP;Pq3L!UNhA?KHu3~ z&z^5H&m3zre{QnzbBRqmeaj|Y-E7MDzV6oXKNBj0RXGl@@qf}3Yd<+Q{cw3-Yx@H* zj$4Jh-DX^Pc%-$TZ*1cEh)uZTZ0hH$rPlt>I?|f&vT2XqY{va>dRhA!2$^8j9(&ld zuNgM|{9kP9|8+LkBLSQK^UF!r@p;0gUZvUa56`sr6K@ml=Qic?M25AWr+QlRVi;$w 
z>eZ1p_39&Nk5=~Yr&#+>u&IYHz&voU(Z>kRgN_AWkWBHKkqmnWi~&Z%Y^dE9M9_=0LWPDzR@eE&=k0sti#%C1yndXta?S9Fn zH^bP{U-HI8>A>VYARzSLm@2us$Kn#mXXG8LWw=Ahe;DRJ$XkPwLo*bwMR5I#ydYEh z=}Gn@G9-7rDed2-fD4b4ygo_Vn|{7MR&wKP$)}M2OrPYf`(?TyTg2-)N>>5dn|_X= z{Eyf2f9PPD&&}gyL?Afv3W3DQBwq=yLh;9BFSnE&Yt6e5-+R3gc7bmiA5L=Us|_O|JAmfVk%r zS>LJ)qU+n@L6Xh5_+&o!l))HVr%PTuMMh*8#pjkQB=_`^;hObr2DCf$ z6W1W)f46jJoCoz2>;Hb*)dkf|yc*_6`?}uJ4^=zkB(iUvCFAKN`{4k2)I#;Ck1DQ~>p6tF@mapec$w7)&06Kq+e~{X> z8ULj-q@PdoqVwk}oAxyf`WXzj@pm#JFbx*3G-_Y1cSw%=o$&P!81Fm)vX@Zw;4RumAV^C3k!;<8wWQ8)MThJ|*6Gx{T-LRDb$XKk2{$4qhKq ze0Dozx(sT+XN#k7?MU&-xK_sVPU3@L{SeDB0~a*l1>J&p-9B5!v-JlV&-L;&WAH?Z zzgO}SiqBHYHzyg6AwOwU|J&}74$hXR8ONO}m?7AXGv~+B;!xpfDJG8S8;99ei->VJzuu#xVI#KkNAZ&E@WuqLT}2? z`a0>qm;5lknk>t)V~I?cSuXDsP`S`}Wah&@@}sxw`-`Q$Uf=#=ldiL1{w(GXKAAst z6n{76XC3u>Fx?Wb(n9G!?mL;@O5&fxr>JjBWCej9M7+|6NM1+%Y7gSCl}p~{km>ak zUk9I}9%}XTPeUc|AV21KmkJR>`)1sL2d@HZM_r&}N3QoLTZ<&u`;AkJ)&q+pulX8oU$O7YS9hkJvvegd73fxJn2X8f9^U{^0pe8E*e&i<7qyV zpv`AuGo*do8`8cH#q(|$M{u6j(O1UDOji#W&_%v&lks_z;<*Si1bH2`7qdTn2NHr@ z?*~@X_}f9_Z#Jdt5y}S#^`FJ$|5h50+Gsp_koebSvV4QL$#|Om$%8W~o?81&p?vUY z`7n^$my_C;^kx`;uxY(;kJDs^Yb0z7Y>zNZ`TVcA2Oy(KRd|2 zC*@ll^`9y~M^Juh?dT#nUF5UY9^1~ByyhhtpOfW>@h;~ z_*l$;1{iMby6nT#rT=ES9-K_|r*yFFZ`){GOC$f+LHCRC)Z5FIG=HmEBO^%D7h@Fp zY0<{Vi;MCz}C)L+4{C|<49Kg11`C0j|nD-6syud7)uxoMx)U&b>?*Fk1~eg=(? 
zZ98SgnEm9bmyrE-$s5W42I#nif0|!Gwu{&BB5AL;mod=LME(qve$49voIWA1yFq5~ zeDae`kXmgX(#@uQy+r+g+c`4-&2fL=Op50fP+{QZ5dXqneQGZSG_if3csh*> zjenByNuYXILG8U>YwztDGCz%XrM-C_eS#>j0Y)9ww>a{DH;rGOT4@j6j(Gi!t~VQ- zBscTt{6y(rZ@=OG2=gv+I#b@GM;|&vnf`F+qhO{*h1pJzg%*I>bWc(!?=_BAALUj9z+}C z@1zSIGhKb=%5kKQ+TBw!dBy~4FLm$AdOm>odo-RJcgpxQLj3Uc6xGkVB{KfUQu&@R zLxy|HR0+)M>%Sf)^QT^`=id*N`I({RCob=Z{9hscbSMAc=Sv>c+I2gXV;o&@*Na%f zK7S}6oM+b2b-LN_Et@6%IOzK5VX|)?B6%~-_aOP=^>0X*Xuqpu&JQA9O5<`Xjmyw{ z#H*K0Kd|Lung0e|kLQs6P1G)0w01FVFojF~6%5DX)j{>4QS0}Pv1yNekCW-Dq4g-3 zc8b@rHtlpOmE+U7(d9T15{~(hLGf{tpI#oB&k26nKbZCZ2I^0o)SsC7bDmB6{ew+= z%(iJSr%`)veL~hlb6%H6;nvgo0Js&e$yEP6H2*g_j&E4s95>4JJ}ccB<7u9$uTMT; z)4rauX)nJ!L;A0$`E3=&C$50Xg|4&A{^8 zME2{cJ~(E`2HlU+^+(|!)`#i(%Pg;xhsyZ0Y4tyw#_@U@$1}+P8fagbZ><&5|9Qk) zsJ_)FOWue0?KECzXxFn}4VLM3(0anjWPcXKU#xFXJQIj-bjWaxM(KZ|?6!=}7fBwc z&4)LhOX+$>X21>dG~>_E5HSAwx^6zrH)?3U0m%`s4@5s7XLQgy0GNr_Sv(4 z5U&@(SKi8nyBYLA}EGctSEh{V3nEZR+#4G!DjV`Oux}Nrq24F#FqakWr%k zjFfyB`M++YKVOI?YDfpUmgdB_WnY>mo% z3ugMgzDm;w(^5FcTUb2DJIhy63a*PvJ@d*egUE75+>XpC59KWg`ja4%5$a0LE%cT8 zD)30>qv>Ny13rth(nbdYrJc=PPDoy-U{XSXio)VFUsWhDyTVtL~ zkiTfGue5X~q)f(bsyu7tgp!#RzKR9ukXAZ8c~Y*|uc!=Q>4g?}=Q$1UwDS2SrK{Rh}+qp1-^@AhN>DByVZS%tCKv zN%`zjzc*A|;rA8!i!i+0kjq(F?5pq>VcF-13c(#@XJ;kKlpKboh^VYo$(XB&$+;rL zP@sHrMXm@1u?GUmb$XR&tb~VU&-}a6r0NtjD% zUTAJ1sHZOY779SM^;J}QT~2Qalzh$=+1c|vg;(Txv(c_(Hh7eo`?G_0Lhu#-V5zUr zpIuhw8DA(G#LrJhI{5JggZ}cOY;uwtN_0Zhiz@>2y`}#0*`Z>uzoH^g0f#ZJSO}>( z7x_x7{3DC}kWGOFs@z@Pyeg>QGOZKJT-g^+O7w+NWyWXcy1ejzR7E+=6R13~ETThj zT7>|C$XLq8nVp-2;T}5lU*e2n7TtKwmY`9zh5Uj%#e9rDIL4Jd395SXcxa~Z?PpiV zY^ZsGAodQvQfi#8tdh#0FH~4O-oGFl3-^j&R@JbE6hcQ3@@G#iD>KUcWre{7;0j73 zUDcDoY-lsQny@6K@D??*YLsA?;?m-PR1b%GPTkl!t%l99X{|0d`$D>yv%D^qiO^=2#F*e+Be-nU$6P ziqKdXbj#h3Be$8Ib|p_@i3*Vd_E4|c{yXFMoXCZXsLE3mXy!(S9mK$uaM|0 z^Ud*lXVI*XC}w|JW<`Z>!Bqb&C>&^&uFQ(rFzqW3WkLJR&dyC1`J%P%6ftOHj5^OZ zv50NjDz7$)NiZRSN}m^q%qz2T=D@OnW)3cANhRh6HVbhyy30?OBSt_@!r))%#_dN9 zkxelEl>{n9Uol2bL~$e_M{vaIbUZOHkN1ziDBwc`0*#;q>XY@b1pNvYGIW%39D+86 
z7f)cE>h)s!A?0u(gw@+J1yjK?DUw^5PkG)vs5-f+Ue9=#`G}UoGKY5r)JzYjG5xYG zOD0W?sE0ZFh$(4hWy$Pv971Gm%?%)>j3~>_o|+rT6V+JsPc-AUX`jD1?l2QhHfK12 zs*pD@%Uj_qpABuUECdbZN-cRmukx6ahNw<26ri3|Hj13FQ+byFx#1L3Xb4#AEySUs zQaLy~BbT_MV+Gq%$OJwW7u_Qc(2-dqUTGE0gesF$ls!{4jhwN$p_CBL=yNP4^3b)& zE?IWi+B6${!<-w&@l>xC3{?4|nKN@oVmOBv3>3HUmh6SvK>vqQeAvn5%$!h`Ec!Hc zsI(MPA?;Y|WLVJ9R5y{F1MQO9TPJRo4(3_8F9b^-X}+12a8+0(vPR^k`(?5O@b*A z&Xs&KOXj(t_x2W+`YJ2MkPFRd7VJO9xpI^hUiMX%SQ^R*xsuKK-FO&w{iQ4_mWOh! z1~iL6)ewdgf=KLg4^sQ1T|@P7dTnce?k}tTx&J-2|B%7_hP7W@K)7;Y8snQSm*ngg z??kU8X2q^_UwL^T1Z_YpD^|#Pf@srbxnVyM=|`X~{oEd3BoVqFtWa_mD1uI}Tu@f# zhlQ9zHiM9na+Oq0D~A~2oVwFNn1_ftf29}K-#Q&mE%sH8t?>IV@>P`JLMW|QvLq+N z!VawA;F=Nm?|gh@)bU(s`!Wz5QG6{ z1yptIB^z1vA&5w)Mbt>mQ0ttBGUJzQqB*3YRPLKUM^#3H7wjUtNLQUc74CqrL} z`D=ODr4F;^kQ7zjv-Oq6pEvI@M_ibDsBYr$ms*k_k!D*v#JoGD5S9ZfU||fFYIESy z0N&7x`J=lSSGruaHdBfzHS6CcOo#-zMR4f=9s3?oRqaoHU;N}9XdxiX7 ze|4c>EQnB(gE@MUcfQ{Tt1F=jng!+MU?%bdU@R8eG$>q)m1FYetST>=TP1E6(W<0Y zAF+6n%|!v(7w9?V`V~%iKJsyox>=Zuh>YZi#2AD$8CUFGiJk?Jzp#jvTN((Nv%*d% znFnYxFYYj6}54 zuJ~uK?WMp_A!;Novxx{&(M5Vsvr7KT0T{DZ#6C3yy*e((=LK@by&$o&f(=cq(NhIa zs(>}hk}_yVxX=N8+b_P@glUs2+8b|u(MM>ZaVD&umV2QY7Zww#@Kfs(EfH?&L>-+4 zlb)ysZ`OZjT6^z&%(s&iW$}j%}ZT=c`U9c;24IDr(uPl z3XcWME~LpK>M|V?Cmh%u6W&JBBwQvhGHR5<`gkdOLK$=$vZ@|*z_AQZcsg{5UGt<5=rZBFZWOU|PUdVS!_ zGw&U;YebPIvgc_P!FSFA#M-)o%j=z2X1%Z=!xNKO8BWxaf!Pja8@0IRH`^#2Iz$`Q z789b{Xy;>{+o-5-zkX~HQ>kC=M)dAM=vlq&##tr~LG-7>QhRY^nY;r42wSY=^lKJFr@Wm=j<9Xa@!*(v_AU z{HoJ&e}YV>Xb|QeC9&T~Ir9?~$I8{6LU?!R3g}a)m-c)&2%s`g77?xnB zvU4T!>tH$EN-czSd4FOWMsp%;)=Eqjvv+S*d1X})?_k2RJa(S2_UOyT)sjGkbmFOi zYo$VQS07SCHwIkk>IOh$F@$Le2wbAEIg<*>9M2k%VD~l`xliU#O zF0WR7@2q3pKBYOSP@By>hRR;GtyGDm>h;Tx|)x= z7YLJLbuUoWuyk3&peukC5brFp1S0cI4D9^=c%s)_qrs~bzKWd)yY`}LUMki+uvazD z1gho%fSO0xs^)d-lG2NOA)nkYHxK#>C+wjqs|tzfCLYP2mp!d?UN&T>%dt#!3b>*Hi+S^1INrgh*E~U1yOCKCBy76GnV85Z zaV>`@BqqTk670|s`+>!om2!PEdn#@Y(5I1;^cLny!uEu14TGSL3ZsCc%CY^O&}!AS(1j=ftmXDU)p)3U&$RR!95wJ}WD#*yE>Sme7?G 
zED=R6i&%3M1&KY^!ImYlLE}ySvye%PHz)Gxkxfl;!3@5`5@WRQe*JM6XzA^BGr)xd7s+@Y_!MF#E9>QZ4(! zLrgD!K@chxaLdN+t1K2BNlsaAC^_gWsmK`*-+qHsVb3m-1-s2)bcpPDhRq3ZH>WV< zf=xZZ;U*j`O~6Hb8K`8q=~fKiCgqOy!q(`Cfg;*VfgEQ`cmVD|!yYBEK}>A4DDvV_ z^N^6y+xSopK#jrXmMsceH=FJMrW9$K%)RK9)%Rfcz>5qgZ#wdPBO&6k1P@R$4rOGMA z&l$R4cLMBun+H!gm@|^-AXsEw>CsQuh-HlENXw-S5nI?9sBV0jSLK*4YcG@%djpDx ze$AHXs;rtl8}1r=qh_LVceR|v;hOeL+{O$m@N_Za!u^6?oS+rM<(t2>(hECxXg99y zNn(>>r<1%%w349}wJXuw`;Y@$fy@jzxRG@|bS35b=J|_EcfwT=W^C}>kQXMhur>=T z*mI0%R8=k@^fr2s$#A<8*IzQTCiv(1OJToy!azllIq`C(6joK>(q5jZZm^rssu)FN((p>YNL)`@1S?Ej zNw9DTH>|vJxzOGh+z2~#!eA6l@{X3zKFGThFow>A`#MQ*U#Cdk*MWK`Ly6?lOlg%K ztb!SicsB4iEb{2iG!iJkorNPkUOQM;515PNv0rdUC=%Ee5CS{|XX78%!Vc~I3K0u+ zx9Ko_|2o%V=yE0I`eymf{Q#mO+ve4;Ot?qZX@&~OhYgFCfveNHHIgPtD6h1G2A zpP=f%BKP4;62VMO62&RWc9X7KbM!3= zHEYv5X`2%}hsdtY&3+}i57Io#odZd@XR`#lb*~?Xi|E_!*n9u%G(`u>PXF1MqhUk2vVMPo+ zbg74f5AEaonFVc`2T}VU4y%uW4kH;Ns&YfC3(+%F$d>4Opbv`&JzG_7%c%dC#8%8i zp#G{+uEww8|J>o);i`XtzuW+rlGRT<8CD_}iA#O;u$CMM#k<*+Bcs0422 z;5<-l=C;nHB(IoM!wr^TRj3kI6(c)5wQdU6?C`iNsR}mN;7@13GaCL7?c+9Y$;nZV zR<&T|DtZ+JH4KOFk)3;?cT#RTdnAvqMar&2gch|gSe2rP3BO|pQ*ZIVa11Brd13QC z^eh-_%yPXUbjtDMO1NJ0YPZ}WwTa#=xxLxztE$eyrL$~!;Fccnr^H9@l+1%m5_zLY z^kTA?8k2|*gPKMYvdW=pu}c6r1@Z(Ny;(P@cN^pS8RvDX7)d$hP^Nfm0VnvbbXc5* zhe+V(L+}bp&cH>TfNMegMF#P+2Dp_~JU#@Q9?c&)z-kBi0DG#K4eq~EZ6)gNCe%Kv zscZ!lt~M=ZB(Ul|5dwmGB-bwC_ZlMW88Yj{>L?a=WNWrut$@oYyup&3D}GsJ0`A$x zUjX?flS4?Kx!=!ftprQO-2P*C9D2QC(*kYpg^Q068f(yJf zP-9O_k%Ml9){g9!4_%O0T62t!W^Smo5;y;AKY;_c+&evXT$l$fH3fDMM1CbkE)u(v zVQCEZW0i;G=8#nSF_B-`hhc2Q4GLHjgaR~|`mk$C#Jy_fL%LA4DK7dEzOOp!6m47@ z3BOl?tJOF<$sLN>;whh~ZhB;9`2sYGJpU7!TJWL1b4M99ZpmH|2S;-O@z=WPd_DHt zT<$>XJNIekv8%VaRbj`<%Yh88Uhe=ZUurpN`FNFCctu) zBi-!Nvc%r0arlENmQKVqMKX+1{-`Zo`rx9^HgR~tjz}BeTuu*+js7C|U5ZM};5uI@ zU|;LSscVrwcNKja%_MP~sO}A-l>s@LYr+x7x##(W38_ zr*(?CxL_&487JJ+p*3Y}5$YluetpCIxwpuAH@0gz{ep27Ht*6?*)YKl$}vfME}M2+ z+fBo0nkHw6!m&Id>gJf>a(lHE*RmyY(J1m$x=zMTz%o0q!qcxTILiZpV3v4nGutEb 
z3J=PM;c=-2pM;00EE|Bd3q`o25!Exo9M5X6RK%+s*ko0TjA<@;KO|yCvmf~oerId! z-I;MR%;@M^p|k|HBpoWosp_Fgu}+l;*?|w=dFgj*VDSbV;;$K1$X_wSc3BE;a|4hQ zE~ji^qJK1pnIsohYO|*)3PInA*DaM|w<9W<_2|fzh_e>?%pR;7h`4t0iHV~!N|GDk zX8X%w(^jE;awoFu4GKW~UQ>F+qxhkiWcG_;xo~|WAm0NQU2OP+Pn|bcMq-iZlCgj+ zDS=-Rg!_1Qv4O$hmqjSLOp|d5BdXv<%~Q|XkabdFu#ZL zf=2%wSae3fkAViOB0q8oCy08keqq85OJAk1qQ$#bSJZkau5-?mKXAnAE$a?tJ56R| z8?M?!E~g+u)RP|4QfvUmpS=rI;4cTb1C=583T;w4$Eygq8y!$nU$|Ao+pFFKKxOx` z#s^RDnK5A$^GkHb5eD;gs0}5M3je{jZo&ku(<3HX&ZJAgD1^o2V(lwtjy(j47^UbGlL_x44Tz7vrTYoY?cxQ6E;EsH4f*)wq{r zwqN?s?rf1L&y_#97a4X&UJRRaVJQJp4tov6qG5RmehUbesa&pUur-H{L%sk>Gq>@= z3YB>P*HVA0OcP5Jn6PBnB7(mS1&+-aihhmt6)*~=kHhf>FT%x&6Hdg>l#srv0sK3%C;_GS&Kgk?qq<}oWvBolLd>E*v!Ndq_T0jWS=6sH=0z4GhiKd z9uyaJAFk*#Imc)4zz5ND(i)em6}Q)DjsrIl~osbs56sck6@>M^H3+HLbS)o zjY(7xsujG7tDcS>gP_NvS+HBHJREyQPgnb zN8Jy~gkN5OmnV0E7w$CTZ#&Kl%)$L0us7EWi>hLR3p2Ug@$8u=*JxZgDlfv@eenA} z&{4{1z;D`*!|41!J;TIAyUbS(KjQ|g0X)Yb|0cslA8xvrhv7+I?4DsxCTa;hBK%)m z@bFX!Ts*3WKj7wfbit>~f|pg3e#J5BDwV1TFtKFDMCYoJZ1P1oYW%cydiXllOn3 zuwl^!9vFvqgik2sc*|hyNyDE$5DRj63c%`1zCWNXDm9Pbr?v#lJdav#Sc$&Q%8q-|=F@`YBEO zQ4o(<4}u371I^A+^Z_N+usu-RRiR~g7oXlk7vUB6jo}Z=6v6{=ID5_gA2fM|--Y|1 zGn1xY4A(Oda_oQFe7^{{IYU>W=FO?FbW~PS2u}s!AB+B1hm6ORMd(IG{)625yL=y@}BY z>y!(LXi95ehct~WlSP{zEDOyN5_{?+CFH?B!BXPGTQ4GC$3W4A{P;`2(60q5;MsOP zQC1FS`V9D+c>#H=W<+HG6)c7p7UQj8(ol@_tuj(QZ20x8N>naXg~W@7 zmzI=QRS&ODOC6q?j4(0cm4!5JLeA(h-oz0}#<($Kyh$URhDlt>>T6P>`Wk_WBT|ep z`T3cnbG)t*Nh8vvNq#;!Ngm-!j-^K_my27Dj6N29Vxp6qrZW+BR-%hbW60#G zIpcCBoei0D4t>Wz-x&k{brJu@7+vYJ8+>**jsSk7_=Yz4AM(!sb%pb~8nJLjjL`$$ z=&dJxLnhCY=N?62M0@IG^cHs9H^v36x~kaW(LSB*kec!CD~@9LGTb<_jt)(xwwJQv zq3Yk!wjaE^P%NZh{1<)Vrx`XX9uE4KT84KtX!Acw#N2P1x^MOP7;>G2ry z)s4c4&Lz_y#@Y-^=8YKx{KT@t|4a*h>_6ZClLMFo&%@ug#=nQ_fcG&aCB5-40Pg{Y zFFgJbJMKAc0sKYwKE}7|csJuIIu3uz+qiYr?eK@Z`xys_Jt}mZ(X&E^ajftkYxE~w zbwWuG18V&#pz#%B^fK1axu~nRJp$n!W1LC$#ryAozx3PJNHfnD$9o&&NH4zz{}OO7 zBagIC*dJxgBt2i)_cZ2`_KV{^jH^ipg+9_)NqT{>Kf<_+^qY^&fxp1q&A6ZRTw&kU zc${=-*e&pv?z1Vsf+X^HPYrK*EK75D93!ayM 
z_<6s^>xdhdMdyEzc&x@7iN|TYmAFIWEfk*wjXTNSsquQ^X&TQUo}uw(;%<$1yd=}> z(Rg!({5#wE8gHwVyjbG}vt_;nHD33i{DbJ#8t-^S@*0iDJs^3l#@il}yiVhB6wlQf zHy)Mt^%}2ll)ORXZI4UdsBs7JCXE+7CheOwUVp#jEgG+RLh@FPJ1HNwYrK)--==XV z1rJMZTpnGYn<@Webx!Gu(|G*`=_g*}9c1s&xIz9CG=BRk z=mz2C)c88$X&Qfqc!tJbCSIWNnsn*EPUFqQn>1cGKTr2&w zX}o~^?AQ2q@?*@<@{Rn&YTQG992z&sZ-T~K$&XXx8RW;U@dWbY(RefY$=A4(`~)@b zAwSg`ZzMl88h4oK@tomdo*4^yjbTGrJrhzw-K+?c=IG_U$60u$&xo| z-0(==qVWRaZ5nq>k@g)LuOS{=5SsnR}P;|0VsG;ZWd`(lmP6R*>FMxL~9(s zHjUR$llJ>Ho^g@nvA*c|*ARDT+;OqAH)clL$D8@0@%ppnxE!bPx@5@{H0~rnX&P_5 zRNB|-JW298jW-i-(zu)aH)}jd_AMH(p?J1wJV<`R8ZRI}9U2dkpV-3ad|OR^;xz6d zKk*tbAU{rxH=FsU@eJ~lp>YrS$=7(8{1j;1Nq&koo8dAC;4g6 zcpCX>)OZ~EY0|iZ{IqJkfc$LNxIuo}G#*EO_G`SB{1`>i<=BxZ>tU?M4f5mAcq92q z(0Cj9acaDS;_24-cJkxVcq{qI*LWNG32NM+c3!RVX7W>`@mBKFs&Qk9>>t`RUO>D< z<1M7)>AJzZ9&9Chx5n#BrT+qr8)cGLYrKGXtYD8h5xPFVJ{C#iLl` zaUC+;pvLR|C3&63n<<`+8gD&ShTEj^davXy8aEOpkEQD~v!29JJmWN8aJ#gR*Ldsi zBrn!@<8oP!H5#wKLGslauO?lu@%k2d9nqli4&sd(H(rwV%^I&G`xcEi65p=znw2u1 z9U5;Xy+;~5lglg2&7n>AiQyhYtuOq%%*8aL-Lt2J)+Uv66W!n2d$jrYayRY&VWI>1D28B5H%Tncg6?=iFSU)a}i6N1dDNh&ngd1$Az& z^Xc4Nx6^qe%YPov78`D^-{^kK^%$L->nJ)m*F|)0u5;+zT({7-Ne>wG_5@9NyVj?sAntLMBN&Fe_r-n?$pd83V= z78_o`+7S=eyzbHco7WjSH?I$L?qThb`#0xMoSUz)}X2V-q{JDKKi_ga_o~s#mGTv;%!!|sQ?gO9;?kCNL z7ufK8mJi%djSX+G;jK1&zYTZLdYB%6w+#>4@YOcF*@lO0cpU2ocs|s#{67!!+8ke6 zYet0y$x@%;T<+So|PAm zr@2mv^@{UG)=oKZVCA)wwZ~S*X^I9O4P!s!bD6z^mDfVX-HcZ;9%P(uwF>{M8TT;z zW*Z*1;c=|r<9^a?c!3SCwc+t}-vpz`{kPii{Wjb|>-M@Iw+#>4@YOcF*@lO0cpMw| zdAibUc!3SCwc!pnUUUC$8y-vROM3n|>AG9zMnGPF;lJGfYF3V!coMxD7~jEo6XW+X z-pu%ujJGm=JmYPQuYoYl{xi(@-##_xu!e=b&uTEw58I1p)@gVCTICrxCq$jh_ zXM7OjHH?45cmv~Z###^9AE+j89>_nawvi-)_U*%+DI;$HVwzjOR0c6XTxp zNd4iwfZ5;8?3^W~@_WhZCklAzI%IpU) z`|XVXiSahZkCL~PM5>HRp=c`$N+rjMX8Sle*1LI#Y z-pKfSj5jfU1WRu-IBYIfuQ2Q9Ij^O4Az{kw zoy-qyu@$~Nj3+DahEZ(84Z7clt}kZx)iykq@h&XiYHWBM<1x&>)`mM6Ka-_*wGB^T zd>gZ`x8Y94`!V|l8}4R&FSBp9;oBKcVD^oyU*f!t*(Wo5C#w&fhnamGv+rR123C&y 
z882e?PBuUJfblfOn;Fkw{C39k8DGnIknv|ZXY(JnBsKft(y@jo%1z<535PR2(tp2oPB@eIa4W!%m9y^MPpf12@p##b^P z&(?oRS-H3we}nOS#-}n~!1x%(s~LZW@p{Hxj5jd;KI2V{KgW18<0BYvVf<^xTN%HY z@$HP?!+0Cx+Zo@__)Cn(vHCWYaR=i+GM>SB57r+xF`mNgTNtOUx}r40jQ_;!3s}Fw z`F>{q0JD!}*B!lCe9{=doY~h5MxVAH{eA<7cq=I2n&;JdN>LjAt%e3Ud;GAj0YLN zgz;*|XEI*H_-w{&8Q;Ko9pk+jU(I+SF#bH_`HcUW@nXhLW<1FFw~SXazJl=@#y@Ag zmhtCV{OcHhmGRY#Z)Lom@mHAt2F4#`ypi#4j5jg<59Ys_@d9Sw!uTi5zLoL4jBjWB zQRb(O@t>G|2jg^$T^tRvcC?w<#|P#5z>&b1d&9vv8wQO8#^nx1>CDNv+&Lq88sjn~ z$uk(ou+5j7@l+Ne598*ZOFSo^ad=Qwy$TrbsVH3cF}|Pq4>Ar9w5nG%fWxS7~us?zEzKpMCoX;WZ8SlsJ8yG*D@kYjXvvf5vek`+ZX1qV+EsP(>cq`+d zG5^~cuV%cB@o|iY8INcFI~YHn@%@aSz_>9tQg2UUJeKi+jK?v4GUM@#?_u$DFy6#? z0^<(m$I1A8%s!3r!OT8`@ga=686V2Hhw;;x|9r;(#drbZr!hapjGxYUknwAnpK8X3 zG5Z?EH!=HK#?NH-b&Q|I_-e+_X1tzpC*uu_yBKd|JdyDx#y?{5Y-T)#*|#vB%6Kc| zBN^Y$_>rumY-2o)*@qcFhw%=^&t-f+FnZ`~v1Djqx$eK7;Wr#@&o(Gwxx0EaUl%k7K-m@o6kR#f*Q?c#!cN=BJwR z(agSv@e7%KE#u=EuVZ`(^Rt@q3CzBp@t&++H84(F`ovj{j89~Kni!wNcr)XZ8E;|S z!+0y>QyAaQ_*BN*7|&%q%y=H-9gI(7d_UtCF>X{w>OWtnjb%Ka*~c+{3FGmMU&_kG z!8x-}VEi)1os3`3cpBr=8P8yRGYi+v_zY(6VZ4Cxe8yKXKLw2Yn0+zhGZ_yuUdVVg z<3)_uFz#o(mhoAP*D*eu@zsnMGhWYl3F8fncV+2qWPA>@Z(@8nvu|enT*g}%znSq? 
z#!H$1?TnW(-o|*4@i60a8Sh{mex5|V_A^dDSt5^xBK1F{tl>T~<5i5uF+Pv+c*f^5 z?qGZY;|Yvk#kiC4g^Z^$zKHP*#=Eifx*5NQ*?SnD$I2z2@y8i2V7!L;DQ5g6W*=mH zF|)5`dtcX-_H0d#@iUbh4C=sEi67AjQ@_=?`Qm0#*L~-{lAUzSjKN>JdW|zjK?!R ziN(jk_}h#pFy6|zlkq#5|1`#b&v*vo-Ff|I`~~LU!}t)!^BI4d@dC#0V&N7u{sglR zGX7V_s~P_z^Han4-Hg{VzLxPi#?zVq)r|LI`B2aJI%eO%_&v;jBjbN&yovF@Fy74g zeT=s-p25OxWqdK?+Zn%(@ixXAnEx>2_cPwX_=Ak^XZ#_?jd_v!zmD-(#vf)pj`2qr zk7vA*aR=k4uyiFb{wT9|GJZC*Ph^e`UOm@uwJH&G_FKuV?%&7M}*jH!}N1#-CxliScI{ zZ)Uui@fOB6G2Y7fbBu3id{}V%!T5H@KVZC#@tur^8Q;Zt2jd?yzMt`PSUipDNc~^Hcr4>D zG9JhHN6ddb;~z8bV0<^@35`X=jQ@-A)r>#P{MR%76|-+({AOWuqlLPlFv9EoT(p`mKuXHz|*DBpz=v7MNRBd1TVx@7=+Sfi;=~$tCO7{?Y ziqg1&VPAWu(nksHQo5JWgOu(qbU&s02pywzU!nK@WTrPx=v_+p6MCD{M+?14>0^Xm zuk^7(uT{Fg(5sX_PUyu-4-k5;((yw3ls;bQDN3Ipbf(fL3hh$*B%uc>Jy7U=N}nur zjMAqFz4u2m{SKjbDLqK&ZAuRodXv(p3cX(GAwsWJdZ^H=ls--9#Y&$p^jxJAg!U;t zOz0^}pCNRn(q{_oQu-{R2Pr*V=zdC%5IRQbvxVOKZ!`T)p?4|m5_+4`i9&BuI!WmD zN+%1wR_PR>S1Fw;^kSt)3O!fpG@*S;rwctr>2rk6RQg<@T}q!P^dO~23EfZW^M#I4 z`U0W%{$Qp*L+D*fX9~Sd>Cr-OQhJQg>y^$DdacsgLa$PKtk8><9w+o%rQJgNl+F=) ziqaPfovHMApFfQhKG({gl2*=oqDM7JBbDX8P-d-lg;^p|>f0i_n{t{+-b4mA+Nz zwMyS6^eUxq7kaVMcL+UK>D5B}l)h8wDN6rd=uD;mAhb*AyM!L3^ctc2Dg8&GW0byI z=)GT?>8}@hm(pv6-lp`Qgx;j|JwmTn`p-hIRr+3`S1J7$p%*KCpU`uaZV=k1^!-9l zQThR)GnIZ&XqVCt2|Y;Zbwc-3`eC7Clzv3$yZxni$((8rZru3siZ&La(q1P+@ zxX^2r-XQcUrJoRbvC>ZpJy+=_p?ymKRp=>7KP7ag(ti`$rS#K64^n!g(EXHtM(7x& zpA~xVzs&SE3%yI}O+s%|`Z=LDDZN?f^-4c4^jf922)#<_7ldA{^ov5zRk}rJpVBW0 zJw@r4h0av^6`@^9zbf<~rMC**PwCf$j#2t`q4(}H)88ueE~U2#y-n#igx;j|n?kQw z`YoZ?D*d+5tCW66=*3F^UFf+=Zx`C9^t(b&QTjchGnM{_&@QFl7kZG=JB041^an!6 zD7{nYy?f2{w+X#V>0Lr^Q~E=pH!1y*(Cd}{Sm?D%e{+H0} zmHtZTwMu_2^eUyl5qh!G-wHif>HR|cl>ScWDN27Ybf(fj2<=k(-$D;k`bVMrDgBes zF-jj0dheHJ`tivJxc*lf7kuIRUuj(Eh3kK%apT#(_Vr5R10nm`*D8%Sl=ii+Qu+v? 
z7b|_F&~ufJ722mX-hkfMK1J!CLT4&{l+Z4v@kZ*t_CZSHgIoLB`zhT==oqE@3cdHA zX8Q5LX-L1)_#il>U+JTT-lR0%V1x84eXP)HmF_R}Dy8uODp>zedVtV#m5vwMr}Xhc zPf_{=p)-{}QD~RaxRDFeuQWcu1nE~AA4J;M9-}mFXxP`jcaNEVhtRu}9whWOr3VYW zN$FFCUT4BQ=uj$^J?yl({wfy*6(|a`iiKcgG`fW{b)$|rkZ`AY#P3z^Qm**PI{x(go z)buh@wE4>|uIV2y)XHDedo=xtrgv!iZB1|0^cGET)bs{TuhaCs znqH&n+cdpW)5|oyNYhoCF4MGM)6+FQP16%KovrCnnoiO5a7_=@^gvDb*K}`9ch~ff zIa>K^dXJ_*(ew^Yzpd%5n%<)6jhfz|>2;dESJP`WeVe9NYI>Qb7iqdm(`B0WYkIn- zr)hekrn5CYO4BKt9{+jNs>F%2T(XExgruS(26HV{X^xK-=s_8A7-l*vf znqH^rdo{gA)3<4QrKXo@dXc89G+m}?zow^adYYyuYC2ofqcokO>EW6ls_B86?yu?I zn(nUYAIE9sujxIS{zTI|H2t=ww`zKerZ;MOgQnMM`d&@1(e!PaUa9G2nqH*oDovMZ z+OO&9nx3ZViJH#V^e9cIXnMG&hiZDDru%EUx2C&m`p28+aH zqUnvA-k|Aqn!Z=lYczeErdMiunWh(Mx=Pb!n)YjYx~8XTdZMPYH9bnxDViRx>7kk) zsOkQi?yc$Wn*K3cD}PPz(ex*p-l6HYHN92STQt2<(;GCsPSf{ldX1)U)AUMBFVplQ zO;>5UOw)c%PuKJ`O;6Nxwx&mEIz`jNH9b_*12x@W)4es_UDH2iY2~l!J(~VR(>pZ% zwx+jgdW)tvYI=jF*J=7*O|Q}PZJJ)G>1CQ;r0FV6mucE>>Iuug3~z@UmlKxn4!6Rm z8+0u!EugDsb;a(Yx2?t?;3wfM7{Y%Lmg69Lr^F4Up@-N-XcZ9QH zFNic<5gOuf5Btfz>_Dh{cm#TY5KiME{5=vvA2|N`ixZZA8}4T9U=OT`p@UCgtzAWt zIoJxvrGw3|8^a<750QgE+c>yUIKUwG!ChEP)s<)o`a{xJkbZ;orJym!Q!&7m3x}kI z_rv{GJR^gSRKtx?)J~yyhZB_LYh-yG9hpRyM}USiFeiV2T@~myKfIQV+hLs@-QoRM zjNrRcqw#CXI7G%z!$BCZjvZ z_-yW^CpnqQ;n#2r8iN}Juuc^et4l+^fOuHcrCz`>3f<7#k}u+)>IxApaTl;rTC&i#6Qot&0Y`pMtGt_&UYCe7Ad0tNXxnZda>&SgU(^w;|!> zSHX!;iDQf0XG%9v6Ng|uSn_=QX7`FkHQ`xkvV3RwCRoC+S=3~NdI)?KxI{c3Fn&G@ zJJ~UEvxWXL{0QlbK_fo^H{!7>oKD3M7v2K|^C5*S%HSR;7Ka}`kq;!kh0fdqdq}Xf z8nLog)I{HqcGtZfQ-FldYtq5;}Pmot%T(jO!OmW$Kg3sFaL`aHAK=s4p3fBclUQN$W80 zj-wv>pIB>GoHxY1G7Ed*7iBLTCwk#G!f*_1UlulQ__k@*2LxSN4 z@O;P)+yoF_L!tpi9f^7rOGvCnF^@zIiV`6f4yg`%;TuNvVj}tBiS(@?JVqQl5I$FY zej82_pI?U0L<>l^BWL;h;geBdeu{&;!+*qHDAIXgv10eJP^e!{&XyAk&r_i?XWR4+=xz5FNGd%PmM z4shS4mDpRYTvQV|)LsAx%kl__57g&p3ZOR)-~XvB@U@>q>`I0BHhk+RARd8+BsMrN z-yAL!EpKP|DzL()@YrWCvi=SGuD`;#(|(cgygU35iUlAbZw`e2IG%-*N#UepINd0m 
zvoM@-=zGP+49uQ6IONGBh?KGDn>ky&Fp{(9K?rcF$k`x{5j+1ku1IePQti9ij7ZZao?p^$h>6P+}%i0c<$3f@Mf}m<4wy-HSX&~nfUZ@q(6-(~m07Yy!P zQIntxfxhf)@f{;P6u#BSS%DF%!eFq4x>xo;3I6`|3rodx!RY(s0`&2Si`gzs?a2of%b-!yuBM$5RYH?<>E+TyJyuby1<89?}K@^1@p>9-c0amzzb6c5rv;Usi2wKL9B@(AvHi;&I@O_RmCgtVg?#cVUXZ48IDU3ugUW z(7)J?j7?-)Pqqzc3&o7vg2Hp=VqQNzM2tH#P(TZIhX0It2)GVy=0oN{Anuh9$x&=1 zjnmhNt{vv)*qAENJDeK~uO1^?dxnS;R41(GP;cw-(`>OqU==?BVuzVpPgafGN(i+2 zstw~K!J&)pb?BUPp5%-}khWx)41 zNM`s3G4LMU*`fW7u*a-50-#@$nTkX#94cIF!*|;B=f%9we4XzkxLJf1T zT-FFYXZeet-|Akm42OR)Hea*@s{XZb%~+31CcVWp0-MD7!0}2FBIt(m#F092#t=9% zVfnwny-2`H=?e$7e00iufB!z@rb5MVV182bSLd^!t@=O3c~+1QE)4phRFss-s5JZCI0I7dtyL(rDtyJhLu!T<}$ zMCsoMC*t(A+xfEeUxl#Xm?(WG`M{t)7BjS+;Ty4bSd@MdmZVwwFGQDq{y(fsKLfaR z>7NEi4zKk8{T?Kom;PnEbY6%m{d(-*#LU#P^w)!{Lzn(|to|yY?Y#6e(7)TZ^rr~_ zR;52u9Fe8}1JvHA((j4&+bsPZqHfF5kEt+A|3y4Umj0rqzE4&)!wgi+77v-tb z7mz}=>2sP~uz3PBR->PWO8pn1cRwR$c6YujjaLbHAe@eI0OQ5zWF@Y}%y!8EJwO&0 zV;x;FChT5**N}SXp(ZSUs2hI5L!l21?GAFcdr5Pgd%5iS!qp;C3x{NcPbNpFp=%tF z@g$fTCHz0$-UU9&;(8xXB#}jlZ&1*9L81mtR5VmkBBEI#@J2T^0ztf>pny<8C~h>0 zU~qReuj|@qu~M(`(u&pgi^?S;mvBo!MGi@ng@T-UI6O zqIYC;*!NB((3#ctQ~654J&(-FnN=x4M^AJ`mGwnc?i4li-Q-EmS}D&Hn=2a!ntfNA zc`J*pw}b!J9Gc`yEuRT{Kpg$_7ZC^bg$vKLdvp^94|8YlpZaDe14mH7dyXd^9p!Y{_eIfR?1Fje+Fej6ot?~U@iC=qL)DX z5?Hzd(jiU0d=G#zLbg&-?1y&Cbh(8n$HiaS1k+vAB|w8w?V%QCtuTXd&g z0hoGu%xcM1n%<@T^+{7_mMFIBNv(&4YP;>XQPG4lG9tX;U{|APAT~KCKM)IIyEK+j ze100VL8(tL#fm!s-F_M_KbW!fF9t9xdOj*N&1b=;1{1yD?3W_D z8;sE~Pq81{3iM+A29g|B13CP6LAb|jibqR(DjvPZfo<(tUU&#PiD_yCV4Jal|M*pU zid`#0hx0Ft2*t;>YMTQjGO>ALQyyeuee0f1pgQ)t>@_-+ZC`wEr8`nRE?yeRR@C{;)4Y)z?m2PRQ!JqYKa)K=yyN*zLyT19j1Nim_+sfdG8 zeUm7a(PjUX8ZE+<{>d>vN2xS)8JL=Etts`qXpgu|%?D{blzN4^ic*a#IM=@LKA}_!N`g|ok|@>GdH<9;ivuc}zToJe zqtxHu5=#AENZPL}%9B!OgU}sP>N!+tEmOle@F7!CR6GDmP38pDno@U)GL2K}LD0iP zDNA&VjJ`!GIM?nG6-s@8I4E^&5~bE3ynjk{kwKMyZ`RLIYW@bH)FdHkzt4_RlsW{2 z?vPS5QKdDd@~Ch?sj;H+sJtG2me_Np5Mu~NV3Ba|{G33+4yE5SWxhLu7$E!P_OlJ0 
z-C43R&G5Jj_dQ{rmNs%->2g=z1y}6dlk((UVY=f_kjvht`O)EC)!{{Ba=-SCA@@rXiKB6FNQ?OYaNlS5ADbcyz{e}hh-FhpQE7F8)UGcLfoEPxuB zfWC99K#v@Uth5}8rJV;bs4V9fyX;3%-(UNCR*Cid`b4xG%UCRxGVZWDq2O*HcQQ~_ z;L}9X?Jv*q!nm9`#QNodNPk@DJZYUHShL-dIP&R)I4ojDT z?Q!fn;4CGa!vF_XGX^He!;wDPf3<(K{~D(pAmjJ_L|!^UDnwp3zpv%x0a$F5mrp(N z@;F5;I&isgRLe^-^6eY|q@W0&cm3vOpmoA_qDA5X<+OTDar^duF4s#>jdT4=%qYpE z7W#bhCApok8jxIe!F7c2o?VUZoe5A0&Oary63Mg}`vWL5W@QqY9sn?f56BZEd3K%M z0eMCO)Py*FNQ5<~03^ioG1#Oaz_KG4{dzQ7`3)>ur#C7jmG7FX%1>71V~O(TvHZWG zsguf&qj?8`2*|S^l9zOAm(wWM2AA)()-lVU$lt z5aQxo%qsCVMdtwa3Lom(_k-)pi5g7ztEAyouz_n1WhV|I9Os96Z|w=nG60%qNaS z3=8>Rwr#It8we%l?Vp*>S$b65U9J2Ouwsl1{nhr3V2CzXoXxwF$$?c1v-~3{A}b1k zOWAOU$+*+wbJAD@$$5y%bqM0;M)ea^KQr-TS5RuBJqi$B(Fi4OTIszJHe=l%`T_H@ z8TxFG89Of1E*0tSBr78`oh2E3m;Zs#y}Q98#X@O4{;i_C#`^?Q#*?%FZJNs`0 z(b=EWPiIs{EE7L=l%kr#GuLRJ24thNweqvto<`}5b^j|ntvVSU`0-)Hz28GD0y?mN zX6n1>9(qzO^*Z5d3c#_#maBvo;2d%vL@tW2M#Hxvh#!`BURduM0u9cY;HL~wtHJ4m zFy0RYo%+(UkENf=pMpKX9+A>ry)<*-cxyGAE5E+b zT)C%{nObkY^HZ=p0HkXGzLrL%C*AsNzH;FEsGCxLTA3Ldco9m1C|?G( zCG4zmph|56o%&GAHGZqcbsVMcP zc*7VI6{U8x67vy;1BXb$qf>@RX25FnU%lEkSF?GgV^#Y_#-jBKhpZ$VG6jeJXr({f z?q6yL?h^3X`vLE)qI4#~@1TG7*LYHRKaidzq_+CZIi(^MlY{V;_!BiBx=;sY<8!_O z%tRfb_YNd^$~)&SOp0^+q|CH_G-vPJ2vPA+AS~{qed~&?FKCOefM$-S<$P;Qg(k!Q zft*}3IwGeK?J9a3-M|AbD<-m6nbde3V33c&*my!xf1}%dIaQJp7MBV$rcHYshSV?} zyI!RU^;xhgIMcM|s4O&v)yEv|V>BYid^5wjh{X?I2CiEXoXcG}IzuPvVp zO=b=4UO)86@hl81fZ4nH%+=}QgG5(jY9p<wYu-`k-k^hfJoP#j z1!PwaoY^RmBrpCYH{IYR5C}R62zfa0YgHxD?-AwGCIc?$BVrS$Pj6{GQSPFtQ&}#l zeZC`_-2Q7*;Zy^LvM7)qe%q9b(Ehc;p}0i8j%ixS3!%m33WbMEo;o%mS5LEC+#V*M zUb`ttzEGohf`%^gH4Qbq^0llO2Z!}T4+qT76RfEs$DM>PN2EA|q_F0H*3|OrCY!Gg znmT3L-WA(5Z#6B21U`j?MY{NdOqeS%Y` zqr8&fIDP)YI-)Q9E2iRTJuyH=jKAq-l)p0pe^wHFfc*8;n?$jA<>U$!5_rfjx~Y6h zYd#hr-*0X9ue5tbwis2n!zc>vyWY5)pD~@iZwDx5+`R!|zqQpbt3z}7C0E?t|AVzt z&JI{#&E996l9F<3zO$6ZRGeR-y@2orOwHMQ6@)p2wwGm%yB|Q(+&Dq~Si7zMJx;DTaxE(ATM^H5nUt$1QJZ;qFFP zzp-eY*$F2^*nWN++Pujp>pdV_Jn8*jk*3S6&4Awf#=Yb6_E_7jcL!VFq}C0N6{Z@C 
z(heTN{{~m3b@RtY68J#~&Tf$oEuY;Yh~F~|od~VYuR7Xa%R$|DQ{V5fRDNtN_a|EH zIhcyg*wwswRSHjntFg+6`pklNj4;1Oz@sZ9n8*pBNYwPz7iLN-JXs-+g%tl3vljZ6 ze;1ksoSMUTvMQ*MzXSn<`n~oO7FiZn6veR-IXJKAp2JqyB3sHvsbZ@u(oEzDDNO7n zfg;@b(~U4gwT@#8&r#MOG|qwmwaWRX76@yL?eh z62^p7Hw#KEa_r2f?E5h4GNOB{m2wIPQ3Ipvs@>Jf`&P?k%BxU5tBXK(|P9 ze&Eqo4wMXJwQ0&3E>Hj;R#Bl^+c^=WVLd$6%=8=GKplr3f~rRp=}`Saj)mqB=;ZL& z*q%?4#0IWi4Wh4k&0WT=apqxtNRtCcS56pVkHsPDZGRaTLwz@Cs{>}-X0WdvGfoaW zmDFIC$wnbCIhcb@9hguQAlK8 zVW|y6Z31kxx--3kq*inT5S(o>>6&X zom(2*@LFe%Cp-yZ4|OqrlJ5S?HGRdhqoq82e>5J)1Leu+dxFLor|Ps&2%Dfaw(nBx zEz^hclJew{xEf5)7iki;nqEJI&KJ_vw&3b1eX@X^*jd-@AJo(@y z|AVi#PyQtf;N(a0i`>iM_IDFtLzD6W>w7X|m;L2(n$ElJ7?w`$aNrieP&88AOmsai zR@@FzVs-Qq!a#pTgk+~jr~a7c9orma(SuEkN6DLm>CK@+AEv4=F?=wH>J$=3=X#lDHz-0V?^TPq)Kk(20d;AML_@DLj`2S_!0pK6QH{kx)^uO)p z1pPx8pWYQ|wYwVq7iRM@IED}hpL*&qdd4Nq5qt#_`$+eq6qX*pQXUQ1ky>d+HW}gT z#di8}1$Qt_>s`OKf%~I@*aCG0VjQ&#^fxAO_|o{5Phf1|<6rUAdsL<}D6IZGT5+up zP7Mg7Rgg%+@;N)?N1hOnPH zA6s@6$hY5;8I|&TpYn%;rGLKy+CG+X zOkJmxC47k#lDiyXSVl)-i`TyRC74Q)|4XRVCt)49D)v!N_$h?dc|_m!+}(=xn2Hq# zpfv-U{W;cD=xpEhJmdMpUdb4PqjX2siShLW0sBXw(n*=_-~lW(3TE%*Ers^R&=3@s zF%*bBD^eue!yHR;sBkVU9y5l);@a7_?7%RI4g4MPv+}(|$f|&D-th~@%)5bBW9H>S z9gUeWd)khf1%GwN%pXyQXKtHF30^xiRZC0X)67&FHR-2IK29c{-9Y%NIA|9@j9 z2HU0YdO2*vfw>+sv3`SUA)oCJn;W5>x#{?OZKNg-oAWSiC`6ol-ErX=FVA!^UcNxO zJ^RIk+;6~GcE`Z0o^S)g9)niT$B8{k5o8sEGn8&o~Zy~cPiL9 zlqI2usDoH;3=p^hn-*ZZnhvnloc_tb(x~q$Q6`mhfEd0N3>B&=;r+k3;{6)?x$^-z_nn#-ZU^|zh@WO_JJ z;-gB{jDXny*AIvXX9YIHKj75JU(N<>jlU_ZhhOS%`vokMDB(EL+(8MiK_RHW$qOmr ziRq540N=u*A^?=da6cVt8H>q4Y-x_o_!A(uh!{$|EG5o|_J(l0{~27qEGRn=Eo1X` zq{XFyn-*#oi%WzXb%o-dVX9N?9RFCMj=!<=$wFqe7KC8*P=e8jOc)c24EE;xSXA|! zQ-kVizXukeix!1SUob{@_n;#GfIq1O|JT0(S1gFupdZOSyAD(Kn{(Koc3vH(to zw*7ms5)khAlm;9IFlMEGZ^Mr`!Ki6qREOjCJuF?@jN5Ca3$Sc}{~(2brNSQ;hyCdT z@HVbM+&)U$#=zrVl4opQ3%#M!>QD}2EN@j3o;C$ z$-mr|mSSgtoXN7f2t1eJmxVI7!8<;|)lok`{OpnBt+FaX2$9M<{>IY(Ary@dzx#K! 
zBcxqg3F)hge_lv?qem3d5)MCX^>8Li7@tARqMWQ;;}a)e?tZ+mtr5tHof(Cl7q>rO z6rCY<=`r5)r8>PL(|O`YPx+8qWJ zczCUIt0z3u6P~TZtb4B~{eDmSgP!z1c*0L2EcWi|#CEvEF`%2F7Q~BlL{76OTiF9Y zmtSn{F|D^u>qCFbn9Kxhu~(FRTx@;ABW~-hfVFC{wZ6Txe1FR@3|8z`B-lo`757Yj z!81HVQT|Rm&(6liscbpppp{JrEX+x(2Fp>mkvAg8{>I1*&Gug4JHD5G1~oKk=e|qC3{=@?;{|$8Rf#7o&`g0)V zAB2`akn-0ZVEI*0!3P5W!2>Km{m%ykAB&dgwJ;>6jxh_Gf(5R=H$yWsQ-a4~u4pOC z!acIggijs)_vj06hSp}IPWgMqQM!*r{t?V9#!B?Ws$GxMZ$^4yEd(H^}WU?bf+&R2&FsKCpcZFaVkl|$%x~m z;3wnNHwmxz+QBQ{zOQM1D}3H`@$p{{OXjur(^9y3T5SCY?%t9CG9k|IGyRpnoylg_Y>k|)^vY5p5SZx1L+R> z24hW5oZfNr$vA68-=4|zJq%F~eZT97zDjzdmx{z4CfhKs%pGAy&ol@0`2dT{AkK%T zK;Sa;2vhn9491;C5u%&~Zq1?7Qp@qIs^|m^d!MS)(ukMl{t;Z>^kuw1ME zr>H2sIs1I;Yr#h*_sgIb1 zv#EEG?!}6JP|=~8y5iAzj|^z2bFD#{Qq_piCl!~@F<-Rk=iOMs=Uj_)sXxe3S@WW+Mgh{a8 z^|V)*@FnSGp3q>1>hM5fgPGS5e6l$-6)!=|jSj_25I2Y%Wh?IRC5X4sZzEC#VD3W@ zlNIHJdjrykNPZ>q)l(dY{Q-_4#iXdKpW;3&$?Y29Oq7zgjYv9z?YIAHfxjD6bld+3 zgUFSh0qOOP#P56LtM>1I2<)^{|Br)rG!u3KvVVeJ@xbB_49T8tx{!1zSNgT43u22MY^h= z>#2{KP0MwzuK$v%pDXpBO|<{Vc>Vp4i`A;W8W)Qa^)wsMa zQU7wczN-)LU|%bJU_`Ev^3(}NxJ1|?s>C zVq%(g&lO=!myg9RbyJm^r+Bc8!-ibCs|<#p-hzj*!v#pPN11Qph7lsU%qx_gflut| zFqTvzd{Oe9f#9)y2566SxKp0Q0pYKZk7>SXBd0LaTHDlx0c^}PL4mdH#LW+&Dmr;1 zR1|;Dn#<}t?5hxOrT?+}xpqrhikcPq4iF}*J5Aio9mVwp{F)G;8(cO)M$>=06_Y~@ zsv*R(ovw1`Vdd-UKS3^o6IVavBT zT7=|sNzDorG~OxO*8PeQ^Xf3}u zdaDmDaW8}|a?b*DJ|1-QgG6kxcykK6IC~D1k|HxEWnoiCpa32J(Kg z)1}$bpv0Eo(TWb+%;>R#mu=q<8br2~H^D%|gfug${~`lmeynVw*-#|?$FKBXiWhtc z33M+lKS4;*(V)SW;4#Ynw-W7CdfI_OG&9B1@$RY}2yPkN3X7!I_*K1z;cgauXoRnZ z7`Sbjv)4-RPb1I)aJw}kd{Rdw1UBX2#nDCVKjf5)^mLRej?NNEFq(QL4_(u*YCTHe<@+$SH|@M+#^O*d56G z&h8{=DGBHy*l6y5bzi*KMd)=B_h$*&W}??c=(XZLes!<$-e+9;p9K`GUWM~X;G-V! 
z=xh?xdnxdE<+~^!V0Bj&)3JyG(;@B;ahj3tDu$m2fPsR^xON$N7yb{TKT5#yBZ{L7 zawt4hYLnm!~F-sn1s@vXuQWz@7%NTa{68&6wffmmeY&D`su*xuK zCxwuU4d!O;FPpq^f_=E5V*=3|MsZrebkS&ppM%URy!9TO5&kFTBI?op|3L4nMepDG zJ#o3!{WcjZm(===MGoxeoizf366TQk|n@o{|*yCU9BwWpm{ zdcLL^$K$Q0Ndl4d&4^K+T2 z+i9)mPgm{iPtSYilV1bUa*91IBtIg!MFDFsTxUd{p`j4MFii>Oabvc>?5~Y}SMT?1 z1M!mgd+w*jfeP5Cl?u61;dHmclWi)H{v5R&qFD1n72P0a2x7&dHPggI`s+RM(a5z% zNp$=u?$B@x2v_TNVCdQRq6jP+6M2ORf&Q}ybRim0d&>Kj=jXoLuRQ0Xg^uKTDTwyV z?3d5({yBNBV+lQOTFG;nG_!wsmS@~O^88zX6T4+U@?3&49m;b)GdqyyHK55am*?#f zX{G~tUdP-Hmx zOPbL#wVAm$m5=VZ2O8YWH0IP{=h6sYj8f1>;fwGaFX3~}LR8H=KZC+~m~~wgW^W)0 zxk%9{^hEIIQP_=+5`|s(^`Ov*sO;CcXv;SE8IXbGJ$UsfzAls{*FfV?xTkD{n;<30 zbt0g7yik^Ni#Mw3%xhEUWiNVJ&NJSq>|IYzZsRguK2Bv=XPy+AQ2WD#S=9!(R!k9f~#ELa^b9x#Cq<0QwC^oXmqHYgk3HtVcz{` z5B-D5=S_dtG{hED{2a_}Zm6$m5#`;D7j8WVdd-%zQV{hn&b9BRa&F59Azg?~q0L;arfd_U1itx%RQDbS*eg(R1LY_0oaU&;ht@{p;^=#M;J* z2c38h>~jWFr&P;%uQQWjv4KCIF@i=ofW(Vw1Ee=6be=Y6He$W2p@uQF8^7s~l1hgIIf=3=7 zmJ%Y5&mc&7oP*!EJpK-mBze3AwEC~f`TgHcc!9|k7RMbLFKR$u ztjD99_|q>3CGAVWcXr|^bmD&_k6UN{5_vpjmcrvn{8AoIfm}9u)+K4=gbT}UpBdE%xZE7`sll5ht=u2t@Aomiq0rllg7`VV>FEyR`%g1nC zwC^Ul^3H8-xk5=r?y6n;Vc|VC8n~^WU7vC`uT$b~6*f_LDHhh-Md`4Ta&AM7?6K3Q zk7ZRn%_F|Oy}5I5+yaa46L@$Gpo{$7oJZ6Hm5*W25Q{P5BX9BUslHc!7t3J37YTIH z4xk9`l?P%2^F+g;q!F8b zRd5F&6zo<9185!wXs_E@3wOlh2L{=a-dU%^UQN2;x$8ER*MpJYuF#W1h6HK zwPXWc(c^u7UZ;8k71||uoyl?RHfyQM8s9&y`X7w% zkKR$py#p=g8Q)(a`4^7w#WGs{C*yl5=B;+cE!^SwKAQc3O4#4{E`s|6#`mv5PmJ$# zVY_jB@47?D!(y?W+REI8FcsSy->2yG)-q=>y~FX{cjfy9l4_eFK38o&< z_->+&)t*BAuZ-`RpolxH_CLPg4Yp6~*K~SoT0g_|R^$6l(jUWaH-0(3?}WbnCF6Vl z_GI4M@o{}Tf^6Uqv(&gAf;k8C_6{uO!*ya$ev2T-^%wY!+mqW6NgCHDuh8SVw_av& z7u~zQ>Bq_J7p`w60XmUNRY-|F2@GyWGFC5FN$&Rer|A{S#W6`W=G-Oi6!)N-x%DhUB)Dhnr;{9e6r0fjsnpF;02dK)nfA z4`HV4SRO84`rns_>!Fv>GUVY?ESl}^K;S=;hkd{OCGt=-UG;Mrekl({%^azoyh;@U90sJ% zeU%%1aycJp!y%AQa1dQs-+WLun{Vf2@sd3_y#@55Hp*rfU3a<cD*W5^QpAo2>}VXHDFwfs?)1?u%!&CW$? 
zktGxq-F6R_EOfGPrlBFFFVJ7-I>e|4vr#z3If!}H(pAm1Y<2?wxqRCQ(B<%R1K*6u zkD|AjPiL)N5@SJf@In^st6cC7MceYCpYqzuZdxISFs5~WPRk~?h7+pubDXo$I()XC zBOI)qm<4?65VeaT0(fTNQqnn=@!AYLGjN|G{j~_g{pTX&*q@_s6o$Cx4+FTZw8Ivs zGB9X1WLbY93zLifg5%6Dy&P!usc0Zekd&^%qy=n$k*-cgcq=Fu+SA#H@b#}iv~yrU z`gw~fU0)!h1QJ8PaNZjr5!0vQQz!vQ8P10rWQj+@i`W%MQ(8)MxsAMqN$>;DUUH1; zr38D+9 z9wM@lyV0iN*67JCEx4(L0!>|He+dy_4o|W8BD#D$CgQ@;`u+pn=5bC0>)ib>1zqtv zS3#1Gn!5Sm;2+VUB}Yj&`}I~F+=m}~CHmO1QMVqohan2KJ5nMefS)DDDT++IP1wGT zm3BhB+OYjN5-`^uvkyc=D9awn5JX@Wghb^H)+v*bl33oT`%WG4yNeOt0JKAUaQ0si z=l8`6$?x}&k<4$f6l?lyFuXx{9%nedbVo+NE)*c6pH|&^paT7Q>_##g8Uv5G1jjdr zegj5dBMH%Aq6J(=^RvrT1p^j=(UTOT!Cndh7>zF!A5}Ft$3P(}g}(vE@)?T7xBmv* z!YQuNbTH<3V-eVk&4{>yD#tky*{j#XIWgA?s2AQ7!kVi{MEuXv1xO7TmC zF)}$=Z3kUy*`!MU-kyON*!sL;YX;f+tnUAg_<#8wN(ONy;`G6c3hNBt%& zpQ2M9LP~spl%rxm`qQt#;?Q$cm-zm8@;n6}bqp5eQLBB_kJJ+s{2x1C#Zkj)s4(R! zE&oK4_t%=0iz!Y7aLeReG?k<;;8BsvNd@~XN6WRQt1$wh2r~WhSd*M@Zw_Y3xDH~u z5987-*y?)tpTq&^wT5FPa!7XjT|ZJ5@o8OvhoA@_9{JWb-XICxh>7+!@(c$GEy0(h zhrXug*$>up9eOdr@O?ksf23=dV(0|Oq+}5>>icUXNgU>fBWV9=QrxN_L++*;2Sa@J z9SBhgr-7ktPnZgRK#e8Y3uskiPeSX4ccGt2&WvzmJQKb~MtH4=S$h3bFdRcO^I<&} zr37|-Eo(EFXh@|%!VLi{65fK@dTf9v+$ybMhNn;V-_5cyu`@x7HIdH zk9aSghGI?6c@__=wEYk1Hu=;B-<$AXUX158M;2Qj$Sc%5hRcGh5H8p>nt2-o1)mvV z3uG*|))ue)F%4{r{LH4`lA4(C`~8A=YqQHp1VM zIk9IC+Yu4Eq|m$VHC(~E{UCEha5&l%?RPX@ll5k@eLoKrEI(OIN#f-XrGuF$p*{!WB9_*l%AS80NT~5A?Y#;V z_}4jU@aG0_)exS{0m}~LHR1vMG_znGoPr>gkTaV15IbzYe`^{=>)`4K*Z9@mm)Lur#QaRi&2_|A%SrB4z0?x%spoG;$}RdgHh95{t4VJGE3k~3B;yl zI^RMd%;l;^OK0G38bj;xt=KetnXzdZX7meh!R-7YRG1#mGXLQ~eviI%+24%bh9?7} zYPG|=3~=!e-3f^llPPqkjfwEqW+O6yY_m3)pf+I|}qLcVWl{4`~h! 
z;<`+xvNDb#Gd8`^z7d8SziS2ZmHT*1LcX&SjG^+(;JFP$_+k_2QIE;?8n6LwtQc!t zfu-pcn0#wiTr=s9fZ+>yfiHs#m1js;?o)6C z!#Ofoqs;L?dZ%FASrfdyQ?L*L(wMn!G+Sco-=$-KBD%gV{slq+Ud)NMkEc%&T@!^j7n=Q z_R8!-F_EjO{i*K|LP{Pw5+4sD59-}j&JC@WO=i^Z3sq1U2GiyP`=b~o)?QK{Z0h;b z^W2jehh;dp-B8#JWbkn69{$L3oLt69RyO&ar5~us<~y1v2oH?#kH9~```*PkE$fQ4 z*G)bs;T5+NXnAp=3@PE2 zv^)tW>^G^8yYa2+AEoQVyAJicBKHGwdu4K#6Vd1b|}@)bgmF zShEN2m7$X>Lx;W^96I38RwK-fPIsOvk~uxH7D*7?Ky17$Z7>0AiRA5DerH;Hc|DGD zl-h>`VO_@OC@Zs~*?#WYI3Dazwx*}_Cp*!fY~3F|wNHCCTj(uR*&s2f+L=*#y|S&|d6p%z(6p_F`w_g>-_f zs51u)y!Ovr>i&~M7gKtd=Fm_oXk@`(gllpO-%4#LszDM_sYda(*P7Xel~V*19uTL2kT}%f*bd@Z)X_Tnez5 zqw0}lzY7k+ewDt^Qbb?M*TW;VCJ^oS@SmV&3U(Xex7eJgD>G}*mGxMM>#odA?g~gm zF}yr2xi_09qP-(@Z?FLpso)Ut^y)b@6D^r1gMvPgpnpvXj?Sr|aNE`hZwE=0b?S_P zhyibGgA~?9OG(f9M8Ybo1u5o(6!RMjxx1GVMM^`VWRkVqH>xk1mlCR&2c7Y6a<5`8 z=GxL4Bf<-_p^7TRrd1n}N#q|Mlo(u9UW-qCf*>WeXf8&q@v>k=B|c5F7TqYB?{C4A zK9w`)fgGzxU1>z9M+4Dv$j(R@pSZv`z0%$SVQwgrQ*z#zG4bK(;sLinRepoQW&=OC zK#tDmgxoM>4+H_wpf$;4YZ|mo(K^r6TtJ4z#&5pL^%QjXnJC?np~O5|HXr@O8){`W z4jlxw{TeN*!6JVd+|o$}BRm3GYEp<4lO4#{Y5{XI`K+{D5od2|*&Hc^3@S;bGDRJ9 zMLPJ=x5YUUiXnM^JR7`HemL>>d7#T6_cN=Y5AFPaL%o&M3frSV9i*Ed7|`!%;E20& zfUDFY7CA$dKIAT{KRU82Xj>e+Msy;)dk^MZpnP88{cVJKL_{%MiCLg;baX*-{%zCybK4!=g&(JQPl}UukzvwC~5o?vmK8a%Q7A7PqRopP`_&W-cO z(SY3xMW7iMNv({d@#AX7e&eK_7YXQ9^fy_KuZLti)6oMpx2<;Pw$)9?Kx^09M{Or& zGi&U_7{I4;&$&v?Xg^_QWBg>q+tfWt>TdZ`>dv({sX&drURU}jq(X|+BB$w#r26!K z=_Z+vvKpFL&C>Xd>dEvK#N8_MbbzYL2y?rU*isMJKQy6h=&&n~EUBhh%(@%&Ll=c+ z^tW8Js09Mez5adcqC@t`0TLfSl>?-kq07uDRM&v?kxz06L+x`h$%#--kZ23S`k?m! 
z{*xmlxD4&G=RyL+F7eI@*dmM{Aw7H@&johbyT-F1EIoAJ^Y#OugE$+*jD~4+h%H#1 z?P9f6rspAzz^d*EVuhX8WUMfMwujY0?O_%1V3n$|x*~y9eJiZw-E6^Xb7&8YhQD#5 z;c>_m!D`k<7!9WiRwJmoJu@w`;OG2X&a~XJ=0Z-?m6+Pp&0;sql9|?X7&2un1b7~F zJL#G0rDqcx04mOzz#*}J(cya&{|9}EfsENV@+^Y#-UQCs@QhM+LMMJg+#}}$3(V9w zXfq-eD68^$&^1!>1ISln{rKUKwlWln>z@wWr|qlTU(#Xw*9iO#;HcX^y~+~pfABc* z!&f|X`%er*`!{;qhr9%$6PhVAkvB2SqeC_aqPH}XTajTX2&Fh=V`$bobnP+_PQ=y> zeT}yVK9z^YceM=9#FUG9oy+e!dnd*dX^#zwo~4iB>)!4fNiUkPe%FhoV6cOQ5{^X^ z3#FalI^lU2N+T)3$qOY|5&%mHaUQ37kI%Qz_phK^aeB+(lXd(z_WXaKzxd{zOuz>d zXnZjeghr^vKlRiVHy9%@#(&XbqyXhO{r1H#W(x=~wq1St5X^&Tlz75_bXJ|+8Dini zhoqf%eF8!%O9biK6xk^}b{Blg4PI=KcahVr#FHofxBFSEbQudGNee&FIqj_gM{!?@4+NQ(ZfcU3zfqNqCQTvFi zIOXrMU-*I@!h0EzX{PVnO3#ob7+o@sLi~;(uPtsR2B|5n6nT59iF234@!xiI;Js zrF-zzht24VtfoK2yX9(!>OTE2chZ&=mtAk82#$F(Q}A?()We=(_*^6=Q@J-%N_eN- zADU%D+4R89GJ=~CJO;sHYKfO%y~zWHIAhI@4y~0L^*vOS8TFkWTtE!P0wR6$A0X1< z+X)iZ$03*+#KU0dd`zibe?wD3J)A9G2KuWKWDk-z;y2-Iu=sGK#+FK|yEH%aO32C( z5R}jMvPUvPOz%`X7cC2-*pwV&p9<@WO24oes*PI;)%H8wW(E;=K{SV=+rz`u010N7^LN%X4Cy}r&u`40s;DwLc{%qhCh`y-u z2J|UIMX%sMhS(tgjPP|}6>ex^C$0$RMe50YGY>obYfYYsh4vI0k@xUzP`SZ41)uA- z7mbvbO5Iv49omy=+}#hkW~d?4o})^PNiLCX-2FXH+)*Oio{kdCH5S=ySb@kl?q046 zihcE){k(o$A7W>vjqEZ&5}hV@;(>axmupVkH}qwrx} z!Hu|J2a^x%dV1(l1TeClA=REdOscH|HNcwos?{MYFrd$2N!9jbkZvPzF1DnGc zCA1`0MV^;PmWsS65g#LF?A}cL8j*RDwg>7rH4u9+Mbg4Rg{}vg(Cs5~rqMOOG!VNh z8<}S8LBQ5e%BG?0?l5`f7>I)N}9!CJQnMn|1G%##?=#?hM>-L)B#MQ=5GIB>&F_hhb>8q-2OpT zhWUpe3-eE)6!XQQoree%_O&gP8m4L}4D%0ASeSnaC^so6bCK8{e!s9ml@2i~jm;8JFU%XD6&w3dF8H_riMS#Pt-8|KZM zsq;3fymP#H-EQUUm$n zsJSEf_1NjRAj*BRXen0f`a?9WPMSPZzy{b$oZeGq7M$iBDYXZ47roA53m{-I@N{Py z$PdZGfoFv0!=#gGMXOLJhy-oFLmgZ@A3*Uwjaodl*R?2vH9gi+h&4Z@RAc}oo7^*hf@A3P2*xU0a z`~+h7fDQ=8PcP;3*V+a3i-{B8CF2=*9h9sDnql7b0P@h1jreC{x`Kx;m!dd9H}b~e z(?TeYtjRl#Je)e&V=#ZCj)M;H_Lh9L1uW?ortYz zd>It+xr~Fc75L>J_WW!U(>jqA!zYls18>6Fe#i1ZSyH35J>sP>rtaK>P~ z)qs{VkxETKgxO*iOR*r`>pw$)&)c(Cuut(Cu@4Cab;do@5a3s;Mj=N&k85o!&Rb{S z2J|>Z^1GP)5G_Pex)Q1@H1d{eGHWgkz+Mo~wpG_~o{?(*#jmPEVs^V6CAI9nSny!r 
z+q_~~NOD>(&laTRqX64Xk^%ckU{Z|l0LL;mlwlFoh4{w$jRncD)Vdu>b`0dva=$=f zi7m+r*rCpOkb3A9$o9&p$o7Ny`Ngum8S+;U6WMMC8TLSkzxBvAE^|?~ku~`zBM&sN z!xWQfdWO{5Rp*t@xD2`ie()N_!6!y{<{L+a0vvwCj~Ip?~~z5z`un8QlF)@?Te6|tFpfs zq2(Xh%hM>T5c&ob_<(2wICU9Zq9MaYp>Lpg0n z=oem~s<#pP^YvgVl#9h8^vO{T0ZwBjk@dlx;^?&O0NzlAxR=Tm-ay`4 zc3_}3%g4~Qu2dco$GTe95%!sGW~uYRHyr+jpxTRKi3hBD8u->%RGxK|Gt37hoTvo+ z+J}Dxt??L+2(|x_`iz{a2QQ@7^<2RA-#2re2Wwl_15S5=kN}TH-4^jt70#__EMEEX zO5{p(yG8RrI)~}_tsKnss|-YzY^e_7CUl{$r4A1ZVU(I)Ms+ zad$UZ|(mF%9o_4NnRX-lPP)Yc#xA#${$= zUv?|PP*|@NLt))EK!jS`(^j2RCB~|X!!z1_AK&Y4B**JC2Pj6{FqZK_5k6V-9)h05 z!&*euKBHcL#3i)@f4gf?fs2rl$#8}qD4a$uX`Auu!Dl_92ZYac8qSr;a4w0%!IRNW zi7ZnTy-cVrvBqHMR9^^9updoF@Vxe^(<9IYCGqn zjM7)|@+k5-Cm|kM*V4>b*QQ$R+IJaB$<3HNB(ZB>HCTkX1xa>S*p+m3UM?H#v4JB3 z$g@`Ut+O|sqjR_{f$X0bUDU_3AVkE4LlTyfN~#F>>cO2+yAQ_UToqf zI*y>7jh=8fG{bb$TJ7*QwhFIx&WeXuJM;x{%besX9|In4`g5M}{hsjq@w#i9&UFiO zb>cQeDRk3!A#7qVKMVQBp}IuVWbqHc@|1>Ep&7lJW~@5^#aW- z`zf&km&z<{kJunvg7$Gbu?{WS1|t{<=aMN{(Ncffd)F{=mwh4=F)Uy&Jw z%5};?{RnY~eGCw6dN^MFHC6{KY2a@f)&~XqhqdTz1?#Ors$4S=p#hu1#^jA^3{skA zlEBvKH|m_mRNy}pOD3M(TqwW0>~pzoyW)rF!oMPp z0CtSvfG2QdtrcHmDOv}t&)cPSKLBw+&18xh;WIVfajLy}778BSw%~9~rtDz6;4>^Z z5UB}Um39VYL-|+6-?-(IL?#Jk#nz#`=TJzyST+<@f&eKz3@WRVgP>U~F0RTYm96}e zc?fh1vWxnD5iP00Hk=VA`D&4raHQsa;Xrg5o6~-mjrOVw7{S}@mZZa%(~RA=5lGEP zDf?OE`uTQwgq84^Fg!oj$%)XZfGj6MhaO3!X4S#RqW%sn=E8;CX!H75)WO8wcPSOd z%qp`XG7V$|B8bw53$wHMSX|r#DY2QA?g7vtg-|<1peFcPWqQ0APsn6@aeSOV15`C! 
z87l(=1{MhdEkS#)x#47+onx!j*Cs_b}l~8L$@0 zmVc<8&XaEsuKLqNbUP4RNRkuBmW50xv6_KwihffDjb z=Qps-UvK1zAQ+L`eI&cch#9>c2W)VsxOd(Vp`SHrU|Nxl%tHSf zB6cFNQ6*M4MC7DIm85Nol5 zb!mMgZhzs@7v9=8qt!Or$F0-C+qrYNR~EpPdn7G=jik+zgU9l@P(rE+lu5A*FC-Eo zra+!JB>Dy9YJeEm5JaaTlBGj})Gch|mm&4SZA7Yf5>jUhQY*W0b@ta%wi_$2QB4pr zOcXL6ftzGnf(9^}XGS05@N9_Wq7sP?hCD=uG(`IA6f-&U@W zVjCi(+?W-~M-V`3jc#T0P`Xe>@%>3S`dZSmoUeR^QQqcEMzR%Q2$3Pvn(mXMlsr_3 zO0%ucKvWw%0@1^|K~fYP;;Gf91_AqcGN{_Xk>QmDhbamUM`2RGENoOnWyH3DBV#uS4pS5y4)G*79|;_)gEnwBGBF7b zQxqIlaN$H_l%}?4OWByrN|A%-ql4XR5fF=3Y5Z_so=U|iKJ8XqNWtdyoJ2+C{v0lp zij&CA>$vJ9_JZ)QnSd#DU zVtF%0JG4=q*I93yOhve{;c?)Ls>>EJ$F$^( z=s(&3#%cf2r3jj_p|4V*g8fIXA!K|1GVX8#r0 z>U8gFj3V(Mjr$o2Ewn`hG8^HkAPY8Xa9kW2hoGLZ^~MfbUN#NgaQAhjw%*n`M@p+~ z?&_Q@aoHt07D08cBI6SPT7Mtz>mTF&{T#s#`+J&n=w(Q+?(agD7IM&I_u1*}?;2#Q z{!Tsx{e2KcbbsMb+JGG}Y^u2XA2KDvl;hCC?yPHyS&?cXyG+}3mDQGm6`8BjsdNId zWgN{AjT)6m2|;Y0ijm)l)u|YVB4YFLWB+>jsx-Jex7Pycd%o+;A&^PC>*9#{|-?Ug&z`zX--|z4o3eFX|9x6v>iTaAvj=^*p&pIlBDJ#ktmH) zZCzU;9WE#px_s00QF4>5#BhK35aon0d&IXRe1xwWpWJrvIqy4)QTzB%av>TTA7Ymf zB9FWr5I!;}H9m7!w588=a*QJQNb_pJrRYPX)YRhQ+o! 
z5j{ACnirS+UB8q0yVw+pa4vX8B6La@1&GOu+WNc6tbOwE{|5-JWDx6J z5|R2}h(L=O;i+F4uV1O_U#HrylwOx5+P^s7zDqy0Px{F|c~O)>($ay*K1vuHGa6Bi&a zMU3Bb9)kOYsEoa*@b9n(?G|valt;eC$Z`{XPNN=YYu#$X;^ z|4jWMXo&SMNNPS`nwR>QeJrFCThF(datixzg!{|=bny~n+PRZxs3nQfFFZj_;3FM<*rxdT!q9VubEZY zO;oz40Jk`ly&A(_s=7ydLNC!r8Lys}IF~Fs9lNiwfyd(WLH4J|BVPpgMxg5Ol@Yln zq98u~6~=|y3T9hk);R@`={AXH12bF^ArIhfW}$UGHER%8$a$1kn>L!1YSXs zT?YhtVC78NsRQ35pwCnicRr_qHg--Phv_m-Q+AdLb9%A6sxYTLJ4J^O+BSZHH*Y# z{a}Z6@fTSiaDp1{W1i}{8V+pLjc7YDQ4$kvxH!W>MZK`o-w{_W=oOFs+DqWoFceAv zOF=9Fj9rem;Lf^i!QnkFU5k6v?_jAxRKEus;_D0M`$xd#=H{8;iS+}7#kswurHt-g z^?o@;K<0NdPz5P(A$%{wOE4qCd3GlcpgiXn%In*VrQZTzd~dGQv_56MjYzvo!@w~^ z`XV%eP3Bej_XhJDk+unGd|rsBpxI1(oBzRQXctPoY2?8{k6Gem2|VbXX-wY0q(b}T zV_RBQQ~(*fE2d(8iaAP5fgrTL~+nr+7P zsjp>9v1z>YHwsXB!)8p1KzS1})Xt_KVWkS^6J!NH+(2JT!FTEcHWZ&#J*l7L@-hhQRPr+E;kNQ}TS8uzAiM-yB9s?CXOko^S=?udr4Iox{XSt_ zUOpoX$_S)@GV|ac70gDL3P?Lcm3f0nHE2Rf4CLz>%93&eV_E~m`sF{V`gCGkgSx+} zV43^GX@q6#bbYL3`)$8fm8Bo#S{C9tCJSxUg>Z47Vp-3$=0rusc4WXo3m+H$9k%t9 zSe8>wXtBgryohDC2vieln@TLRU9rrfSfY$8mLqVusbvx$$A!l+>Zr#(1w^fYpiE*c zpdf@9U#MoRe?lT5=Zg0fRLW!}Y9VDDzrJkdLf#5pIlK!wkPt10V?+-7BaKZKf(+uH zCT5Z=8B$TX7$lf`SJ#t&iUR>XxMK~uL-6m!_heM4~{@~|{;gIjZBeKQ* z_^2e|D*{$4#1c)8k2v{I~}4_5mU9Hvm>codW*ahs6VG4y;_Ml#-~Q!9X3|&$kyI>2mhb;}e_(@rARc!j|ue zf#Phb%UP4GL7NHAlKNUHfwLy+kj$k@+q@+p6lYmbm^v|8)wntfdX8+RJ|IIu*vFKz zAZIzsYmnoz_+^A!vv?xX*!DD$^h%b1bXV)&8vNSJsa?Tq5;4K+^T2D;xs{%Q`WHT@ zfv;Zv2cZ+mZ>}HhojsEHJsJ&=-(^Q9@f(cx*JAq>oa^!WEhKox@5}kqZ|8$*q>7q& z2iaSYPS$+_=JRl#;ny+z&q8{VAZXR|hJfoa=P817b^3PJYJ`|f2`Xqd!j};;bO-aA z5l?+c;+TS<`H{8bJYk$XrN@99M14I!-NS?>$2CFnPRYfQKX(c!+nw z?XvD3`kQ>!@GU15%NeczsrW-XGd=W#KMTBh4!hwXtUwV?2*)8fT*hSrf!czmCnVMi zu78BHW7I+Tx^E@06n_c6XWKsfzg@py_)oSEzhiiBUA|v@|89suw7!Mzm(t&^>=zq)%aDBZ9mrf5- zdG^&;0t2`Q><5cHV&U%3DE+L6lO3=E6ovSuXUtq~qY6A*JK9);tI>mvMS+}S@Y18t z?6RkL<);vt9mQQ%+HayOqpoD@!2Yx}c|oz9G6mowQ!jMmERtA%;#(1c*bSMuc#W4~ zliG%W2CP%~3}^UpNE*HlAcFmGI}05!(Rca};8Rz(UV{Hq%ljFN%2KNE`h+#e=dVxW zor)yEz8py4tETdVW1@W%4l5g)Uq7gEG46Tj 
zAW@bXXJGktecB^t>c$J?`vp*$U+|sp_h9_V{wF#$Do$A=as!Tn(6Nc2uIE;qr+=}2 zd$^XHgZfRYCR797=ZuFLe_7&hFdk(5X^H=p@o|hlDDgRrk6`>RiQmomV8rDuq4>UD zzZtWUemc`H$4rq(Uw{{8?CwmzFe&{U!ubKKTKUppqWn=z|AOg8Qu%MOPPX|biL)EP z66N1!`U0lYD=3lvGSeSpI?uim>Az=snCX1eC6Rs}u6o#$nf`E6dR{N2k6`+pN$H0% z{d}h1kd%H+Po(EC{nDiLd-9RqmFedurB7%2zwn}h(<>=G?M$S9%=FGl=|>a(8m8|8 zumt|A*e`zn-T5#n{ZOXQVfu=s^t-Xtx9?#3b4lr&m_Cl_zeBp;+U#Fx_lj&WswyC( zp?%jIcc&vTd*2SYm>PG#fw14&s-D;~mtS(l-TgmUJLPPQl-c`?Q&LiHJ;ULPNbdLw z?M3}JFmcV^tLo1oti3F0-2DK;rnS+u-tmX()6Cu5&6QsrYR>-t$-7fiZe1IQ-I0?y zD0X)Kl0HxegRQ}M=s3okb9i7n^fgXl@#w03H3s8gjFGPRQu8{qQvhapw4X6DEv3mP z_9##-p7egNNYiDwx3%)C-Z$YqTzEF$(9u$EYqn9IB5qN$SrXXJY z#Jcu&)Z@(sB&hG~6(Zo{SM-?LpC+mPvR?cw_d>G(ir)y|$*Q12zI+%NLwR2*ShXJ7 zh|fM}7F*qO*a}-@OSBJ)Ek4?7B2P#G(-rWe0O2;m4Apw_xAsDZP{)DdX2C98irvQ; z3>6P&-iuQK)&`n5@9EdXu&GgAg{P>z<{2KOuB)8o!`pxC`1c}ebls4&m&Iv4Xs z$U7g=X%NeCU|q>E5-qPp%^Ld|5;uV|IzhBy#}w6IC$*BT+bbT1++iW;*}wL9@wMCj zeW=p3#O~Kcv54O2ha8s(8wh*Y<$lit@2ifWL6XE0o3yct8cmoC>2gmS!c&9=p))+SNd{}Vi!tHUK`Z#GR9|X@9Kh^Erz{tg#>b7sWCd7Npuvsa8 zCNy;xW873qVIbd4?k^+J?u@|%>&jk%MO`2|4p-Iign199{D^hU@(L7Zq#3@!RDYTF zA5ak5s1ytcxUj+7$<-)nsr41L)hMHzeiR0Yvg?9G*&Z3CF!=w-d-M3Hs{4;S z0Rpi?Cn#vFThv&CYf?c;5lw`^9i3=YP^=qup}3CbZ_*El=Ur1Wy-(AxHcj)ZKd*eQe_k~ERct1@F-;$M*ZfW0x}Ep3C16Q= z)~D-ZvfpL7v)UE9L!;LsizouJv2cJVQh~#YFkKZj_IQ#o99BblM zo|*O^PPMf}`b@R`vA3(3eY*xcupo6bb9ad;FTNl(g6A#x;YX&AQ`tQ9!|3DF?cMnQ z@)v%y=;Ixx0%`hK+Bu+)E6E6||KCC%H-kjs_`B#MO&W4)IkMmg^MC{b3JH^k8Od~4 zd{NlL0@M`nyCqp1(2gt+-RN2CZW48+>Ea6Hs_B9AC!mFM5zoQ%Z+Z6hXZXCVZ=Nos z3g&Lf-S4er#^p9jgeExdAcC@H#xa+i2#$v=rJG(Br3|%@vSXo$z#j49BM_y|HI5ww z2f;$YYj^vK|9L8@aLNkR$)5@=)o(Iyd(KnQ>vH${&_KGytN58t2m`rh(Yea%&SNQC z0ti)?ZORs!>YrY~p4xt|V&qd!n{=qhv5*;yAcdRFUl@diKrlQr(3%K6o~oeLWbZ>- zEn+f5!}^jn7!0R_(apS#wzfXH=X+$0?yDoVGP*lPIccNr?@#wNKELPTaR#F@4|a2- zSw`QpM{_A!6W3j#8%^2z|7kRDUhhUDgy=7XtF~j@>k1_&^J=y>n$t#SMsvX)+aAqZ zNH*c7^A@AITUbl>xcn%KoBGxN(`eqluEWv%TT!9l)*sE^2W$m5cb%UZ&Hc>dwuPG> z{%B;sWaH*#VJ+F?xg+gp&SMGM%G8Q{U`aQ=AEiAtzQfTd&N(!iE?XPTdlzIz^FHp6 
zZH?wADR6M}8!(YQnmjj}pS#fvWtsVZ7|p~p-DrdW{e>t^anO3TQX!G|^C-uD`_e$k zk`5;KBJw1A|7@fHn|BUBRyN4f`;7iG+MVw+5)R+zHUHWbB_5I;0Z&ExaA(R1$Wp8G znQfU`2Iz)7xcF@DD-&LmJ_p4=O87o@?vA=0F3M zq|A73ZFCD_=Yl%og{|qn+Ux{##n`HQyKN1#hV#tK2}nahY^}%uQ)#+BCi{KxGY6vw zc-5a8DpKpyc%e7ShQ9>etu2Nz026}KXM!PcLNeI198eg{Z+l#yQgZLJkcR%6Hw(nV z(Ofq(-Zsnz%U0eB%(??JbYovUehzRK4gp12wuJ%UB7JI^J*T0U_jedYO>6CuWkjH9 zhJF)(?I247t1Z|toXtCuL0Bdw5Qgdy08OEk28l%$dVp-fE{>PzUj}OJ0QN<$1K6%0 zCIql2tqs&;xY#w>`#3|*YUzI?tea2SI*<-$X$K(f766I+Qy{VPt1MGl>ayz~0g&_p zLE!^22eF9l2x>=lG@gOXJV}7|klE~mD-D^a3%3oKH;A_oDdhtD5M=J@L<7kl=ls$_ z=GL_VWJnu}bO`9E#?sLo%X{L(ZIjZ29Y879vrsAypp;iKSfm87GNg2_crReYyr)Hc zA|-b^l}}GKX8pzhYVUycFw~NgBkiHK*au}AYE!LK%33)c($l?dn=RCy1xyZVzR70J zSDVS+CzcCvJ>F_>(_qiYkF^b(rdQmN(h6}qe=4w4zsbB^hPYQNo0EAr^rIcV8omOk zR(^8L@X;Af^XuN*)-`U)%aBoaCh!FhGc>xX|RpX6_8S2wVUZk%uQwIW!eC5`df$B;!Np$1Gq z7f2GK7_`ogIDvBzIOYqS-i~wL08>s5zI6J$)Pw2g1*r$p&+}6MOg}G3Jsf)eZ|M24^z*#b2guo=pa1gH)wh}HJPT4! zhSC>=(wBtNUkatamVTa>dNcjJAoWHl|9heI4@2pzLg}rc^o=|R{&Mx#`Uw*gio8XG ziXbJsXS;LTN%&L-73iJ2@zU#Q%HP5IDIO%YaEmrr;|nj%1|amHU59LGSJD|cOJ zk_tfFGH1ysI5$s=ocpt++Rb0{w2Q7v+(|2+z3s~9&RY2#tPG`D>z|ceGUk>)JAm;+ zUv3V0FN63j`loKiUvLYt*{(l-;n%bNH60UDnqC4XpE||{L0o{g107S@-@gb$AfQ(tKR|(n zo2@pmjcQM#TIedM?{kOm4v<~hb^@y);C<_i(9PDBrZ+fD-g6^iP-ClxaWQ~;@VfoNr zevlt-AnORnZ(ed}k)0>ZbMH?TB3V+maGO&2{knGVm$!RAyWRVV?cR@W_x_7^?|Zj< z-=W=mr*`jaS7-O{gUox8`u|eq-HvB&yZ8U3->rTyFMYPTEgE>`6{6+>G&-QiZ_GVE zF#pk52tTC$9lm#d8!LkT;fCGD&uY0#cK4mIO?8p+3E{h&p%2u9RW}tFSyfn-IulO$ z%EK2P*Dd!ydWj0ohE)-TUO0U>6R)Vd^rPh=@l*P}d&~TQzbc1m{RD>9vQ7Ui#Zpq% zY!_}mR?t~!MP#L(g{GV~D~O=y=Gq?aRs>CowAldpRFaWT8chFSC1(`4l%&%-6WO*> znmM>>{@l0GlE-79w{mZ$7w2Bx!@ML0d-)5Totc-fR5#5xkG&eT4r$lK0WPkF@WLDwqza{$oZ*9`lIBuDEYxIA!@DnmPUFWQ<^$P>CQeN2FZ%s^{mvsiW7UfHGk7#P|U(fu;$Z9 zul&_3uJPu4+(k=Id3evYypAFf{su88)(U*@$%l2xVNJPL@x))|)KQyOf}IK zt*PM9D?2@D=G$k?VYC)8P2cL9?Od5)Z|75D+A>cw7ARzqf9S5Qg5#w6(N&6H4`8%{ zH^v>LMNorzg1IeUj<8=`u)UYiQEVJ98@FN6CmJSHy`ZqXo&KUBU^k9;KGpf%^~}`u!f}EP;`|0QFC@{~e4~e{C%7>IG`zq3t*`rzn5PH7dw(bDppV+Gu 
z&aoD*nDt%7tNy&i2nW{0H9!<^jOMBNUM_RTZsrk8Ldx``XHYQ>E6QSwS|pmqdbQ-_ z>rbOhN)^1~qrjh`vaD))GJT+mXMQ1H9T%B>s`;=cDC(&{wFOQfJs=ppSY%FAF1p&p zul~%wbf-FTF$Y3(zTyT_M4v?v5yY!`eSEHweiiBvJ8K-pl$wKW8kmTGWp-dY4c$41 z?r?F1P#n-SV+guWhM$okfcx?c33DP_K|3NI^QoQzk)1h_8#Zz1>`%uKP2Cw>S&&BjpwC%Wfb*pXW(nA6+whWwf zvfO86t~06SSazS0Ehnf^G4e?LH^z9!D3KVjee}hE22X|^$q*neEe0xdmbNbjrYWDa#Q7iE8fRJXq{V<| zKDS_lt6={M5f0o{#7mb7s=t-o0c)v6bB;xoBc_m5APFc)-m(BJ zIr=nmP=9+yXS<}FqL%K~zqEjG%a+=+WZTEeSBzU7+iOFt%*6Vb+7H=Uo}XLycETeV ztERgSI@(EblZhzz+V=R_LEOc9R%`wKxplL=M1hw$k&^{_Pd$*-v+Wr$_+HI9_$%CHLT0Ic$3lHdc!ULfp za@=Nf)L!P*54@X45&SLu*4D_x2AcZ?srfCA5HF0vP2Q0yWf9uv_?qt!XacN zu$cE>L`NhKTB-GVMLXKNQJ-8PA|AMa?y)BEPpl%Oh#3X{tBHfH&hJ1S{QEsJ)w^?= z(ZN*1GnWR7(i=tdd~Y)c$Ox8&kr$_7^EPlyJ;4Xx{E8D_9)mr>p5#2$* zc1iaugn#PJ;6KCtMOoah-8nPzu4ZxE$NnWYf@Y0540$T-Z-$-7ak$MuXe!SBDssYy+k`DJ51>DcHyMZLdn}_*-vH1UStwn>J zyJ1q{pACGVFz*?XVefy~ub+gl5AVVK`&cbI@NIemq8DHViP6UikknNrt?u)(Cip%t z6V%#f9tWN&b{WC46**?tx6}BfCDWaPDY1MOKNGawW_nUCz^9!oS_{;*!li$b7PWci zG+~P#yv(mqdzv_Pb+*6!yYY1gE6_f^Zqi~jm0gde8p78pHiyNRmaB!YVf?1?bv!Rw z_}b~`KO(-~&_XYr`W&VT;cF$y;OklYbxV9*fvAHq52PpHt2ef>@b$dJHFe!40o?(< zwgW>MeEmb1wfuKAEt%6P*a5zN{nq!v*FVUm9_&La!q-2N^*jn+wfv^>btx}d_!?FGBjRi0;jUAau2TVj{e(urSBw3+CBEiK>m&wV zOHaVpdDb{vZFW~G1N>M5wF7*e`eqnk16&F8wmD zuff|7!`B;liF|y$Acql^{T#6o!q*>d4vVjQc@(~G5_*!kW^-74&Erw{dW7FJ zzW&8a7QVXf^&{e|oZ2vrFZi%mRq<0bnccUlZD-U~B_?qoYBiWlB z;cMe-VSKf~2M%9vVC*4 zoFRO@jw^}fe%Tuz@+f@0!fzU1FY=Owuao!u5%F~@wX0KQu2X?Lyntlzb*%lmCBFVE zI41^PL{GriaO_LrtA~TWc~w%b1OAG>8phWEC%=$~8?jl;R0?*0ue)FQKKN=Omqv3A zt%$!`z}pYQS2_Lg@pZ7ANmRBIVk3mF^=NA!U!BPze0@n$8eeO8$->v)cK;Fabv3oC zQQTRH;mqzsjTr@^BYg zGOK`02l)E@rSF5U0o1P1{Fh(hYXJQA!|-(~t|A{_$H{9%Wup)qA$;v@b6Eb`hezS7 z7r$wIMR>`=*H^p#i1@lgo-#VM4BiOg>+dolk_RR1*DdjN5_Ai`y6GTBtsw#H)`qr053nUjiz*qUwFuqQ67zSUb(vrE6f*s)NcQ1Y)e7!|3jpig; z5x(96Z$AuQcc6=XeBC5B5|wpDY=rQ2ip^p1bq3u@S=8 zDK>}WFCHy_@teli@w{Z=>#*KGBEH^`i-Jz|be#(1;YyOlU-s*k`1(Ea27JZn3Ha)V 
znJE7H%$G)wN?LY+uRhO)@%5}Q>--zEWUi%P2l$%w%=f|9v*glf2GNS}^(^F_~hkdpLrTO4tV~A63o%T;f14WJN?FWuFppg=yK}U3~Wm zOMkj&FTSRd{ptnkI(?xZZkdbt7uB_42{1XuI`CLuQ7wvRp(nmSjN32S2SP z1M`Vxrru}m0CE!jM(c;Q74D1e#gkod>Sp%#3HwjfE;I+OafoU4ym zK>Ftmq@%62IOTxTkdEv|M-t;%OFn|r4%%wBYpov&jU{+XbmL&BVnhS3SZ@(e{I%5- zL0=Uf7@am;O=k^hsD|W6%~|}JW8vfU0##)bYi`d8?(A$?@@l7?Svv}h$$rCMV?iQ0 z@POAlv3^b0&6kYC;y%70N@PsFIaH0&=mGW@djw+k*J~68SdoP{mgy4R8k_4*;6UJ^ z1Xmy2{M+$7_BeTCC*lDPSv;0Vn?v$ycpE!vtrt14Iqgr@(x8^zdfAh-B8__}V4{A0 zML>_b*=_PqZ4^cC(?6lp=XANw9UkTA)i%g~#}}&ANa9+p2ioxt9P%-PTLn1mdtzvT zJ@1KMi2LfvE$>)(5!F#Yp&%!6TXX6%@Q6R36L&mat4zGuJOa#abO7s4WlB3@fmU*9 z>4r|Uq%mH;!HqF8xs|LM=V($QlRx0ij#U9+B`z%F-III_rr6cu`bB#zs@nF77H8U< z`jKmIl?>9ysY-v%f>xnd{!An}9~Zy(IF;ycPc=cWAq{V@XT173xn9{bUY}>Q3R`5( zPd9pVW?djW&O#(cFwSDX-C`Pu%xDhveeZNNo~@?Ab_pVS_e0>^fT9qbflZGo>mf9t zlN0$zJA-U4xwG(rnP{OL(Ryr>iG7@Y$#277j>ou|i$-(6Y(}pu$3hZ0b0Y16aoW zJR}G#5VJ)Ap(m!#iQzl>x6NLcttg@%im9Yz63`>`7n`wM%dD83x8Eg0~_o&`q zQLnyU2ge8cp(B2-4A`s3)OA8!2_ZkjdOu*F4_-F*Y@zf9`*z6ED{H~Puw`K2DpYi^ zUS|1s2Tsz$sCe>&E&11T{1|EaFFdYj9$04oW)a5z)I0Ve$-D5fJR%@VK9F#c=l5Yq4?Olt!v}mBs>0baTA03M$lRt;ts6Ov^Bu6HH$XkZ) zgeC-&?n49w?Rbha=MQnKQ#7mu@e;bW>7#hm&QE>JfBw2= z7;Yn=^FR`u`<{c_xxfQACY03 z-_6Ldzw=D(yQubs^xpiM{{s2-qX@E*v@Z$JG&c}r$N4E&l9I9iwjsar>dx-Se#yJ* zO&tyuGGE*ZJeDGNtJWSZgW}ED7IH1R6}ff=;wf#%tR>nUOJ=d)+2eeUJ% z9nS#m+2;V-@16o^2lAf}G-tAUmkF2~q+p=HpRqxQbcJ@kOZL~b@;2oypDReEM+&5cwo+iqzJ6u0C5z`n_OBGcZ1etW0W9_90%TjT3+ zOTbQ%+b-;y?!uR_thJW{c``C2zutb!_5VO?rORq*Y|hv*h(c?{{9D2z+lo;mTQOq( z%60}f@^O?gRTeBgo}!fiVi%~zFV4X;5MjU|*US>r^vDCoFdK zQwETd5wB@_s>!mCJ>Ndg=+oHhj6NMQ(aVOK}8q@89!_+I$1G zNf_zDZ(6mvd!tP~mq)6&46o|&QV*c*->uYKI<)8|%Y;J@5H zOP`K`i8RO=Hsz4CJ{@5hdf))0oB0#kCWCPCFUGyK-$z1q0X4q8+^2tR>p%&Oq}Gr= zP5Jt?lH|zbwr0?tr=XBhukv3=zXovJ5E@q(e_UOFZAia1`qd#U4_SQW-tb`rfSXIO zFakL@@LP(S(@Al1>ySK;LStXC<3&sMXA~F_{Lk4kj}EZ|=q@54F%`8vlRoZH-xiVu z?eLG+Ga}&>p6SXRYWNB|Y<|jrfjBtMi39&NEfq>Sm>HD%unvPcLN6s14o8qmD)cws zR=JdYNeRsd*mw1<93HX6Z?<%JY|KqNJmg{8rE(&ZxtvHNCrad4w}-6tR~}Up<2>`V 
z+DcTlb9?mX3`yVZF&wu_mXF|4X;-@W)JQ5Ma5vRNU4c9anQB|}gB-+oTJA-!Me$EW z*@I9}$pMK)&J}VNoEfF90_zL;1+}J7vTf#p8JYS8UPjM}klOyOy}q11d9t-Idfit&UC(t)|rSkTgt?n=>p8UKP%;tUS~Ci(>x0ZHv0` zS;v;<(%I(VC{Y^CO`XBmr)1!%FyHLSWL2|yXAtnbNq(hmoGbbW8B>y_=2yT~P-Ob1 zan=&`zuwGY=E;z@10>U}?@2T=-KTE5ZEK5-+;ZS$yn6V_!bHLhT_Uc2et|rdp$*h3 zm3I3bDMz9O zZ&d}S-BCqeEA_<^=Z$dQzbZCIeN=n6X^ z|K04)L*>9+&ck}$vWPvH)t$J?wx}|8VOE7>Vyg_tlqRYQqlrq6Q3mn=E?&37OH|YA zcLDiwCCOOUZC>v|GaS-ttYjRw+`OlU?>Mb>4P*65jlyPnP)_G(i@=VDu`{YKB;V|g z|JP+lmt@&Jh@xC+$aG(W7-)OOkEDrQ$nJF4qit()J!HA)Pbxj&+zvfxP7tDL1dDIY#vA+bTtiVi&5DuEjraE%!bD} zI;WJ6QvMkC^Av0>e|e5ciUU};DdF1!wOJHtSSE(m7CzYm%GzI9wKb}U?Q93E?u>oM z2TJwNSyYwJAwOiVvb#8m-^aX&^sFj~G<={!J*$gUu+-!H^oEa>Gm*D(W9J-u7?Br0 z=$-E9wncew@}s|iv&}lXtd1t~PLk^`F%AWFA(SvkD*(+@;Kg@;P-X2xKxbr1q@j#{ zCx+(ZW#jA>7D+^7q8EI4q_R|t<<^)X1rv)9j%zQar z1A=B1@?YU4v0MS>iA!aGF#EB4J`lL}o-jT{^?~dK=GevJEo~1EBO5s798p=K2xQ#`KDM%LH`>Z{qI^osC%x3<`;ZU zGee3+JMOU7H&BuKJ8$NLe+IVFBJ*?GID%ixTQ%l4*dUg%;`4%S$Maf_GX*kDEU~$8 z@)E+yyE7(1wLxP;rgXRBw1Y9OdB`nY-ES)zSCXS6Fvh8x1#|`!?;*+%u~@R8KcGI( zjL=6*uC;9chiqN)??`^ivBes5D^;h?0(XAD?F2e_J}s1ft^eHeNk+b<_osBk{|mRv z@c%PBZ^8fTe?4Y=F@;_kxe*BNX$nZb(wH5w< zt|R`}2gm>OGyMM#>gbUFAJoU*z0`5|pRrp0hsM7b|9|Af}n zmr!rd|BT78}cFzhf|jvJq_C5q!5L!HlcH*q0T;Zi8TC`FVwH{c|4l= zf`JH1t%nX>;{E~b>^k)IJ!GS8>~iTWIBQ`+e8>hVboB0!)-Ae(UCC7AQ?oxJM18I| zdG#AQMXvw53X{M@Htj(=hHhb@jSeiAj6LP|kpnez%yc*m_Z|?TW_8nctnoKU+>Ct_ z{{Uc_Jk66Pp|7>!zKDauHBf~@h?YiMUgNSN;=kZQFM;zzZgy9v<*K^Fu(CPqkB{2w zu2U8~-dgJ~&IeZmdE|tgSn@n;1`@j9lXh^yu+{SE)cjO;*TuoIziPGb+V)3q)MRWl zu=o3E_i$={ZEY>rq0D4dUDF44?y^$yYqV@;vcSOFO&77p%)|W{M}IP=?LUWe9Dq5d z2Ayh`797%31>_6EOH)dY5 za}pZQ;n$w+YhJK2m-&IvD;1H7(E;7iwX|C;vQPrGBZQ#>G=Zt<0LIgw~!VkfwXv@QPqz(OpZ=&^>-VlWk)`;qlmV$WXB3~9aQTO zm+1Y;XPt8dG6BYvnTK7Pk&7# zQ^fd|H3&a7SExDOzlO##AhiB-PTz=cumN2_@ak-ND+o@IynUI@)K3ru#{$7ud6V?5 zUsD9wVu^j{uac%r7aL(Cs%Flb7`vf@O zbW`M#&$-&>+|LB)HRg1Fs}qOC;)u~``Mb5-bEW%`{M;C4#G)lXHAnD0jG!BTsli#S zWe}v&4nZRpQCNKywa2gQSc{#D>{9#RpcuqOqtjF#X`Ey$m_H^o@kV4>Jv0vbXnES| 
zp>q)lNq6#v>H0fr(cXTW?LFjO)7-$In>9lxiF0UFkK0L`^|UHa_B-%VS%L$vA<2xu z776%C;4|y*2EUi{Um)N51`h**hf9KoA^wACAhJr`5XI%`iIi_!^t;B~2h(+EN;dt) zePlecML2QhIf10q&NDCMW6P7Oz0A(W)|e$I2Gw~2b^7!b_(PxJ1pUe+rjhL$MM<(QkZ8bi=i*Tf(@0F+)ZQe~UEeyc9c9 zyQALxf#8f;_sHl-_PgMrPGD=jxm=?H7=;`fd?nw-(JfsfGtO+4ppDFUHqxl`;OfUO zm(*PmX?%k>OP|QwQfQ;HjXTCNw71k0_-u@D=v`~RQ{s*e)1f`|p<|P3y?I>*0}#Db zY6!@OMDCS=oUk6CQB%s=QXit$Gxm|I9J#U9tU!de{3|VgSD1eRf{grKYW@;_@3G`z z$T5ij`==$x1O5d$o=PtB<)7(NGH-7Zpxo>EJnOFxzCZbg&AOAa-ZU*kqdNo?xo_~$ z*Mi?{JdjaYs$8Qd#<}x78t;1Zo>ii2%-BO+dK)C~$}I46env`08^Cv}+ujHN@)Ke! z-3Q-i-WK@CnYx|4*E z>8f7Gl~2^Kt?o@d_&3i_bI;V^?x?a3#P}NB|GYw{>j$h5wWrtdPi52V3(#+xULn=I-4}%D(~=+b+3yrM z25J!jpN_jxB`kef0){>Z{=reW2>94<9Qj%Lv^7AV{XYKM(&zj%VREvM2#{mkeTw9o zdi0?qQY`G^KLs@T6PUk4n(Y1K(&W1!@_(brCp)6a8-1F*N;G+Eh$cN~lJmhsCxLB~ zCL`FoUZOos9@&m2t$k=2z|rN1f|Lo<0 z>1q%W&UDglF2wMxzOS$(s$&&dEZ}TKrM>9r=Q($oeT+***8VY|nB{O(Kru9g6yrj2 zD|6%zu_7!+g05vievOd)stL=li8hyqq>CZg5MCol&`t6qo#e7(@g>NSXY+vj}_KyUTK{Sy~IdjRyZSovkUUPY*=T*>0JJ~5grF!){&HtUw zuUTtNwuWN$E+eqYAPynJ7g&;400YV^NVP!}dY@W6kqijP(iFyRo9P1}C5C)CrTLfW z;TAYDL=QV|2;e0ikwnbk2o~;9PNl;8eamH_8*?x z6TxWG)b<#Q`H9NWjuK3(BqicOu+31w5jrW46rTco`_9sVBv64pDl=^#l72e$Dq_F} zZZo?9$`COmttIxUX}n#QB6=>ZuR4Ea>B0I^95o;`Hh&P@86rfz@EwHEi{e>?AReNO zWW&r)d|C*Ipq)IB3@~5(KEU}K=-Gh%vvIy2F#I2J{+zACD{``7>o|7?foQ2c&YeiW zR1nUc6v)Q;XrJ-NSbF;&IR6HVdP|)5Z;$gF+PjvkQhHuhkm?Fv!}#VLThdEYzYx9; z1dJhk-$nIe{@ec09^X3>zKK4_#`gq<)S~jl2Rf$r@r^&t%72UR z&bI{kPNBd4e}nJt(rsJt`2UXYBYu~K?}s&w1K;=E+8*EgfD~W<-yO(%A?fv5T0el- zFaKvYuP6JBSwmkZtulPivIV4%VC%i8Q6w-@vfpvLwEzCo9k;LG`Gsj#^3PL4*qM39 zQ6ofQwLZVED!tXG4z?=k^|io`d`+}cVee_vQ;k`6b3ieVq2U6Gp>?Yj##pV;Bd{l( zM7IXQXv@hNt>C68PBGkAJW+!|H)MF#R`iOJeH%gQ+NuX;Hxqdd&2Fp;VRq9*vOl|V zXM6`@;Uzx!3S+9(9MuOg(snIOJxDy!(6&Y30 z7qKC1rBy|Re~wh+P$9ks7G|T zupeTkfJVac4yVCbpH^?qhR#~qXs{t+jm_?^y7;%8B<@A3mf~5=UFyZxMsArC>$BEt zToP&g1D&-|6_4^z_Es6;gx#w9tm16$o%2Nu%rd&{d~CAIetKC9#mtfH@ETv5uphx; z6OOKK{Hkt$hRCV(@iPh{J%?)*Dtr#<6djSf{6@QXcLKXs>+smEkM;S=G?3P^U=-gu 
zd6Dh6ZDwyYCH2)#Z&|K5kta3)8~!G+YU#y$6`P~^?j_XHA}@DdiY;Jh(42Jidc-SV z>_vvo@shhOU@$o?iGjmyo}ZPy2`1K7#Oeb8D@&~2PC{9;SqYuV<$1XfieRb`P@TwmPolEaEcSD(&^CZXXTm67 zds#l!N0H=EN^QOk8skeWF`8f}-8alykq*2s^=7L!{+i`TEsAlaRo|30zz6-+Wo`Yv z0mLxXR?0P5gl8QHuuzQ8o#XN66-3M5sl7l;`IXodc9G00H{%w`zjp{sm8qcaX?s^@ zo7o@*Z|frClWFSTQ?5S_6Q5V$?X}o#T3zh*S*$8c%p--kPs-n^`-2KfU~)*g1%f=h zX|P1eD{|;n>>%?`CdA_9!)%N0jPA5}j5M;sp(`l_c_dbteA4%ed5IkpgIJvgR#YSc zVzFM6pqxd6#8f%7*x!uuF<^Hivcg>I4LIn;mxP0#GJ%dIE(Yvp;0$3Z2?k4TaX-#3 z7B^m8d+_=!w}F6BG88BSk?r1`O?j12VB~{ z8y$+}k((3p%jeXs68G1CBF0{)_M=IC%!LI0-Q}ij-!e6tjh7!?UNvfH5t(#`iS`=D?^(?JL8tx)X;)?wG&0QmnB40=H zA1)QR^(~9!E%k`7inN#Z;J8SX^QCrznzY0eg|Z9kZjnc!NJ{PuSrQ`6Sb6KXWQa6b zjTd1CYxhMW(ewt364P51mcQe+mO2;qFZOV>w+X$On!qx)lOv$Fgihw9Y?#B8`ptQf z#vR4lFfLG0!;$1j2^MwQtZJe(J9H^MQN^@CQN!r9TrlNIWJq zk}vw!9YD)lbnI_;1|93Gjy>x-R_Hobs*Vk& zW42D!cYmn9|8A|mlhgG*qWbK-FxhYM9d@AK-cNmBB8tqO*Mr^Uz_*ofD5PMOFE>Z* z96S(TjXI7mu>9Uqr4=)P+fXdfYhA3TY%Ket(;jLXL@Z3hJoM+tbbRJWyKZ(U9|N&r zW?+7c?)DJ<1rk8wfam_yos;Hz%N`z%l|oV@+Xo%9n^WjmOLvztFV)2Nbf7~~5a`fu z`+7;cPzMpk1v@(0Dl}7e1*K<*)V_P1qwBw8oX7G}#IPfhND^rP1nc*)B^o>o+eQmr zzpI7RbTZ%0aTg71ieow zKqyH6Hk8h}2if*ppwHRY!t$p*%ESw4a!`!*l&G{ZPLeXvM=ST63hsyBZZU347+Me;kk|QnI zGeD@sE^4y(&z`j_t0mNrP$XK-8ErB>6w7k*qRa|^5T{lo5CnoXh*P^x5(EzfqqtJHl<76mr8qOl%)oVW z9|ab*y|&e_WB6+BWay5nLRT~q5B{lHhKN#!&qtStwm+8yA^mF0S)9kpyIXG$-524n zxD&pRcE>CAd1YWkm(|#8@~=ojeP>qiIyNmjy{U@Y2o7vZ_d}9%^746RaS^9 zy)3F+PLR#P)}WBZ#S(dS+(hGVc=02E%gL5Hy%>U|B3Ax&q(MiNq=C>Jof*(ZTImP21z!fcu-n^CSx55b(B#RR~{RW(H z6_X{MCuAbM94mxFHEsB2{ru5c>`;-DA!?{ZM^Vf8ZKjf6@zH~DocQQ6DPMshUmB1PQ+!t3w)~d#mfkKBSQ9lH5J@HVn{p!! 
zpMf7Q;kCNk%obI&D#N0R4f?@eZ~U0=Rb`)v2_^Z>XJ|*ub5;wiFrR%cj3Cm*bZCKb z*y(FA`#aZ95Wl@f@vd$!x)s(aB6$lX zaRze~XH$H^UOiD}MomN-|3!&*s=`aufNZJlY-Fb~Jb)T2;|nWit5>OfDB9ZnzJ3{q zY;U<1O+h0ef>20D>tsyI{pqKWF zH@0f{Q&*FmcvNM`BomG@fiE)NXvpIihHdk`d{*iQ>>XD}jWm#QP|QLbjob0VqcW69Y$N{r85 z!P_Zl7&a`=wnf>qycb}ermd!6Y0+c{{aMghk19dl8z z=|Wbi}7-p+BiO6D@97tzNpS33zgFLfJ$2@;PE5~qa{ zPYDubp9asr4?W)<%KQtTk0XtmoCS=^>}s~?&g^D+@l*4FHSE`zY5c;I`YtS)Z%`d_ zn{Kd(;p~wrGnG90Hs7LX9F$o#U7m;@u+4PL`dst( zD}=mp6ZtYlU)c7`gT74ROAC72!rhFgbvx(H5C?inLk?s;%Y{VYJ5RNj*eN9Ioo6*M z1W};5gWFLA-vb3XNIf}7D(oZ*)6W!Q4$7lF(ux}eiwroZcsAw?%Dh5F0t1%4$r59U zx{Ui2S7}k&SrUu4I)90*(u}usvg(!frIVOq$$=v#!m8z8)Rm`xN`j-`7!{RdT!4`V z?H`E654K*u`i1${h>kbG=HExf0GY|E7<48!u6@1t8v+9xeOYbpYSRm!@S) z$brLeUQ5L`JZ7(4Dm-=Idwk6Qm~X~e3-dj5U#R`XK}XW=IsdBYc`c`>U1+6&M-fK{ zh}GYo?L;jA6D|LYxHEeghtT2m+3Yr3_f;1xZSCI;vb==yEz5j)lCd9AJCb5CxSU>1%cFD|S(bU?Ck;bJIzz9@Ns^c4&0pMQv9CNV0 z+FIlcO3GE9fK;gDrb>jGH7WTfEvP!(o`m>3)B*>2>Y?c#cq71Sq7k6o=mTsI1BD{{N26<_wh}E1!oapEjT@y{Q5u&d_tCB z$IfpjS^i!^lKC6#yo)6+#dPeUVevZb!tsEv3L8}vR(N*Ni;$k}`#D6mUb~vfk zi;tVq&eFDCm9tqimHxgL$y!zR71sCkSaRGHf$*WR8f>E~*3>UiW8%YS+CrVQMw;;8 z{Oajcx`H0yi81<7T{qfE+ZB{UhkRRkWzWN(%MwC0idWdal0tSgZax8MeIyJ9))) zbJzJ|LZFs7<8~R?4id3GU)y;w9$XlIv|y7QGY!rYJDT$;6L~DDs&!b<@abh=-OzZM zG9`x3EVHxI$YXbDBo#M23VXCQJy*8mhNMJ7tj|kwMpfJ}Ti(%FqEU5M#^*G>Ly0qa zj>cb(uWfpFaQur{vaTuqT6`07q3PYojJpKs6Jzl=HTfMC@5`a~uQi=s+zHvXv9fJA z>V4dFTq{K)W$v1+YnJqB8*M3hN{~2Bi7Rd5T@K^Hq;#68=SFzQxd~sX`42NvPT&Hd zo}G>kZ{nDi>24xA&r0d4=~*&|?d?7d`M9UJ@Da>Ehtx^vtT*p&R)@0YpL0}S_WZMc z({`K#PkQ|(wzl1;3UFFw3I2Vkq* zAX#>wm?YV62sL7w^tc&Wb^+v(G5IME#oO{>idXZ$M(8y)iYy%6=cYQ_S=>a8fpX1$ zxn{lll&x}oY&j9!!bTBcZfaMRn^85rsZr#{)4!VakW6L)zh{jum5Z1p{R|D zE}kZ>G(D@Lw4mC`%H@4}1O%{NQDy)bRg#KwE)Ihh|2H!CIVoTSb4vueVdb{kc8_t+m8DA86 z%raWjBJ+gR?p00n-}iM+L$C`jX39BI2Joun_>#zDRSh>JFXcm7TiI8LY}woh@QNGi zCJkaP`_NIlTT_1lvzE&c7p)VLK6%VpT0{kR8cJaM6=iQLqSRuO3YTa)7myX%@V zdf-SmfV?;ZI2(j2{-*}2wQvhlty=vxR&G2C08wdJ%Dv>SSg5M1cjl{-TrXJ6x8L>wo4M zJyv@;{l5pGn1TEk2)`?W2Yb*nL7u536l 
zhilF>2jDU0VVc=nQzfz-6UwrK#=V^;O3>=vU&$w73w^c57P{Sh#zRZ9YD`_oz?J>Y zi(EaOsK>=uH@;L`fQYQam^-|znWHv{$MceT@87L?T{lujW%RYG>LCA|ADI8{ADDkq zI{%dH{>*FlE)$UWGA}WqP_m?ce*U~Efj>w6|DpMxDguwGm(C2}<=*GEd;d?n_j}Us z(t2@SIF!#31pMpw*x!L{`TE+P+!8hMsQqhum^zTc!Gc|4$#ZOn6&|W{P+j~5=aF!< z%{SCg5Ym^pHVM}~?Of2Q74Ee6tnMcaxqup|z)U@hUT45%Oxypq;mp|)8hP;gLtadw74bjlgBCk)&Oz3HXH|cBrOo;FRkS0pyOpH{| z6VSQoXw;NSDr{u~eV+~T=31}mhCDC6?j)%PTYY15mDqd_X6Gx+&c~dAu=PmeMU2ynuZ;Ei!1a7nr`j>ZwRWyOi2rkIap119H~27w z#eeHe{jU4Q9Am4GG!CPJmN#sAV)y&VqSkwu1N_hZ_^jas1$&sD{uiR=?3b2*Q1?X? zY;bkF=wn9WR+X)~ z%d>GI5oldhugtO4gDU$ z;>9~_lW%%F9ErA4Lpp`tD>atbr)7%Sg<&wR9W<`p`4W~mzI^kM1HoAN_(!IV+46Dt zZ#s~V!CH~y1ACzNQCUz@EF`NzMRK^K6^H}(xK53QsYt`^vUw8lL0mFJK^C|=$WxW0 zV5*7HCti#_UPX++a0+Ig zp^x!#D`19JDl@9gtN*M>+$MpgSTFW6)tQ)VhO;u#k$5qfre5p6b9Hrf1%QlEa`XfIh$v9p?+<=@Q z$JFHOmJ^*kEPpT3a6E()#m5w7lm*og1%x}Ce8V6kp}SIq6_@N|3_k6E>KMWQ{UgZ} z_!^71RL8%pW~nas_~fvIa-wrS?pmF^ysbKSL5w|CiE8u!Gu(B)T>_=c7C(#s$^6^QEGqjP@FR(%n+7qQU*|8ZG>|B7r3@G~tS&Owq;C zC{8z{hebbK)AR#XC5NZw4}>a-k*3vHIZM5f`~3 zb#Kmxc@m_t+%Kdi%bPh1+va&E(vYt)qyKi3_KY+Nsv5Un&Hv2HD)bY+$ zK>L{FsXchp_@l`K_eg1nfO(1mhKwid_q0Bgqiuwh30dOKL)?|Gk2JWjrlCa&i4)lu z`>atKOO7oSXArBVKqdk{TlSz{W{%a#OA5@58)S?wEE4GvcBOdKXyVKb)rm{P( zh9x4W_=jvL(8jeQb)&iuwhj^H)$zC0MMA@`Y>m$OCNIhviGp(F!md!@TalZd<|Udu zb4AOs8U8DR|7s`50^|BtW{D>i@r;1PHXCHu2d?zu*R*N^8Q&4J*jDJ(w-ha;s`^#A zwZL;wE7!cj$II*fMfnY4C`66|bi0T*;_sSa4AhhL^^Q$vd8#P~@*wY)pDTa}T9MuhM>;X1*zpCAm97EC+MjH0Es5k}Tc}O&V7W0vz z1p-0C4y4Bb7NfMA2~hzKMA%r!iz2JPBMtjOPquPl*K*JM$HS}^&5gV8(eF3ydm(m8 zQ5VX6M2MGse(5YTA0PpM)TVTK5cvgLcy|)U+F>$`JR#F=z1I+KRzwD1uyAR7tCk}% zzs{{aUJM7Bycxq9{n*+ZqKL$v8a)Q?t~oI_9TcSqX3EkZhY}N3bX&O^>Kd{9Ub(KEp>PJ!-3w9UK3|^saW2mmaGBk|!DZA>am0jlFwS ze#>3~Us`v9#6Yxs{`hWOS&eO5CR(gcj8eC&RY9x)=2l0%A8y=lPL4`Mnp1Ksu2tj*+j zLg{@fxJ#h(so-gzHP0Y_TdFfeXX(MhT2bJMQ&xD1XYeD`7Dbxy zhtiZt!t6_q#^$RyURdErP*|b4{8&7m7}T}_koJ`ssT2iP$}?dN`Wh<;mXjWH*N#&V z85la>TP846&AFu2DC1UUC6R{1q)R|~MKsZcnJ!kAa&4#vTFfng?s`Qb`GT5Trj^Ar 
zzpzARhwQa?ER~^6#jQ2UHl~_2D_%SRz(E4cZOVwxqIHqKGK^M$m?r^e&sj2W zf8hp5g=^fH#@I1+VN8NU1!Ijg{Myp4I$FQV3BQkyA?PfvV|Uz%X7yX_L|!=5*1Pkq6es9zq{@mQOH9lv z;=Jd7uP;y9Bg#-HD2@s;SOIn-nq_b=Bw5}HWwIq3Y)11h&=-rpKWmfNG1;$1qe>3E zn7fJ0D<`^z5tt}-3{y(s!HX|Q9Vah)W_oD-Ni`#|eYfzHncqQR-}Rj(*0w2u{VZ?w zNw~r5SB(xK#%^`2|N3W<$+ua1x%cw~IdBE$pWK!3;FE6OI-ULwUL^%kkt1ptk*3B- zW7j-}#`d?g$68wo!v3~ozX#5xd>6{M%vF4Azu)WqOAc0U`Hy|SM(-c#{T};%hTdP% z`z`i;u-@nE{W|-8nBE`ZJ*Bm@K;I_&9q>5mQU&?aVCcP^V8v^->P)DCzO8pXwt7$`r%pWdn^51rSF-Q{>UiO zV@mIqmA+&c>3x;H29cJ5|K%FjZc2YWEB%SHN$;Zcr?b)zQ~IhD>ADXsQ_poue?{px zlU^BLUOC6?iP_v#0*%&xeNN>1>G0E}uRq1Uj9h;KiIwqp+!Cnw)Ul^t-~Y7*7x!hl zHtFlgemOZ~4^Ev*mcU-B-$eN zwDB+8|2@_oYEUpJ*|&7oK}e~=hzE8a$UKX;Ro1U&(#s6ebVsrchDW+rMrJJbI?JR< z^m{tVB8OX_qN~yKU){5@$7_kZ%`|{}d%QX# z5=n*c#!Ikr!a<>0rpjD<(DVn_%y6Ny*^8lvoK(eB$SG9YJVd6sHKGcT^d$f(JWPkY5yn$@Sf@Ef4~N| zA6V&lG{=0+8DjRP;8J^4Y>^gx5_t=5=$sSTiGvm*J7MQDSy^ESVthLH1o2RYBP^@UkbnNflFqjCdvcyH+WFFd>T6+jnDUfN%_M*Lf* z11FjDutf0qc?4J^&6p|QH-?|;@=t2_h{eB<9%C5w^G2H+E3hca->zd;`KYoiY(%@| z-~+QiGxFEZ?LDa>(0d6v|YJ*WvBWNI#0xLGx;1T=0sgZG`p?#3+ibY9Nn zoxAF3fyZS&c{w)sdE_SAyke#}olLorn>ty|uI!8Tv3{Y)^rx#Vs#l|lL-gBDJ@6;sxNUHjhRx#uwSX^FIyB7r64B zA~zkHE`POO9(TC`BKx0^@)XlU@>>y_vjd(?BJ&qKCAq|c4^XlwHg6DqCO z^-q^MkYxjepxY_J?46J${=B_734%j1BsV85$;lC)K547)BC}G52KAbUsL%F6V_oDb z)hw*SGNwCj8rMTmDH|Qu1Z8oQL^$tosm9})uvME=snQmdZ1JeF?5k0jwquyrmc7Jo zsW!ltlb>VC$S28DW9Jt(U*>BW&V;s6bbpN5PDAGYlj}Q6kjO0n|J&owKa?UGsh)C{ z*F_)cZG?tAMmLye_Tejkjnc2mPV>BYL-9m)KfPr?($h_3Z9^liph>qu)eXfDD7)HG zPC6DUtK5h#ciw%bO3jTYk`XA^G8xTm3yx0>(s4e~J2bm%97)L987YO4K05G7JB@W8 zlfF-(rc52m#Poyj@~fg-+&TU<)t4)`p0vFsHR$$K*L_6y7}`MkTAABF((@5J6fC#$(`dw8?!Rb|<~;vpqFzSWmuri!GRNr&(Lcmb8DWm_pGKNr@FdB-zx`cr_O`$5 zPf`vjw#XFdiJ4lyO>v?pG!A*Zhc2ssfnZ-_!Th33qGSOXuUN7MRH=|yIU(mx}8 z3T6!VRUr|p?ExiqI6_u*#+Ld+M!cmC$D9cCY1vCp!@}636cKMLd+Ju3L1V`(dK&qj zV|sRm z3-S8ffOD9O5~+u6O@l=%RXIgd}KNfoaum3DF z=#fzRM_8vp{xv*j`tMe}ZD%y@lFZTh^FF-)vHdrlDtYB5i-#}xf%5+-P)th?6wKDl-5y9UWoPATe9YQNmd 
zm*BhzTIKskZp-4`L-rh_>`_)TK>CY!h*%mP)YR5zbE2&!|cQR3S2z2w zl#E^H>uc=4HwI&y1v0?U@O|5_mRUH9L!Y}2o^|6XqX*Y@9C%KtyT{|cYFJhT7I zFLAO0i*`WM^3 z>1*Nh2Y)80ncf}!vy_!x=-&~rti}(BP>T|#Ce8XXI@m;kSYzOIMCi8Sb1X&3ru zDc6zGnf@LAW%DJUH9OG3SrpjWe1*GF^Cc=6%+~~pIH^S;@ssNSBYsf56O`mGGJL~6;Qe!G+1_$kpp)-3+>fS5fZx|Shk=Pb`1RQeVYjZgFGCgmda z`>KFX2&0_{VOi(0hkY5*%Pwzv!q?2l)eE0E=9De&CT=y?nmXNg5&bN@@sj_?_V33K z{O*sel*!KHu-V2Ibe$}0tM@@ zj^~(bl3NoSWF&@}d0$L|K9=84_vp2ps#P?j?!IP}FOSwVG+`%Aq3VQ}&u!e%sB71~ zCCd;+Vi*!ju1|jEtZR)Puv;|M2P?cjcv;HFFJXS|@&)uF4;Q`bT4j{BYfYZB!d8-p0Bje+%6D|2QNX&Tl?evg_vt=y~L4z9O_o+MPJ0hBRzlbx+ z>IEZdg6;&7X(GKPiu12A%&NxdrIb%8;ETqZNqpx9!gJGwO}=z_ZS2rvPI3#{rDd#_ z`&1`|M1*~!No1_OSn{Q4(i{9-wLP!Ww#O{@BxZQkX!i#_ zm_=Q{+ms(4oqTpI=5onZnpmua_FwMwS*nRTpcm$myvBk?y;tJ%k+sXB2|XFoTx1JD ztp*3GqxK05kI53Lf19~MQRMVl=GPFcP0g}ujK+}OdfBB0)C)kzvRDOoP&QvP$$yxy zZNQ{iiRJm#i7!URPpl@SRgXtUCtt>%DaMM^V;27{Cl%4Ehn?(=d92^b-ju9e#7Rog z;8#^k?LsHfQdGk;F#?}Q?H-N{jq^VpjS9NdyLOVoJ5ncFI~U z&EF0{1fIYTpDIq{D-g`G#U{j`j`ViXOy0)1kYjVk8XEagPH!~4irx)M75))3mT2l4j8Cs%yP~mW z{k|_1FP6)NuU)Lr92GMQb0O}*^eQz*roQ#x0YDwMVIIj^U(>A93Lt8&nC_;5<9D9u zwMy4WzqHn@(sf2p%R9Zx&er|6h1I@lb>*Q~%#3KvTgW->;=1Bb#nl&%! 
z!wc`d(6rHV6joJ~A7%t)IY}$yeF300HpoIpPkn6D$*bG2md|F33_-n>7)^|mY~Z9p z{Z$^a+Ry_ZXURE(Z2N&e5uF)s(0HhO=)QnS@bwiz=Vc9)Z@+3F~I z_fZqMM=*uE0lbcO+QeYD+D4|c^eh+>{)g3tlP8mVA~wssm!y3dp6BQt9Bv=%(R{){ zX7hVk^Pg6BIW$0TXK}FsTR)yuiT|{2`oO&SI=o86a3_J;OJN+R279q)*q8l*F!*p2 zKi=U7h|5ARjRh*|;=+AcuDW4)k@^@DlNi-EUna|7KA@}7pt**HQaMkW%}e&#wd!6muUE2ijt-sa@yLM+D_Sq_cE;PwxU~ zGYfA~soM=WlMQMCYxCNnZQrLd|GbC#6wQU*w^SsN(7qhzBrG#C5*CUd1Xw7|8UA<` zKeg12^gTJ#|WzllTmagWu`PJy--%X-{A^)ev!3xuN|BDG<>vmO>?Z}LvD-woavek1c2S*K41LL9g7;TN^r z2JExg$0TM^pnskuZP>Doj=DwK!tB<}6U;Srz?$+$D#p%I)-~)_ISeDu-y;3Yp<<+> zW-h_;7R%#Ret+0M+N#1Z1hqG`!YvKsEL1m=3g<-{K4ulzGjnM+Hhk#lA7S+bij&z2 z*+`g0LcAA`JMf;z5?M4fn^!maZY~X~^j<$uQ#D&9u*TxChXMbDG&W<;MOR9QjmjP9 zBV|fYa{Wn`?(K;ait_8sp^S zq~;c2rMm+Oc>@mC3UHg-$6IG=(qSas+PRKfQ-`a@>ND}|sG4MdmcqJRbv`nlAEbSz zjlekOSFS1k!qHT|WJu(NHS9$-}3uz_#Oy&H$xpp`HF)29%Yuz+y{r>SrHV@$vFzIPB%7(Tob&|Q0qpXv>{)&p z`pWQJ|I{F?v;R9Y@h$1ca^lxz;+Ln}2W88y%!zN#39rw|GnepJ-JdwuZp$E9Wz1AM z-Jdvxdw@IKU-;MLnQMf{QahDr&MW)c@=SkbF)Po^_@2KHEBicDgYcy-Y%{V zVNe{ix9$$(S-q$D#X@kq_Z&+JZr|%kU*cfVs~#YTAhFt{4 zOs%2pu^OzAkv%GE>i#=|$49w>$AM4%AQU`q7grYw9=(IP*qP+mP zVPlv+)0$|sx%Tc%O&xNEa-D{hq2#g2Rv6ZHLI@hZ#DVQ`gP11fbS^)9E~?_!n+UQE z#g2D?ADGxXmCeMCEBYys_X$gtkSlh;Dr2~`Q?cVtM=Z999h*GY$LHz}gWbvxmlY!Q z&CEeY?9fXa-H9Fho7OB8JLsHsdbo8oD|%>PT}6+fx7%fOhOJ$`bGhTFd4b$kjP3xbK{}1Gj4Hi|!>~|-3?7Lfz+~Fa2%u*-DYeSi^@%KnWjdjLU zrE4Ls>Hn}94*^K>#p-Vmy<$iC;#kQStG&M!`&_@Ig!hryed(bk{Y56cJe`lk8cG%A zL0I{JmlL0t6YiT6uFMHf55oR)8QnRb-2UbA!hUo%$Cx&2&zCWz-RU#*i6vWso*ww* zhft&aI!K#7x$U?kc(dRU8f98#0YO7n+0-XqWyDB!N8wf33 zlxWXb4dO+i5ADHd?kJ$glA1P z?8l$2)uEsBsu}FZ7vo?*uIVdyFh4;{-|87@SX`hcbO(~FBja>YpJS|nbQ5^shLwB= z5HW~vG+kVRQGnknu-VAWb$$iLDaAXoFrR|B3)*xky2)+*ki`X9I zgURv!2|kj`1l_>#HVHW;m!Z9m{NfHOblj5*^t?}EYf)sz`}8%+O&maXYmdl`SJn08 zn~{OV-{;^A%krh^9{%g?b?Qr`q@ZWy^rex01wHIax8CuF2XGRt|E%7;qG$Z%Psowr zpW;I(j=6F^xj!NM_0h_|U4ygpl7bWkdvNOLT77=%&*cpyLh9%t1%83iM-|9J%OD3w zFvm#0I=#rd!?@JFh^pO`N&J>S@4Ak+m)R|OC4Cq!dD?WQIMD8B=niQAm1@xY<1VD? 
z-iCSW$V|TiwjePaw)SzBzSq-RRivE$^_U#IeibtmyjhgBJK`X)PSqPwnGW~PzLAEv zfaW{VAO6lr!I(1iT#@?E1mY=7>~Y$&k#K6%Rd@D?)R(GS_wjjDvhZYOafe1u2DW-I z8m+Wk$nb7qvg1+UP3#y-J607V|8f1Lwp~lpjv+{{Np?5~U^Be*t$|J-jy*QmP(L2h2F=9iGB&L zpMjj{jhzs#Eww9L59>v?5Ux2VVkvUq`pMp^B#vhkC1e&LyClPpD-QsBQ=!k3opl=& z=w{5w`@T7^!9+>y2T#=l?!jPi68puv4#XU3s8Sb z2QvCMj-d7*uC(G4_*rmoU-4|?#iB*lX3s!$T-KZcaC50@04+W>Xp8& z1D_UnS(`#fgb(yB&!@?C>7%S~breO9da0BCcJ>?R&`x?p_Iq%Xmh=}$lRm!Ys~{1g zD3&u{2Sx9vV z2$gu@BnC2G>*p5dbFjN`r6~x89uB;ceBoInS1LIeW^?*V)s>p+q%T&0yJ_DNyYMRg z^5(I8KHXoLejfcw7=Aw|ye237dJwKYYhmvnM_T+Fv^=Q8G2Q%FkGwN^XxW^y20WoXHDdO%3hja-v{qbJOO`-p$$vdP`R7L`yPjO3gA z@a6blAM1<9k@Rz@U6K~jT(I)HAFR-SI#qNc_qTh~FP0^)U7wDo=717#)A@EBw^sE6 zJEq|Eg+4yy_=Wq_SEZ?s`_vnvoB*lr%7#XMA`O4wUvoywTq5tzkv&2%p@0+eOsm9jN2XKI-;{Qa+=HHWVoq`yOu*)+05k zMao6*6+M`PKgRynBeR!9 zD?f_N)B`1r{c-=MMC+>XqZT$e%voC0WB%v3y+o(k8N@|Xgj6phR$jq5Aqqbi1>amFgwL=p!gXtXWNfzQP9 zB7t2WCdq&2aPaEReoc<2mRz5 z5nmUYPYieH{eJR5CErN0cLe`*V)%pmJ6-V09_+FZzZ?ftbdO)2=@BmK*TpYS|2YT0 zwEmbe{$Ir}_if*ni(i)D$o0RAU-n`U{{nv5``!Ol_+`T6|1J3C?LPk^etD1rS^To% zpWFV=@ykpr{6C9d4#;#+d*w^<%hoT#vE*j`3jDJCOA2+yFCQJ1#V_t4!!Nx~07G21 zjb`2AU*kb2#4nfl$s4Lk{+^#a1e_A$ms9-YRZ1R0vbTmocE&H&XUWPVn%p8c;A`X} z1_lQIAJ=^_UMMVeNs)8t5%N4%;w;^UrLO!5BtMKn0|w@>Gxg;Txsd~48>*V?%NKx> zpi1Un0+Y``-Zr;3-sc4%rG}01^CbH;DPIdd4E3AZXU*uIJhFtN25eHJsop2K{aa-s zDqn{87(K^fgo79Gagq8VQXz0HJ0a7zZ^>_z!Oa`&hSXCvi~*@JV-TQH{)e2cc1NMKKHy$E4DIH%IgTfvHVXEcs^>vBFw- z!68y#tKL++#!)?^C}I>&{--{@kcDfn9B&WmZ(%-c&oz%nD5hqo6+}Ze1a{pXj zW`yG=e`1pNn%6{Rx+c1-KVvCw z-84^(9@;DB9)~&l`!a)1zT_i2?yw*wiUMsMj+J6i-NCB)x z9(}l65ciLh4$K&S3hm?5kWBY1k(dm(6!qCN7)rQ-d~@MI0)VvO)l! 
zqpbLcv;(TK8C!#v7?01p{G>w1xyfepC5e;vJ0Lot}fdX`cz7 zWI@|lx?nwg_{LHiZy-%7BS5!WypLQ2mm+GGUQo49G^ef~I>~h6O|B|z?}cv0TR;!< zWcYqS!IntF;Tl)vw;~Od8gybMVi_*9elDhHk-_WW&FKhLX5`K9x&_;v{1=S}$4Y&c zQ&}3hX(?Q3(f&{3&>VdcTYjRGEGVz%p|tIq4)@{mR<-F4oiIYq!M1o)Z}Yi|fx3qb zPSVag4N~udo0SGMuRwz6FZIbUdn%5ArI)A(Pk+Msc*P^oD$>iu4*;d*kUdk`Dw$70mB zmRUC^q@&=6IWN1k2zL_utN)|f_l=D6z3b~}?x%4-3 z*FT!7Es74WMc4L})-;37cpwv7J*5glqdbU_{ww8XML_^F8_60;6w_X`(!-}W0bnQ4 zC*n3=zHERP@5q#<0UQDFO40!O6hx~reQNVtFg$9x|jCF0Y|() zga`E;FJHAMF3!tG>CEm3S{8TQG^Fqm+TKetQp=Y}6V-~byeT;Z9s z(p;zG=(y3*OowH{^IB~bJ&Z8Q7T!zHnxstFPkvcn49)w_$Jz0{n1O+n z>GueCGd|v|(StSqy7BhBh#GWpqLadOpB+BVN$tjZc4AOhkqHL1VqK)(fmd=Cvv2a*{WcPh^F~^^Frsm6Mv^3ME#&=-=*g z@|)X^2frTcB!`VoE?@U{^5yH^e16+&z2keybxgx*p7$DF`~ieFzDs+e8r;0!Xo@GA zzKRiV~mY*FNB+cA?Ot6mpV9@#m}@-ZR^N7wX4wMCNx$z4e@w z-%EPDQfNaxRgc@L(pX_ma~O!xG`LC)NPl9Ei^8xuFAGNLU3@C3peR+q{Fu`e`CO|3 zDn+$b$|}y`QPrUFV(y9goc9zsJs%Fr`>m)xQHVf3nCwk6^c6&=9ME_b^uc_B8r@$N zsjuln{uE!SDMg9xyF?m>_f^C`bA~Z97(plZmOr;h{nb+c$^bjK9;5O~4o+E8p<&k2YIcp zcl1|ZIVZEut}5|68`5y%zr441;CNRA?TxFly?y9&zqh-7X>Ui+0|@INI%_YaGQ9Vh zO`W@4PcNdWO9zGE)5`4VPvD~&;~g*7XF>_=HMakpz4E3-fe!i_k7P7rT9jh{wjpnZi#kJO)YB8&?Ju)rtj)8m@xao88;bo8I+)%!r?4faUeNPsL zN6Hi1S@Qc$Qt0W=;`j6csQF9o==qainS!Nu+$VewfE@W;@F8o;;O9&iVZGOyh^78) zj55+{#2%)Pv7roB81Z5sJ9R|#UKza5KrTP@1W82df5;ozwCiid!#lP|?PigYh9LrA zgu*h$koTB`LTu-OpIZotJsPOJt`Ge=$%12fA|`$XU5mOW7R4%1^Pj?_R?8yw>omN? 
zXC;vts})+B?g{op`RpXuIM05leQon6kMn(d;h!})wr;N77)`E^6=f~x8=XR&(4UzxN2R;|%o!p>kWm@v`) z3VNG=s%0XxE@3plMe^lCrGc3Kc~cBmDZ5P!)Ife2#)=qATSizq0>wqcwwla&yRN@2 zQq%mE2|v#zAV-m^$H&jX3bm4;zs$fY-;Z-}S;dZ#NZlT}p^emN!knAczOTnG%JSkk z|A+?%^YUvoF>QE@BU|3Pm)~jGB{bC9zPq-{;oK5hXb3ZW@IIO%o;QyDv9eZA_?rQk znXB&b7rb|Th!9>dd0xvjri%{&@4`R&*zCs5`FY;&=z$Wnqd0i}J^4UR&K7xX?1fFxrt-thxu;!ut(8A0@Cg0Gwo&DGPZrn19R!!}AVTd%Whec1P z`3HGM{%0HB6tTzvOR!aKM+iGP&fMJCw`i6%I2 z`MTvssobwJ<)W$m(l5|>G<9BC`e#Ikaw0dWlFw%4DgWs~TZ1;7k1hY6N?f>dRk2wmX@6>IkHgbmhx|2x#i$Hs1zE9q@-?&Elux-A^ zo{>j=;yEcP|K9aRt|I-eC~6$lQ_W~Mb0fG^*#x_SQ^n~ovxAeP%9=+F%0pXKrKCv1 zW<#ROhe><0fEb~C;Rlfb?RnnZi4V{CnP?J95D;Y0Mj8BM4F{qg`(4{Av692G57_TP zdGx4}`aWs|(&2Y;lT-1bH|bsG3ftk*5@VX2O3no%4UY#8y-oH`gzeC%iqsb!fG%x? zP4n42*h@WWVM#fg5w`J8x&~K@PCtP|Ny*VDWO|N(5BS^s$ykBPj;qUV{Ra z5#Odif|_6b$%=|kO}qZ!CHM-LP7E!dT2V{HEY?_zV+JDT0Pdar+hc=yKFWx+Vaii4 zN1tQ(p4QT7!&z*R9iYB`7o*<92BJg#N{lYWmdkuVoMZG>eFJ`cWAgskKxuE@a!_Lm znh@31e+9kYZw2aL&fMY=dvnS%*5-L=zb$aW?ug$~&Km){lHn_dQHBL)_m66qnoIR4 zu6Y!GUO_QJ5+P;~NBp`d4C`ij{ABDep=#+Br1)^g5&MwzsYcuCwB$~T_YpwSYrNf{ z6h7CAmF%AxwBHZv6N>as6HCLU+e@72&bI!P?AA|}B)0DwY0%9Up{046oYV;zECdq; zMzaY^UCsvF8ecY2Osp$(+1Ed?GXVQiJsKMzIdC zU=y9xWfbXSjF*$YQartryw}*gkUMg)EU14)oJ_?UQ%mGGhw`zL>St#jp9g0irezed zLwEhQkkLFQCnKcM-k`C`w3LbCHM8TK>*8&Hix7j^jJT(m@w`9cho*SDouC=5FPCB>qXLRF%+mOo!f~7~NU_oB zXramwqo7}Z>EC~pq3fziLveWFV&S6~zdB$4ITe_3iFF2MB|b#%U*;@~{<>WMajLd9 zQooO}e>(^h|16{)1g!?U79Uz1E*Pfj`5L6QBlB;vJ>m4ci1!qIQu>79+>rV;*9^*Q zJR>ji=uIY+M*3Y{G%0xofPYg!TYVJWcn0(}P+jICm*g4H+M5goQLHS!w{qEzLhnS~ z2qI?;6ocx=@vMm4+z-m|*G*BmR@Pq`2bHAjVJa*eAYV@W6*VPeyVVXyt(UJPbSk0n=_ z4z9iIo8a2V#9O*UixFeMwL=d8*Pj3SHqJAE8MUFm^oUK_{oocy5#1N^61vJ2;w+wW ze5@{Je$eCRBbtVW#^AycT&2 z|8B{<<|O~dPrg{mxBAH+us~tyX@2r|m3$S+;m80~F%25v>>E1`@Wu~i$loo^Zfg$* zxSC80`}j0_!Y1qBl|*?-A?bwy7)L;h@=qqe`?di!^xifF%d{cN92X^R{#r{*|B+`Q zLy8D2c-w!ST?bf+KbHww-WMz3Lksqx!In7Py4A|OeLjGB|DN=l?$BoX!@M<6&ij~0 ze&BsL@Iu#0s5w0r<_eDgr-j~$uo>R@*6$p2?)z0MH?2#w0)3gU8H 
z^NI3~u*D46W_QE^|0GGxa;j&=c}AYR#T(Mu;ureGr>gj-S1s4j4VLQe|oGb*Sq4lX6L0lpf4eaCGoy)T`b`1E<1uma%wbP780pYr|+J z%KH&zDRF=R@Nr6bH@_U7Pb^R0$ni*c=Cn8`ygnz~2aR@^=YX8>wK?J624QTi(@$i> zAy1N?my`F&obcM5@QWxt!*>3X6aRQlxHqTLVV-4~_?7AQoU-@i^nG?VoaxIU**I=i zW$UZRsXLyNXM9fGN9W`@HYa>~PI%XxI`7X3@0Jt(O-}yZa@ty-)7D<(@8-Nn=X@Ao zm*gA>jn?Z9-hJ-^8tm$#3)u0OWFaVM0MJ@*CGY<@+^YbQVr9Lslj|9C-L|F46{C`5XDe|DpPF~6k&sjGwYIpay|?FvuQ zcY@>kKi2$Y)v^W(D@m%xpTKr?X?WqZeW9rJ-xeo;P9pT27WmP5mpkVs6)+anL`g6WaJ zK7nt(JMBc!??mdBxoS=~!G2}YqbBo?`b1~RJ$L(mdPS?^S!d~I5{lTYuA?P8!Gq-5nEVhuW-6aPIw3}myo{0#O4==3qLt6a zF*vZy8Gco_(>8erJqM^HnlY(=L>-TRALn*9!gMv0YW84m@t*x)ETS5YJG7>k@y)|Z z#s99$<$qI!-`}feUivOmpHxX_?Genh7XYtUKq2;kKu@s6QTgQ@=;2*S00?LfvV<+( zIopIxxKr?kmw&JFZt?Dd_J`1kF2|iG%L#7?#Q}6V5uo@K)nf2IkdSvX*nn6+arR{L z;JRy;<6d3IninZ={>Qlb(%56k+B)ydTGdih=eQH*Imrog&4>Q4#8V|}=8`ar-+BBt zG8}EF=9*amV@;zr2T6C*J=H~!Dky3~0H0=9IV`nwda}5n#JdYrPF61Fv@j8Bv#r|eYALIsW=31sn z%k=H_fCUFzP0@Vl$Y;{CFvY4WtVS<=U#p>4yzE20+n;^;Qk(AEB|l#G)^%^01{g_Q z{ug@e2?i{Yb47IWSepV;qbVyyDJ3X_8In3V$KJt7?T)5KRu2zRdIgtgWbZ&h**nb5 z$b(R}WWxe_1vE~!AAidWvZhS*{$?DgIfu&-m=dyi4;ksQg#v1~sw8v54X1OcZyKMV z_VJ2!_bKWHy$J6g`40Z-rcKO?f3vx2A~1^vE3&w%>NvdN$xf{Qa=aU=0yd!iQfcif zRQAz{nuex8;Q;reMF2=yjEZ_vF!>3*X=6>TU}jPH1nIX9SV$+*6RA{BW1_N_ z@WfCFhvJ~)5Wr{m$W|T;W5!sb-D^?hF&V+`X{p+3BdFY33m2s=Vawze_BGwW9K(_B zfW>=i9c$(~Zs8~Bs?_OaP=nI`T+<{>xi>(Acd9DsgYe=q&wCP-6GZ`v5rw%G{RuU^ z{FD3`m*1E^nNz0B#ajUF9cv*zSJ$1}70tGah#J2i~xD1L`S6 zIEe)TPkoORNQnhvrr~~{(3j6j!(oe+j=HBy0yejbTu6)5KSW00YZ|qnS{R%B2R?2m z&BEm#>cduOGz2|{zv|NfGF?Pzv)B)lb=hcq4KnvIVqrT(FCucSH2x-_%T4oK2n?X zlDV{jr;N{cDvj&MGF#j)cn|QFkT9F~)%YA_gWKTEuuq@n3|}N-)0xujv097dyJ&fG zt%XY{>a7vi&QW8rhZMA^C9-~qCi`=eVtd*9@LcQzBb#282G`WH=W!y4g)hhfp-#xR&^rk* zB+omRM=x6bXrS_xSsqNmI^u`YP<(ga9ZLPHFicV8_n${UYrJVB)49H}HXJBgo`=2b;z zH>YEKXYCC@-&fCyR9rnTeK9J;NwWgLO@9*bpX`LB^tRve`Bqmpaqft^Vs^r1tEBIB zi|^xOq5S=`Qx4qAc8a3BQ-kzA1*9M2r;qi^S1En4(igB<0#7%1^|AVXtNfLI{(b!X zCCZ;qei+KSkXv}e+wWQ?%q{FmsyXF^5B7b>8PA9iyc~U{oT#j)o1d%N0pho~pZ|&t 
zV?$r6{gz0gTRgpxSv@Ir8Dz7hdJ}tHIAPO*wu{4wC<<WAFA;7AViTPx%j>CTEEB;zsNx<@=c0(yU{PNSpVGFoYn9g zu^}6tkrV!Q5Qb`||C|&5R8F`p8-|me$8^Jnrc1K1>34F%c{$8X^wg+!y<8x~K z%X^s|IkC58W7B`m2@lAr?}coA>9=#jJ#z9ya>8Ha)8DyOa7u-fwdhyD-f zj|!z%$>@(Bl{Ir${ZXq$f0h2|?gxd-yU`!b^7TiveErcwW-;Bp{^-%y1odVL-&ue3 zJ0<*g^haZttLAR>M-MOWq(8bqu224T{n0OAMx;Ob)Ym1n1G4CHhV}jL>5qyIv{BCz zw*I>MqZPX%|8Pyuocsq&NH_YUIk$!!?|0E3{g9FYW%YUsC53#$ zzo0+b&u`spftq|p`EK<`Z`^9V_h#ml&uXpU)c5uF#RW;EN_Q^0L z{|)_74LTm_j~sVzY=DMKe{?;!e!J5j#dH$*W%?sFka#_*ilQhBa-pCh@7ufW7r!Trqv*mW9FG?7x`da#;^`G?k zpVAl2c^rOh*fN7=e-OsMBP2Azdz!A}OZ7z?4H!awnOLx|uP-{)guc%DqD2x={|EI& zbDm(RJJA>Q16+617mc<4>`Y(u3~28EmcHl{$+Mkz#uSuU+8L`@zpt+^3Y4nH`1;ar z^hK)+0)3IZ4kbG2i(0-?U(~XLzUban!W^N#=oe;u2g-8`m$TDDebJryzP@OEXMNFf z(--~Mxg8xpfkdJ)LQ#Yi8R(4yh0#wq{`h+Oq6ZcT%)d%sbg~f=AM`uvi*Q`!Kj`tL z`l8?S49brBq9W8OA^Vci7hS0J%;<}FK^bCIr=!D+ zDgG8JqMI=hL42DhOA)DvLbB-&-6m1g9Wj#>?}3HLx7CSDrs3GZ1oBW%)PkPKSjb4j z1tNvvp-s1nqLmCWM^SW9t9=d<=0I;W!9I7cDB3t*Yv}73{;=4>B^32;7kkcD*awBA z6h-eoW@)ALr7ngtM5Sn@lDKdGW-wU`}>t=^&tVz z73f25kf~ITKIAItJu>gvfu1fvA9BIG^c<;Kq~Az>4Br*%Kg`HwL>-1S-er#l`VT%b z`j1_F{fE*|4buC3LVBqG@XJ>zeQ=O|m(l}$$Kif@nbP}_4g-{O=jkb_&eJE=R< zciiw)sPEVX7BSR!Y+o(rY{V9{Dc&LCGTh>nk^_CmqiIXNTgkTqF*>Jki~D^_j;nm_ z#B4uDqjLOIFy$6MsN_K3aiyR9Vo@oSvCwyA!4iC`(|U)ALoR> zkrQsq2_KPTYqT*NpHt(*FJ=0e6MJ_yHvM2u_<)@H-pa)Ku-fwV9S&E%Lcb_Bx9-fjs1Av$GXG}Fv)jS$=MQdLp4nf@|5e`(4q8#; z_LZ}iiP-T<|C!sKtaRFC0>mSAdJ?p&Ay9P(%I2M#0p0SDK-Ayg>W^-oCq5{0Qior& z9nNolO~66Ya_=otyg~-+kdNnktF`M-4(0-d&=pHl}TTYp5uGyazZM~`E~Jw zU-ZbUJ+D1+NIuW3a-yC39+YsV?00|#HNefJiNBA>$ygVaNDdxhA!GgUg)HlDDv&ZAr)AYbobLbxt24AV; z()U(c(4axRIbAhw;{awlX`x*IpQy06Fd&=E*$>{tK7amR}uq#n!l^4If z2hnz<`j*G?x!EyP^X4pB{^`JTe!1+^f%4e3^nHAEt)3w~|M ztv#Fyq}BDY8s^l$IwLgpsja4Nk1ovndMB40MZvFMDbGoUSoOueXPlrF+DkL-dT-x0G z+{S9SyUvvwT)9(vxswaipF?zl@80g@Ug`Jwc3ck5@Ev2VfE>K_4`k*eeHLvjfWyjL z#O&s!|AWs(^gd6X*7MT$P<)Y67b`A}TjNgP2%Z&ufPZ&zfjyNi>$|_6x-ZQ;S$ysz zPWc$tv@8c-DK*0m8U9B7JPU2@JdGP^ytU0*> 
zid5Z!Hzi)SF3YPu0_4eFddvLWZ1U;kz|1)=er*$SS>r*kswV*7Q5T=JDg2rVmBHyp zODwIzO!h_8hBeDB`7g;2c%?+sa;m3}EKQ37`}nNpaU zgr%;elnCf0pyTXXh4`}ibm>ckx)?kDaZ?2c4rEmv9phq*n|#-YY$orqnT2u$mZaC?*P@w<=Lik*~}FxA9j+xnai`k{M* zd1_4c54y)1AU;MGwhm1`8JYj|6}u`tC~|$N5Fx>1BG(^IaPX^*>l&XKyjoG;xuPVR zT&BZO9AmscxH|DxKCgJ!G_D(+e5G-XpXaFRN)Q;+J8JW;M^7YwE|8Nods`8$V zvGPLx?=dNPT=`(_xYnap^HSk+YJRgcx->m(@ap8}N*Xe{adqPhm&asGcc1>%$*q(4Et!<)IO0mzyGx6np?~=2<-Wo~bmH0%i!k9w5c!6#FboDr zX4S8*1|`3E|$^$PJ5RP{Uz z++K~3%)*@coj>bgK~3m)euCxM%ja*TK?&beXu5-JzTf#8>bg2lV&cj{nHLqt(InA8 zTJ3twbUd4vmYP=k#TDJ|Ok0s&R5*J=I!;|iM~&EI@vRSE2K#by<>(RFL)}Hk>yF39tjJX9{ z#Km9t!CU~EE!Q{LayOLUq8+t5TDcW_9)i)wG~4`vCq8TY#w8smfF853{LBG3wTgO7MvLNUrZZRqDSkQ1Ve@o^;c$4osuIYy0_l34ng%jH)6kJt4 z1uhh_*i;%Q3k&yC0(bFesYXn{6u-)G>&xqiK*7FLK2jKNi+=%Cu{Kpdq8rq^;3JB= zdbGf-(?M6>MuW`4ODrY0QkELWGE||wlL1ufdmebPKY_yQl*vlg`&Wp^!Jx#F>4oF0 zouLo-x`WU8@oE{F&9^*IX$slk{+!nC&P7(uvT5Q!_UVLI!%_(pfSR^GVE{$W2Jb0^ zEkVqm4EO__uOa53`^^e4XheSII#bVDIRet{2Z3om_TWWd4hwr?Q;Y zj)7neZ4MosT$(|z=1RgxuivHLHG?-+H$FW&+1B`ykBz@moqSEmaQQ+~L56Lh*|+lt zZyeqDS~k;BAj2PFKSw_1Bays~+E3UQ26A~+cRoleX6ROV@ zst_L$;#$Ka@%FDtE}Nu4$)xJU)*~V_#C?qBUfhwuZ^)Op#V>O?E5y4s zc7H$5!1L0JJldctCTXPhb_as`iFka7%;#yI$o*zr>*T}ufdnVbI(u-F z1v2;xu)KwRlKFOvtI<3yyjTZk?$Fuq7(H4{l=p`}NuLLI1t@on8<)O@S)HZ6j>xP1 zPyHU3S9_LzkI$>cg+Bz(`z~yehWZ_{YO{REwo6y=k(pm_Iy)+Tc9Tb=!|24_jUW6R`>D8gl#|*6)xxuBVf(G|!(>yzQ%KHuW7s@K_#OxVRp|chacbIT`^vgAM(EO?r)&RohR9VU0me7 z^hl_*T}j-+p{xe58GYDb^A5=lwV)4gE|VeL3>WJoX$AHXrGiJ1TY(TOR~gm$=y#$z zfT7lJ4jr+0{+z-9%*=9*f;qFnuJ(t zR7tdAt1eG1-D>ZT%$Ti2Jmo}_|9}{Ct+@cU!Kv7WPoDUm0xF;18E!>MQPv7vS+y`| zev_X^-FP8L#si*qbg?N1n|RiAjL}If1J;^rUzIkQebemidmB)jzj0DDa>>fE`R|JV zdODW8u*7(;em%2;&B*`RdfIDmY?}`T+ZUP1s}1prjbDrjqtEcawY#-l7}C)I-p{HN zxPN5iVR+&EcYEYD*AUU_t1MssD)h7xMLbP%t z_}~&BAC&SHKTn82!1WM*LhybC7a@wN~XD8u<(@WA~69w_zkfEv0z2M<(r zg9rLBg3S1H_IpvAJ1G}G^o1sL!4I9*|57BwjkS@sqq6(o!GBBYYEb?Js1!GheCo1T z+6d8E_qX6!M8@?xQZACfHFz@@0l>$>D|~yijItSL;PLYHQ4|>-MRC19y1Z3^SmlQJ 
z?q^GA1aYO0r#glLy~~|{k$jZAU)c6EJACQ$t_6{wz#|v!Q}@!5M`Horibs`jSw-5- zEd1*|p({$g=Wh`)IE{f^fhv4dnRMKK@hg|My^4pWcJuo^S?@p#!n0ycoKR znpkl?6h+#{uSNkw-rnr-%b9Ty3KEk zn9zKL>tOg{vR1n~xiYf&gY=bl10)Z<@TV`q+XZ^zebyUqm&cD+!4i8b7_Y{!{g<=f zQOtGOc!eF=-RE4lUAu4Uw5~g^`SKxxV_WNl9xY?dci3(}5~$jC>w+)-eACY6^NPE2 z@dX_a+(l+A*N)?!^ir>^_9!2ksxZ{>;og7{kXSYKQbAMlFdqaHO$F)yG(m{@iI*38 z-{@R|0&qG?KV53xBqKX4p*}Vass) z1p8o8wjVvxcQN!%{kXJi{eo}p1Mh7nG@z%fubrnrNqv3gep&pRtB)L{>b^uD*<@?< z_4JYN{Mv`FfIMXAWd`2|1g12WxQ~>r_){wwNJfMGv$4cAQuo{CDLx za|DhU&wH{*UR!@(UV}yL&9TKw^QpNfH*{-SIr5$8xV1&cb=0<++f2jn_oz9?O_b`V za%EfspjQLgjHUjx)zJ7o?fj^X19hH`>?fy7Z!5d;QsQw`-8)Xoa!Ovn^X@}=%3;b# z@4a6MK<-(k>Y(t&ZG%F6Kn(RC^j$1^cl7|4e^i=|A*uB^_t=tbH}Od;W+0h z?2mK%EUR;tVp8`s@}oKrvUv{XO@7O|&YQA^^+|sd)OR2CIjP6%EH%UZIPUa`-rIJu zbvhhVM;srch09-s=i~kI`ahC6_rhNMq8-uBFI8PGm6v?a_~>h-;WV~l`X~Go(4{b* z7g{>fJpR06{5r+!ST)Gs3B}bOtf8i=O0&&%v>JT+d}3tjn7(NQ^A2w`tC53$w|VQd zxssa}gQ4>#q;LPFlIf?29)hd=?TbM}1M`m*ffwEf>U+ zZ|f+sO_N_wCBR{-hP&X>0gl26vt+&qvsD!DCFgtHagbYfSC7gniRoR?m&)Vfq z%cvq%(uP;epgtPYzKU?!7*()21OC3NO^OmgV(bdU*s(lhM)2ve$sd^{%k@(Bd&mWw z+>YEjX&$`7A|NR{TLC?(J{@@c)7qWE3C?_noQ?YMY6Yo`WLJU*QA+jrcnOT8@{Vel~F3idzyVcQGZ4|a@a zTLN_wn?H)8ZL(?MBo0nCMcai`gn)qmQ2awS%%KIlW~}X3=p9c7rTQME90R3SJ2Dsl z2j^~Q17(0*aPIakDwOJgb2pV?XeE*@hXv5}#m`kG?=Gn`=+QW+QD=|>IL&{Z#Iud zS2YTeUCe#Bjgk6K=%6gPhX5A7^55ZT{|qDCLzMd@Z7{f66lfc~!vQ#!=NgEYRKw#C zf2%<1nG?rxu$9Y$$@jd0j6#! 
zyg5jj;$#jraTZ`_ngy7<72c=xpwJYr&SWfquo)w`R7qL4)#kO`;lp_g zS5L?sjD9d@Sn6AHZ{E=2sLn~^+_P}gi;`jiEzx>g_rsBP{pick7$oO;wqB8P3(Q|N!HD{jNGGVJJkT?bUnD(YlimkalWyEUXn9JT1c$cdK)!n zu~A4L!}S)sX10=u%Og9j`{cn*8B?^oq=ZV%6kC0lS)v`vPaWr#Lzq5-n5)(ALr|i= z*`sj+*ziq@xEkHlOr2-m**DT~KBi`O@)VHlBsbg1WkICAKapnNSC22WcQifV*v(dk zA}LZ~WU8WxZH1AUv-rYVZ;H>xFuLuuu-9?d?s0Tyx+;~x2KkZNEsS060R*c{+4$LNJ4$PQH4J@%@nonyDPD6eP z?yiiJ86K~ofmo^xhB8vWg<7O3zKZFrKZ%5>`$Tywleh2(Hbq0%Gy_db>4BDF4#I$D zHD@Ufpp44itigvZ{DF_Kh1r=F$|ChY4q9ke3lsbnE>a8U`7P8R0+umDLM`Hcd~{kw z-W#Wp^QxFo7LoS|S)d=Wh?~6MgP4RU*?P_1<)-Vnh!19mqf-2KI~DI6{?W@qJf8JO z^;S4u_E6D^jltROo(T}0bbme`H2cG!nwAOvq-HQrx5}w@nf7UwV_tOJ!l5ttI~5dS zJGInHaBIfjsnw({JXN6R4t;H%?bMfu^74VUS(r#JU8%U@kNF8rB+s>|5##vro@~h7 za>8CdaSZVdUU|R|hQUq@F_^(qUiu#EbsbG>u|)fuyn;k)e)>0zE|BNk;>YOI!j=Ac z{ktoye+Q_4A2AQ;19~rJhV|4!x`XO$ZyC@`u*rRO=&2dhYxlp11pSq@^?_efW?q+upk?j- zy;z0!754tip2D*K@*VB{M=N$(#@@e}F$DJhbc&Jf#{rQ^`;0JSMj#B6e(PF>3BTJ( zu9_vc?~aSx_aS@muERds%R05z@kp}M6@c{RmZn<(0}1UoU%_#C;sZ#_REi&0z9W6{ z?f&r_TQ6VYwni~`k|RI@b~fm@%1ISI4uV$L@qXn4Yy+=XgHOP$G4~>68DGYSBRj_- zlV)LJ6oZN5kmEw;`N_!4#|`_OPR8V6%PeDRFSG-yQM$xu1aB?Iw}Bok9RfZ`okK(@ z-~t3@MS6m<{v@!E^le!E@fxp}5PymP=0Uc(@D&jBDK(j6KXBCTv|Ekv9Cgnq68@ge zch)ngx<8hNu*ltl3pnqQV3(XjE=eyir(De(n<^;R^-Nd(IoHK7TcJ0Q0NjAq0Z!3W z#aQkOam210nguRy!-9{#kj#Fdv##TNXsXxt->1rMM zSL;Hj<%<`MP4-`_JXD5OVHcm9xC2nh9P!FM~$*Tgr8!p2->D?mrYl#W6o0_iBTXT)tMKsOf8(S@x&Z=?=a{7SX z?fv1L0exinC>&5)L=gDU;>@%vveO!=|7$S#GJk%}n;T%Y7P}FeFAEtHCO&;VS+gvg z&6B;8r&)d-pc9Oy520)E5SRq94%61*nGPwq`0QhQ!hgLj zT;&VsplMfQB<5;YBlU+UyX(Kg?zs#rn!3&2PK&03cbGuG`-k`nvkOj#=m`) z3KOUOJu&L~?}` zQ`}ac<5VuQLwKz*Y7*wstM3!zXEh|w)%_o$A-rtL#7E89r zl5fViCSK7R%Wul?Ggg;NS<0mbJC)lb^)J(^ebQKB8Di1WGADU?JN|n-em%@6Yq>Yk zgfjs$A4V6(R&0spw*+t?LrV8#>Z3cmLviC5Sdg9;HIBMYqpopY41b_cOSjvk zf8M}MX}L7yoB`Cz&`m0pSOxb@r-}o+RF*LpmzefBv0`8+;fQmGOLZK18tMVRnMgmH zp@}Fi56pjS+iwIfczlRDvVd6e6g>M{?e>XB3)!T_pU z3wc_h?G_l=AN(8!_G5N)VBdO{6fHia#oQlFJi~GC*J)sQB-x3R!uSLIGvTa 
zzwrn;$y25{$=@un$WtcL)U-&ynT_^2^A`W-PUUNv*~AYWhLoHFH!@(0^dc=YR}rH> zf@}RyUdzn&7OERbWNQ%VB=47IhPHIBN*ijOpRN3`#p39Hq2JHN_h>$0AdKGk%ADb+ zmGQE6Vi~9UrvY?L;Gdo zN2l)5m}05NZG`TCQ|H}=Af6UXLn8_SiMZ-W+VTB95cb0d<2;W&@8aB$qw9}GO8 zguHE-*7;>Cw?Rk`r^#6A9-A5ev{&aizC|z$QDB}Wp`(q579MZc30EINQJu~e&m$t}kDcqlr{&l-26$!P z&u!oPJ_NYHenpvxSh=ciK@#HG1OJeX!#3NnEu%^m%V(K>&+b&Co!7=VF-)CR8XG>Q zEIx*p6XSiJ3q{@F8GMbdn0ywLxq89&MF$agYZBGULui}Y6 zEK<{lMRU2=(7w3q`m@H|j`$*gfp5u0f}Kq3g{k#VAk!Yyp12e{cI_{qy5~5&ZCwU`3*M?Zht9R%JvP$t88Z=617p~j+hP@H>Ucs)6r1DHKat3B z700E4{ca&Wi&ZSwN$DN}@;lhaF)5%q`2=T+s=}m=j8(L7t{ThV#Jo7J1094;Cs9Q2 zf{}~pAiYq>oV8q3%3tdwKaTu*X{_Qci9rp9ZKV4V%3C6+ zgt-xiTCj&a2!FF0>qw(?oFXL0K|MJ`ew%4Utx7$$qLu52C@tNbK>G)C3ju4Y}BpCYbzgmpmuJqq&vLhZY}wCW^0 z9j2>bi=Fv2>%?NGe98TnbmdFz)L!=YSd(jiEZS=9L%=`uFZX@|GvlP>6S~>BhH0gC z3*LDwJE^N+2$sgMABrx0w`Uu7Bt|&bvtF(2al@b4rOM?$ZLvU=ES7)x-`!$4Ig1t5 z!OTD5Mp>s%SgpvfmqjZ!Mdm-Jqnn1evNnU!2*pgE(wXL2g-qF0s7*Fqdp5*apxU;#EO zi7voe{55+`VPke7~ss;n9LW3T3FWMoWeMBl#aH%drI zE{_+#qeaxCe;(3InS?Jt^#Te1%(H@tiY3ogIMpDgsZhet(Q_}io}`02dHj${e}oAn zE{q?-(8Iy2P~fr**{z*CZbal?!p|B8CfTqIsedbnGjo1;z2-SLY0Qe&A1bctoY|i; z&ucDao?FlBM=yg(9@U>-eg}GaD6Q$_-{o7~sih?MEu@g_hHBCEqmZxLB>zJ0#Gt8zpfV9qaXqdosuHPUS){j>>^?WS7t;i$DGN%Vq(_y^U%H{y+bL|9CoN z6zLDPYW_?+BWXn3<)CZ3&{DAbL-%EaZbP_)TDfRk>=@A7qzg!w;Eldl80Lq*3UM1j zpLM)xiw;CgZq4fhyXMQ4!OoMO&VVtFoAk;e3G=KMcQUQxU+qu{@gtnE{DJZG#NHZ_ z%La)X#*aiVb;fY`1w0L2CVd;Wq)4ety*j`B39tGo98Ro~=JZ78kI*a0z_396JC63D zq&lVfRe$$sIobzN40hK0VT0)@VZ(KgM7l;XhZ&;l3+;`ZaonY(E18S*e12?co}Uq^ z3?BRfF(tqQoy3!FsIW9FJK<-Cz|UTC{OrP+AVT@s|NmG|{?r}U)73odzn;Df_4v5; zbPVtRr|W49K_&iuEax13ELcy>m;MI#r%#gSBqF%=gnu9s+>r$<2vHyu3QQu1SHBuc z1nXH6MxC~DahKRXENGjFZZYQ$5g9i8P%WDMEVCvO!MpM53?i3waAd9((4^Bh>y*DN zJR^d5_eR@c^o8vPWUut}&9I~8ua+niL%81fSgdG8F)Ya)G*h?kf|oh*CSLrSQAGVi zxSkA0J6Qxn&&=_aj}QPtx$<88EcFL{gRj6+RvqVZ6_sygC@r=>$^@ zH;V5w`gQ|;NPZopwBLN>GE`tMBpX!0d{o6#0l#=IUj4ea{9PpK6_Etd+W`9E#r(iu z-*OW53L>X)>y%LavP_2B`zhB~I%+YH!6RK)^NoTI)0d1uJwZMEI`&s7Ic#8H? 
z4mMT)F*XC@b#FxyC#$JspAaJXa=dy=c0amqcI;h|t@%#cgeRGfIh4|Z%w6ZVRdK8= z`pt9-P_7L9ru_k?%434RTAqC%zh6Np<9N8x>L@4Qw|Q`UR57o*F9#p?=C-pJo|Cf| zVk=a${p^L$w?E(&e`q?9X8vh6^KZ9tj>-k)m>L}Qd)puW?O@{L9QZBjH@G=@9w^vP zs&M*1Xg74t3~lR7$> zJzPy*&3wSEJ}_G{H6r)*dXY)JGWmtIWp9H$o_;>)^DdObq{w8i|Jl<;Gt6Bd8m|47 zaXj%)G?J^$Fzn}SKDNCA4`Fa5Fd51F?9=Z1RsO4@DSS2PJBE_XtGx+%^x*B~)BeH> z!~yw~3;dU8enUHk1#C;JnD{qHwHw6Gsd)|FRgT$b}^G)Ki{pf%STumAvq1 zHsjl$6^^YIw*5oWmdH1VCJjA@Xn$%A5Un{mOL&G=pg)qi^%SdHNqp^X^YeXrAkuXN z$dDnGdjY?VEeq)oSz^7My?HD&g{&{m-fS8Q;#5O<=7vzd1#pdj#O9ljzrQ<%?rUY4 zUvd+)pIn*w5-sKI&XBLr?p%UVIy)cydu|5!ckvkvsKB1Ell(!W^6v}xyB3^ACqE`0 zr{-H4#D5o$a~yR!`NYCCbM(UTw?Yz1WS2y4&-vdCuZ-8d?y{=ImI*ER>-O$u#|9O9V8{VQ(V>9nje@DFN@Kf*9D3$4i2mRyH93cw0M&U!>SG+9h;Gi3~dd`I6SOgS`5U z$fYHf{6m5kL(5oll7)&uM4`!p1iu_q1R{T8w_81h9>N?XB;t*fvbkV$F`%DX{+6MG zF>S$_hcUICzI}Lp#E#c8QlYXHnSB|I$y3JrH@iF>nF02+gXWMA(6_hw?JvU#Ryd?KR6d&g%6;8);mnIpD4v>ZDdsG?6A1Y7 z%I3BVjHOPf06pY?*}}!56QZ%+NiqwNzq2K31afr)>>=?(+NN+7<*L}feEdeOr4y=n zSsakjvni1yqw0N`U)(2$i=&OFsJplnKifVs^E*|aBXps{H7-}W!GPmx>oJ_?ByG41sRg0i|=Yo8%+VIsK@ zg~HZk!Gg5)q;6Qt_2PxQ(Q-DS{?(fL7MI7~ag}A_RC>k`Vaq(otXCfegrHp6wue=O zw?H9tuHL4{e@1Hc4_G(BepM*ABQJZ&*8<_gl1`AHQ-O8yR76&@>VN9cTdkPHa;<`{ zw*wXK5f~aqsnznr`%uS$U$vsS`X6Rymp{5G)j!Y|;aUkrXLPDPj!GZ0f211BU`>~q z2MgzHLK4rbtQ2m;bRxs{izolyoP1yR@w)rkkI!V7Hj|%NLzIXKBkgF`HtLyKW+ACH zHw2_vaqgT-o}Rufd4YuJwlNZhWj<#4{~bRQ;eb+u!rnODHO4%)|4z<1WxTM62rzE2 zP^v1h0Y&F%%5skyH*ejxuzd1|RqOL3G6>WHfGcyb6;fFXPy zNAOh@xej%m2a*PKnNFfxX7KZ(zF74GR_SjW*e7qy)UEb(K6+dCnB6~EqQ zob}uPEdk%GX-_$I@;*uVvQIzg)X9JPtBwn#d?AZy#vpMRm_=@d=C9RU-eKyli@DZ+JrTo|#7jKPRcFrS$0|%o!^(=jdg6{Da_z|D7qheQA@w zwIy%qR$BUrAKsX5|4FSH=4x3jBhN_&x@ZtAJWxkBhB(Rw$uY;QKL2>6Dv2a4OEUK& zX0oaJlK&utqSr;`^fnD)ewjYjK#@}C^9ys^=Xf!az(J{EWZolhhscd9*$m$YH_Th*o+W>p? 
z@7mcay{xApevJN5W=Qh>qhyR?+XY{tmPRAZ+44YM=vPJn>8dH)Ew0L&Jpbon&W=w>N2Y1I}T7s7A>c}3h81NaT#PNN*-Szi5?mEYAXj)xLJ`jlzWbN5l!7gLWq8mQaPf-BRo&j`vnwl%PFwWpl8wk~x%;#$)X*^&^1OeVuDFyd3ckhr!F&2H|jvb>B<@TUgs0;O~5 zV5IAdyb8s&I9+d0lm5h`S-!xypo89*^6AR9P&tyeMXL9aRs?=YBP@P|(Oi76`Vd!QFy!LN-UQ0i&rGGXO%@ScEe@H!<_KM7JxD4ciPUXvZAe5_taot+U z)tmDZ)laTwqA=eS%GIY|tJ#RtF`Fpl>fA&A=(`^d~xtzYrUjge*pJs_^W~N^Czexr0x=N(l`Aif#tz={YPqHzI`p?D>saw$< z0Lird)NJU$hfZ08bu!s|QHA-->O0`aOi7GJnufkiDUev$&kI|wSBsye#q2!cQx12` zB+zv{yT6%UWC!o=tqwb8+deR%G5*itic;8X*v3hZgFw?^q!s_i;_1bt!BG5u`n1~2 zdPVdBC?}&sDx4t+1mrpouFO1oANEZbXnB(m+?@RR3DWmQ2xTFCZ~Vzl==(?t z<>~vm&-^dxJ3xoNzY?#0rR_h^_t8ZDM!NQ<*&XQn&2%u*wI8oSvXQ6nXL$c5eLrgF z^u1Ew??B%nNtk=<&p>`m;PHnyT5iHZ<2cHh_WSa5UAEwmq#KQQguQX@AV=A`J)sQ# zJSm{$l40)eJszOKJA9zaS{WwsIu$lrEj8&w4 z@61ggSg>xy6g_+v%x?1Wd{}0a9m#v=Rho<#J=Oyb9+@q7}3gK77w4t!~}r z!~=CB$A?|NyNG+flFk}^hfs>jn#H2}Y5!tsh}UUKy5@ug9w3MnqK4XrrU?B0j%-(p zH0CKh#K0*0SSo?S+e@S!mBrV8XCyv=SJ(rG3|&J-=o8p^wxjdSqVvn3^Zooh{Dnvl zA#I1i_xRT!Q^$(29xG%qJ)<`Bgwb{jt2CNzv|YChtN>P-AN`lKec8v-_7jY@YZe@B z?>8Tr4dPQDL)*zv@swzlS!bc|*(uQX&e%@r`*nmui)5bxOZA@-k-t(;#C+vS^=EGK z^nJE;MhgFox(eyLVs(^$Lgv-j(r^iq)+9$Y$!?Mh6vkWmtw6u7$mu5z-=9*Pa6LIu zXZ6MR-4Y+u7q9MXdjZdA?1N8XOo^vwTi#>5r15RW=vH6)%OBx9eVEIByu>}0tT|GT z#~6P5{sab*&j()U!8)ISS2S($PkyqCx2?4}-@H8N^xZKJyG|3|D#L&|4G$!Qv^hNl zwdCMqs{Jj?DMx@%Tw4V=V8Pui?o_9UmXU`Yj(#Sp*uGwRR=NKs zeFU^iI{>49C^vHG>pZKX*p&Kwe^l22{p-3%b;Z~hPP5Zbh~@6FUY54_*qKGy>%HWf z?2GZ_x68AnTXy*+E#%-M>>DS8`2P$0M*C6pcn7V$k5%0JvG$Ge;Fl~8*W4swCeYgF z>IQJ_+`jP(fb(By>2EXw(CzFS4;oxI`unQ^vv2HU?e_Vr=xNTrkr&l0_6?1+0Uz;x zHV$^8IN2>H?Q>z9H&7u>!K~Riv(qTK6zscJ9zNcfwP>+u@UQ-GNPGy6K(spK0#8l zTij31mX&%VRrw!$MsUWS)2u~~G?X_XRUPH{ku1k1E0NIVyvz=`3T?kN}pH&#PrTsnatL0}TrU=9J z*%5jBC(f?g6@5g->dHA!5Hv(H-{_BFo5_RCM%lBEBz_5awb%M52wg0Bimn^MO&u@f zZ{%45-IA;lur#8|>&HKuWEv6&A&h93X_w8Nlgog2~A;4jPpKGL;8Gf$Eux>BgS7(V7kGeInY z8H;Le)oYnB5_fWe_pBHND4}hz_mIdnmPLAl)>0m?A3N9v^L?6;@#a$y%SmO$_RYbOp+NOd3;f(2#gWlA{Bjq?tr4q5xM()zNPzbh<_?0fzuqf`1M;p 
zWUZcD*+O8{m%pVWd|tOq8_yW3`_~bX=+c;+&dVjs1_PM%*xhg_0lry{ys8hTO{rd%1|0!ll{Nl)XiDx2sOk zBmb9J*t5@lWWAScUoYG3OtCQ6)qN5LGRO1C$>kbE!LICOn=aI1fG3w1(Az|%w(Q4r zcU<}nXyfu-k&+iqn!-^+c$#BT^9N6MOv~#{RCWUm{o&qjx~`EXPDW2u-o{fzqbhH4 zPQ{0C%=rz~S@hi)T>!T~m`KDiDnj03cJ*RbKRSf#A)^crwDRXR9?Ih zZEC=fkNmE)_Bv&!dAjWzlHq%stZPbx0{XJrw)wY6n@+ovpLD|YKgMfbnu;hBnfEi{ zS}I|uJyF=|q(181>OX%!bgb^__TQTY1$|^;(;!(;9z(`R-cq@kmbArqb3d%0 zIe2dFag|>*kF+uOT^XAgX@pWvfwh(wLjtkg;kjqQv8|zC{z(#5+~}q8tHl#GM>ve0Z=qBRUOTL zRh6sy5?5Hgw?Z}fLbZie8$`Nfo}p?1)k-(yOM8W-4X|AsTxt0$+88xk>PSoet0}=H zDWxbZfL^H5%Ub!zFM_~vyqVwt*=p5-ovsDi!AlPu6|9I^Fn7#5^m($pzegtCY%*E< zm*J7Qn|4j$^GBB9o83@Ey#k%85j2xtg;mpR)wJwb4_?0YBd%9PLsgMGE(ke*dIU_> zBlsr0Dy!$WuAb-){b?b%d&G%5tw-=yJ(_}~S8Mf*clFflSWgZ0M0Q$_WJ$9ftxoMBY9qJzN|j!Sc+dNf^04|3enVs(9&x^}3ip#`OEwyFR=-KvMv zc7TuhG)?zXx7e(SfMFqEdvywPsBOaO5V60Dpe@j(f*c+E>Mv3~gnh($MkxHZVXlgb z3J$VT4G0)Dmk)pjo`+Xy@~o%8o@*C72qBgvlfK39g*?8IVMkSnU(B~w z0^(203)C!s>P5F*frC%}Oq`770va9}HbGf{X@EFo+tT&#qKJ6sbpJ10YQ@~nBXxW zv5?#Jh}Zhn9~%!9@Z@PE5WqNdTxAPk|J+XzNHE|PB_}ixP^}JV(Y5j`IyFtdd{Ubu zZ6aRwvj3QySeg>L#!Cia>@EJmCXFDZYOdGwNt_fV8>~8ZH($wlK1CUungu9r!?E8R zFWh>`iQ0T<22hs2jL5D#9Ert(&Yr@cL&P+=c3FKP9MmZs{7qL>dHi(eC&531>tuh2bp868~To# zL}>V2bjQY5Rye-;5#PN;(VfIYCm!U5^X>L3GC${r{mA+eF?%n+&HU@K%$KbaY0%ZY z$Nq$YF8FpzW)?TWJk^ib#ns;sNlea{@iN0yM)hZATKhGT#54H@YBPTb+rQ7fk46#~ zxR7{}yKdm8KP{1Hcu}E?%-7$}t$)&U>^%o{;x9l1g=H1jC@g&!pib<4 z@CSC9DdP1n z@tcxY9d?Xsz5*0#@+y`8g1WzBm(0y4Z#@h1&1ee>-N6iDmu=D_il?SRnb;NILG{q` z8Pk2`qsZ*%fbYZ5` zOR>WYqg&+_`*62zEs<-fE`sbQ8-S7mToPN#mHxV=UPQAMh5G_Iz2eC>@=6rw6`ON< z#c|>qeAytS;({>(j+Bbck$G2&)k_`QUZeQ&QeniC?SEEI7HxAk6KWEq5CeJCHQun% z&k>W%?gs7)@f~WifJYL$Z{QbM_c#2oAQtljjQH+q!x$c|;Z|fzg}eFXkywH&70}Gf z>oFoB`Ci5gwX>OD{h92i2O?1VqhY;M*$#ervy~r_BQ8UVMj@5gqO9Ugk>3LbA)Kjm zq3{@$idQ3%Mr1LR9Pfu6?jSibiRF-vTQ5MM86pXZlrBh)PZoE>J74}C{qIxtRg>oa3%ty$LlY{VI-mB4CTGvl#jIylm z*S>s<=DUXe2I6A8vcbO}SwTjD{<31pg@LRX<1?cFPF9NZTTJ#hNs&D+sw!7iah~7C zZ!9Ti9-McO7FkaEr!^-R>0-J{`%BWHgve^Ny++S`qJ+J=a6;nMl+t>Q`d6xa&VQOS 
zbFl68`EMQyQRXnbJ}1v0L)m%^cR9NyP0gpqcoN@)S8~>fLTPGG=`|lCPhHFBG$naz z;TJ_Poaj&>Pn~Tu=j167&5xC*Y+KOesrZ-;N>~B`Z1oRV5(rZ>BusVAFNmjS^sQb< z_7I}hcw-2q-0Z%4^DbCI6D&KVgFvo=4i{x^79Y;TtA51ZZJ|7|2%Bk6p3nhl#_!*m zq4&rWE&11FnUUPcZd*U%3EpEb#)_AdCy27m%M*9m_btd1aHOz|mw6|w|L5*K@QYaPsO4_Lf^EAt@E2)xRHE+awW#h|HNW2fI^p4 z2@?XYJBk7OOCX%C;Z^>*Tx*l1Li%&+kV-4*U#M@Wb2@dYk7~uPfty*Hu!KfI{S;936^|=%_bbIKO}(ken#vi zH^-8Xl3IZfGnv4`2r>z51CXJ8&Hp@kB5J8j&g64kt$_YOg9s3`ZlK+ zCtt%fnEHzH-+Ei#h^0#&ReUYG8};U6In0us;DG-&mV9l3$UM$grOVBE(|(%MQ!tpH ziE62sMLSb@J|#h>IxpFbR7ub4;D<*ghpdc|hjlLNNxvQZ#?#-a2~+JgC!fol!16ac z>GJIOwo<j_vq5mIdnJ-sWz=kTLu2wG@^2_~;Z1J~u^ba>k!4=VtW>m)R=6<{)M;HK8HMj*FX-8kkl#5y!T8avFlK zAKS9oOP82o+x+QS8DWRj+b`sM+t$8APxUbqit&CzE2$*e7nnJPkiQ`E8G7n}C_Os4 zPTM4V7<7)pUl^-@toT7Pwb+hRr9Ed}2k|7Htl{L!mg45*mh5K?pNX=p@Hax_muiTz zaD1gE=9>ouYf(}_G8ey?z$Mg%Y>rz z*k-QLtj<=tjvKz9j*=H|E_5sVJEuRo)4Kgq&@posIP0aAA~QX+)<078T6p7anbDA+ z6lTN%-2k!|PtNcFlx7C<*kIfLGmyIvd5EMbycwKX%4U?m1_%Y{o!uw6FgRWh)62o} zbDt04IG4v-0_CiD6^C}bXs#FT)b3+BKxQkmRIq{4v<8QKjy8`ZHUXi|t-HDS^v%jJQ?PezL+zgM4r*P$EHtRcOs)!^GMS6@SiL>g) zrbp*fhq!sm2KaowqrjMl|B89cWy-xuvCvXe%9ZA0Z!=@sPR$noDyE7Zm~x#MwNb3{ z$4rA|jS(G-r}pEB?_ZxK(cwJ$AwP{>*=PkF*lFEQj#4<(Y>6kWJN^h7loto2W(u@r zecPB&u1U|R81t~!&F9o!dPbG?K@W!MLF%NkSpCVBwe9Pu99LViNV0tyc0Nci$mxeR zmLSdKWBCHm6kElXoIE}B+%ph`IxkeF$ugZ*ccBtZ{^4_gzw=U$)RTZ-=hyvAPa;bv z?Mt{Hkwd4JD=|VoS#o%U-2tq}f;^LrEF)V_wOsVeGwG_CR4d87TPiTFT|xbqmJ_0= zWz0-YgdPNmk%P3z{ytHm#-%l4TfymdyYgr|MA%@TpSd2WEp6pGj}VzBM~Q7TfPaJC zJ4!gufM4L*n$&*JMk{a zpAVb9MhHmK(I{&{0oVT`*e3H2h>64h!p7S_Ep(=A17!;u-^jxEuVUPeSoCDcQBvZF z->1Cvy=uUp@h1)D89xizaK#AiSeqRZe3GfuV(Ik3pX0gmul}s0hd;LnRp`2vRNZ(rfUPNL9v zpxp8Foi+M2ediQ=y2WnCTdOa3;MFx&hn;zq7~!r<7pkB( zAd+1w>|Y_$lrCLHehR$qATEITv&0OjN9%jLiYW6o9nkIa7P{86?M^+TgPQAYcW$+R zZQe)Qawxs!J2vVXN+GUA)tK~EM%SQ6I0FECcP)2-RU&&!c>Zq6GXJdbq4Rh7OrcbUS?id%BaW zWYRPd;T+gq}cng z%KO8Mz^2)g?PhESuso1{TL`2DRFVBjUe^>IQ6@MB?6gP~pF7u|8EcIK!i9nk9P)oPr^=;cM@hVb@f3Rpx zs&g&`Kb~yGS!@-ObuV|ipN{^sVW+xJX9oqOuSv+Rwj4Cy$KeL=oX?5DrFJ1RBHPy_ 
z#5ZLvP=TKIQk^&JLzny8G$44f8(QiovV{4sOQCnV&}jy++g(#}RIcLJl!r+Uc$e5K zq~RtSBY{el*V(Dhi!*-_!A3tmg%`JX2(tCJ&*dkQz|Re+o`;D8T?VrJRL4`-S>Q0r%-s!G$|-<2F`*=VGg4WLQUw>S)F!kCmi_hS^4#gwep29)0J{ls0YC zRdg=r?XmF0NP6_o1A?^<-npW|Dm;66PWfCF8P*VhJ(jvcXOtM2iQ*2Uh_;9LboKY6 zAWj{v)DcTBox)E)h{sYtZQ+^UVW@4|XP3sB`k}R2{CDkf2-rWj1_EH?msl0+yh2|V zXTOu%FO}6oxQgtjC}crL(;D@Ufj-E}6U`h3+X>6tF+W1ao)ID^+x*eD=oh0(pTGGM zu@E#@Nt>nqkHKbJDD7WWJ<|#_kH4y8orUM;;6p2P0UoEw0jf2(rV_Ux>X`#vAn&zR zwpalfY2eB1r-P{&W^~X^RB>l`V7%Ry)*G+jP3A&)+0^-gdf4X=r2m;y^vudoedqpbAs1YDh<^=tFhAp&5kKzrjkKLzfK!GZ#~8zVuz9+m>o3{L@Sx*np{I z%R=ExhzXgm!)2LZk7rs5&B-{7No@_>h8X)Eg`2WIN{6guXmh9jKZ ztv>lTvp>#yXRupt7d_~&T+aZ0c^=F{o=*|A8c+VoF8j9{)IqE3ytI||lEoPXSoke~ zEFi1lF6ydmWey|$x`)Gvz5nQ_Sbzps@|B@M8(!Q0-Q?M{7EYe9>zIms>$z>!%X9O2 zFs)~E8>NBRzRKAk0+Jd@oXLFq?+j<=PvYwTCXX2ZEU+h@PPLyLKRs@DQP)pQ9=Q`% z>aT_0g#7f5VFmnj3nVb)r+?yF)6uhMo}d1&?6E(fwL*F1b3&g_%pNOxEB06Sr(x@a>AI0`wJs0J2;G>pDco}zgT+eHlrgP#+K^H zu^33k0bUBz&*RF5X4!*e@#Neo9Sj9EISOh*Tnhm>vS&UWOD%R{gOQ9aRMla{WyF1G z2f1L0aqR8n0yE-1WL0(ZZ9sig)#8t->b#Gu3Ki}wVG+Ky^1EqTG-s$RDO!`jB)Nn&jbzP*a#(*c$zQJ~!4%W&2N zjW=xyo1N65g!-Beqg-PQ5c&c+Li%~{$%2Ks2BHDm6R!47}%zGRO- zQtJ|Xv>-G8L>UV#h(|C0lCdOQjxLn>D}SRU1XuwO&hEhOKHEyRK`09n0!K6wlyPUB zg=MO&Oohtm0Kl&T|BhP3%Gcn&{f~xwP&`^dQX9Ysm~zL42cQ&~CMxU1AtqT!eDoZw zWtF6EMz5pxOi*o61)g=FI$qgg9Y8EmCui=a4LkdCwneJhsoj^G1OU@7@BT>MixYb5loX#z&-TU^pT6r@m2c+cmirz~-3S7};Jf zHRJR4a%uk+a3_-)pt)9nE;$~PH4%uzfxL=d!Yneh5Zi49+>8aSU{&N-z^VVAT>&E` z3_IoKK3gKq(OB^VcIdbSZJGJ80oo>y1gl^T44OJ$_@5f6GR+8XZY6l&yrSAluRjii zuOlH>;?t5T)5M*TJ|;Z_axD?N2sFS{!&>o2Jp3V-oY%eUj`RM3I>fv;L>7J!&VA+u z_)lQBRo+1~1#a;#T#VeaZL{Tt+J$ElSIe2fsDE%?h-}CC*P%vc4-=F4E?;oh!h$)8 z);CuJ;ad|bZhOu4+c?x|+c-@%5@Ibn_&=7Eked0M8Ki~|KONRkE8>$LyXG`w4NG|B zkNgyjcM(tt_-l5*@Z!MWf}Npqe-(qZSW#^ZlyM_qPQPB0+UIX&aapPB?Zp-==s#EH z-u{cXdHGqD{_pHpmow{y^3Ip(^d~3pR9f#pF?q2IVccH;jyWLr`ia2>@=kAFQSfJq zUjyWy3=u;9AM960(pn+>UT1`#9QvnUjh?k@iF@J?-Dm5p@O8 z%>E~~3H*))(f?2I8$fG?@cSDx_Q}ETrqWLoex+z1eiZr`;$I2v<={6NE*Zk_JyV>p 
z|FPT(;OBCQk*<-%Ui}2^DvW;h<4eD$jmc$3>IZwtH@3}}k&1nOcb-RTG<1%wkiPtL zrPG+&dGE~>JBZVqMbA^`ls9(Fs>e{cklV<@u3LGDZ)S{gv^t}gXNhweyJDn|@4vCuzupw3ZmkKkQY zKA^2CwAV;Ts45T=HmbhPFim04e0c<#NM$O*dU1JSVJIW#B?J--v46}tnFQzk@uTG9 z=a#86{?AzJ%GyE2ij}3G(Y%>VJ*qJI0Q!!6K>wVGC;LIbV_%_`Sc*WD;s*@S;-ALm z#A9VS6qCi8MNPOs$ddX3q&pKaZq>oZMTC*+te^Q>I%}vn2aoj7?<3(aIY%zX_>lu} z{25;5k^fN12b2%ywWTFjBr9$wTwA*M_O7sDTf7$YezuJUOR zE)(I#_^tmZ5yoV9FsYbkx&d%^=3pz(P~`ucWtm+l_FATGnR}Q<0=8xS0d!$dW~Ptg z0u`1^gam^DQd=Z66j9Wfi6{XDDat%cd4Bk>U|D|WPH3&bbT zuT;e8qRO?0f7NB1d{CxfiQsz715!H{v;3e?68clqstj;0#l(iA2+=>JT^%8xPmSWnO zRv4HYc&3uKTL%4BJ-AR~@V~^|37w})X&x#?nO3Z8Yt;p|B~#M@%%E*#309p##`+?@ zNslf0CWy!F`z#gdKR^g~Kpy-ZeBu4O-*E4r4Bk_Xh12%=e{=<23JTm!0eSW{cvx|# z356~Kk5yBn5&UPDG}@5H{!1UE$1mr{zXE1rgCl>34r=n-!30fxf#YOP_ArMpJ=rPY zMb3lr^p6;}+W&*vXHn84v|w$WqNVFf` zsgzO`QQI=fy{xg9F}-BBv{CET@U`GPT#)v%O=ZHO)Sik>z$)@cKjhsCG>D8o%)BF~ z%P+ozC;HR%0=t3+SG#z&i^6=6c(T;$DC;c*hE5vh4V+X)_9wtT-AfNyWnWh>*RIbY z_j_p&V?QdhIa0B6DltWNXSx2!bKx$xNsG}?tk?+gx7bjih8W{ z3l+Cz^uGQM>l=Qy)R8(0-#`c${@_Q0IHbf=5=wlN6o_VxPGzfPw8w7IRNjxMuL*zeK0kmfU8IPu}5+CgBAvBak2 zk~aKWVt;Dr_xV0Z;088PVw#mmYDqH0&wBNsPupIu0oFq2heIP>GwfT6KnL_*`);0A za4S$M@*)#CIM+pes6u-N*3^n4d43=vF*=lWPLJ)hqq!^Y#1^h~BMx|Lpg)k{FDJ_bHxusUZBWtDDhl!5IMS zF}uiA7__EE_^f^E!jecap^fUkH84Dv(1jp`N3-}NR83^R!fcTwEx2Hs@kx)tClRgr1x3?e1# ze&Xm2i4$M)f8{D$>hO}6*_8Md zZB187kqZTXs5{HXUv{!Xf7Y|mqODTtv`0aMp zwqKO1C9I+cTgD?K8r4wBGpDduq$C11m#m&Ypu&4;7wnJIPR5hd&bwk)9%S6Dv{L*~ zTw0P&B;6%r%7kTOFWAjk)?<9({{}MS|C2$7yeQ{}vR@9R(*gDZtp2FHCqq;zb;+Yt zw|?t-+c^ie&%aNq3Phkqk$s{>{vtiG899)BB1unYsser=_&^l}6=;#%vlV`JSJ?pg-YAwP;uN>+Npp?O(%&ZVoO);@M$8j1OUF__sjBls22&%bL?|V~d(eZ0(66g!E|!Zq{B& zQ)Hn+5oSg?f-N%V*L15fvh;BRwTqL(2Rj1NUE(Dtlq)v;e$}Rp?j?i28|js4>=f`k zCZX)E5tn>*_FWep`fD`K+9~>N1*s zpA`X%3#ca`G+y94bz=&S=dqOEVUe`z7 zDcM>XNsH0$IeeMHg0J|hVq7O?RQPA`2)BJ$%mh%Y*SgDedUgMdbRNdQvg=~$zX(@Y ztYSmMZ-ihHWS@AHYQ#^(e^*8QO}_wj07v_)L4U+A7;2>JWr4>Hx62MLq&xZHwtr{m z{unx6M(ZHfIa_d&_ok+_v%9$Qu)&vRoW9@tA)QZ;op!Wz77MQVtkfx0q{e=U 
zA87{Lw!FX1-xc6H!m}z&oVzNEh9D`7g>71Z15@i8f{C>ImvjX&JR15Hd~5YVNwQBB zQY^DCdW*yJXc|xCh{Rm}T#FWB&74o1$-zi#I-_Q@zvL864Y4N}b>Lr|pa*q$UU;Sh zZqj4FNlwsw6S35Zth4FGk%gyFU^cK_7n!p^^D=kjV0{2$lP~=g9I;z;JmmnDi6u{j zN^FkId5~J?;)@@Yv+u)ll;Jzh2d3O$t1nlM=NNl-QeWKF$0}w6lzEw4UhIvAUMMBm z^Cn9)u?u<*Tf!(Ge>S>=3w)e-N6x3*J*02Ee#M=V)tuUY$zSyBI8U) zaV*Jo-Pt5Z?fSB30sbthy!e>KyDfr~3 zkNISm@DuTq^dY-j8#CMov`IbM=Zi;aSVKLQmGJgH}5t<6Na3)k2;bc4y<$_T^BIgG*|H92T z*Z)0q&3f*3vKbq+wMtpFYGTF9b4y1DG~OZtdRKVHOZLxLWG)$icsm>l=in%LOY#$` zVHKa0PrGof@u+@cmrH=4Fg?T?!LmWTgWeI*2(#gXBL9ofOpMM77->SRZ^m--l~kiv&kLM1$>2HTG`p8ukB~;>pWfn$ht)@Z^a}6d_1{MRabT4E^PK+0v-a7J<@aVwu=`iZB%?UEaGFc=qRsABv8#+6;UJIR!H3zmGkHm|QF{!q%*gy@+pCzrN#$U&RO--l?!uNI| zd@s`VTeY$5Pk(fOG0calzo|oB>hdc4xw2M2xzp`>CHQv^S5xUmJdQ56*XfcN^R+r& zAB}YFrTOxpX&086b?z+YYcX|%*11RmfeC~d|Fc6dJtrEuAyhV!P({fhl?1F!CKwL6 zX@7Nh2wvhdtCE=hOyY$>M!F6cT?DQTS|eV#M%rqyALR6-yc{gP&`%EDmSMgK&-<6; z!ZNxb+AoE1F24nm@aACEv`88&L;V*i>Tg82gQ`ToOdg(Y^6;mjr06fm&cgEBO$ArV z0?ET8L}lstbd!fK;N#2yJxc#d>y*XG;bp`T`Un0@V(v4M#BsbexgLuFBCTICUoW0) z@5>yrAgC^vnPlAAk+^y2u}JzEp8wp zd5Btv4ZQSarykt=IxVHg#*WOX2Ujo@!lxFgkEyZc{0PlAz`Q$h>G>YIX{e(~| zJyhdeVx%KIb_54u!|HFD0IUD5FK`TfPOrZx5U0n!J1Pr>HoQvJl3Df)&1EN~)s^j_VwV(MX*ujs-?;v|=h(&A2!o#DE0 z<`HzEcO!b*>HHJwY56GY+2#Jvna-tEv?ROHJuhl7qN$f_mbKdb+LyS3zb<$=U&D)G zRY3}?RoOk!pI)x?b5P-#wsvdAt3px0suQ?CXzR^+&zJ2t%IAM#D8XtFrXHw^@to z;GJsR7^|0@Qr_EO3!PRMt02pz!9P)a*)8>xjg;T%dem>V|G?gWJ{Za1IjB$n-mX5Ub z*tRdPH3;i~u!>065A_s?PbuIzSE)OlE2o7XZH=dyKcnb_0H3zx0_M#I(H`aP+avYP z8`I-eBPkutv(24nDcXAim#GPygfPQSLKr2}yxXS!G1;sjNaDbhb%2imMJkts*BH8WeAE+LlO_6Xt zNQ`O#5x_sCjksjCJPjHQEi_BTvARbiiIW(T+mNTQO`c4o>4}<_WdKEgqw*Oiy?s&q zwuM((wlE;|3QIB^{ya@+&%pYJ(+54ZCDcscthe=Ch*(@nok0qMztAF? 
zr`ryby+MWMDV>A9t78yNDxSS+h;+TfNHUXvBKA2IR>8*$AhGXO50Ty* zvz61A%pNBgTuvp~zdG;|@k~ShgO3N4+uLG`v>)X@UvdlecP*f3Jb9;n0;BE0Qrz%nXKQfVvnSDa>HXM%LGHbM#Tc%E1fq=4E`rLSL z-}UZ{2RFlwK?UEXf{`-%qe>vYTjWTWl zw6}s8_?%5b_CCK9(MhNpl_=|uwP=VW_R=b1&6s%P`5lbR;!bVf7XL#fyP`Z0wF6X@ zy5h-aWGxt3*_?u2Yq8?O@MNS*+ohQeYMDkVa^2vfhKDplRtVmh2nKa|l^3=4A}rzG zz8Gbjo}PqIfRRUO_1bn@Ob0`4sBK>n!Wx|K=3-B6+ryAXp33s+i6s86W;%POX}|TG z9W#MXLiuHbmHy%as61^-x34hCklFS;Tz~a(SF4qT@MUymdPF#ejO^lhdT!%0lKnRt zG(j^w_5vi4*mmFT1`q~gd z;`1Wu+F!9_l80A@gC8K67yz#BHu5jhlz^eUrp2v*gR;y^8bFTTr%Ge{aW2H3NwHU~3^=#U zfS`y)hI&@2^sX-qD9S$W@6N~Jy0hGD@-;ftJ?Y6F7rq)BUYxjn>y2uik;FE*-(d*K z5*_Pw?|O<`Cg|K&>5rgnb{|UX=w2UE=48~Txzbu6L@$FW{Ifo(YQG_Slfgx;=l$P0 zH-Sv2A@qEy-(oGp;zqigx_J7)>vo4*qxW{qYE|MTY-|myn_8LP_OfxLTv4-w8@C;| z3QSlvjgQ*bY3q~o6WR_Gz95!{M-n4bw%Illx>nywWEHfGVVRTuCB@p-T|wbbfmp%0 zOg;YMf2&ABXyyE?b*z-jZNz@9jbYF2`;P+@o>3QZ4)XSXP0i_B*BgT7{F=yDbHAuS z0@tz7f2nh34SO6jt0IZ-@h~vqU=dbq^bgnQd&Q`gJQSn_tMXF(hX`5sU|ZbNJ|++# zD|K@&GEF(bo}G$(P&(~U$qYj;=V!?fGp~$Y_aze}T6yGO3F7A0Yv(yp*0&sM<2YlI z$)c^s--`T|YGUS&FHcT@Bpr(q>@=rdm*y?#VsBFg-vQbt^+Kmp4Of3-Dfa3)ovOoL z=Fg`xQz7R0z<$0>&Q+7$m+Cwr_(LADGRZpO*bnH1sY$}wYfwmIowshFN~};5X*7hp zk=I*oH5Z~or0WJpJ_=9Y0uNBrTMf(x-$_%vz*AR3X1CRngY*6YoFfT#njsebU>)hg zXp8aZ!S*LQ@}CQkz~Ob2D80>F7u0L3wlCo(FaV<$N}WE@=1`d~n7P8~gEd%!vR zf{{Y`@pP_N4sM-{UI9)Y3pWAAUw2MU`|I<+@wi}j?(||aw)W8NOfUZg?NHyh3g-b$ zj+dH?SZV-*Tbmc_g^y~QXBA~yHE2E=SNKU=S5Q2bK7lp5w(awUpD1Qnhl1T6bNY4m zFS>hFHUOyFI3_lajfMG=EsG-#=JC|w{T81b%oS)4l|?Js>Ns0o^TaW|IduW{(bI68 zb`FUGJgz>G#8nIAq_P6yK7h*~0wg+Vi5;@bZFEVd71I!JyV3n>VPSvf_`2~0u zSLz6@Pm!-?B4786v8`(>J688lc!Zozvf7*4FKIGiO^b8uqe8G-Dy3N$XQGTDKciu0 z$gmrnSM&oF2B`*1yJee&M(xi34~80z&*VDcx~pFeB^)TvBCw?@BH%6!F-ybz*kfch?7gelBk@*3$fZ6 zh{ZoC15a2G-83A-qdVB=@=y8=J%h`(phx=CSGG3y9dQ|W%}U^z z;-&Ag#MAy+EMRS)u|mXtUA49T#15+MbY+Un8f(O1Q#h)>xsNJ4dN)^yiFS%Y2|3cwS2vez z6&SZ|n+-{d%&F4&lf8aD%s#BpfgjU9aNjXUx|#@^RCd1eCHuE#No(x*4{g@2A?djm4PnWtfL^gjz?$r?L->5-Q%d&!)%$=8W3XXuYV zjoK4Y{+(SncQ#@w(mYYC#;1BTw(wk?CaY(9dH;$ypT~o|44tf+$d7)X@`zha`SW@1 
zFo1;T>O3X+Dmgv6V?9xq2&jJOyL(evVLeqrf1=hOmmj^|)wA8e)E^YOp^H?-4*d~e zH7?CX64yI9un%?hH0;=)2I@I)r}b!7RF9^_@}p08=jz;m5n_jW8d}5}IiIKWbHTW9 z@rAD1xjL;f*iD~`v%;S)Z<14^PzT;M%`QLJGk!y$!5W5LXV6 zsnVyJ2zJDV5XgQCO89k^9lR6zx2Vr}N5le&RJxO>+XB5$_Tu@_3|)nwMyeB)9X!wEAmgLfCy!)KgF&bVy2q0AH7n( z6Ky;$_q_V?@9o9h4R`uzV@?}?QT?xbtDQkQnifw6M-JT^bSN56-C~~Vbn<4N>5LjL zsgV9&+>zQw=H!^cMoNZLA%X-))_fqpk#UtYs!0_Q^HSp~alG?e9uVa{IUv>4yC$rc zPAuR$Xo@R?jUMSw8ww!{@JHXQO@FwQPCE7^E{u{vC+Mpx3kt81E=scA{Fs>Iaw*<`y!U`FC!%Yd@i(C-DAMn&Sd2v z=Ktd)^z~W>9O^q@cLlW;Wp1Jq_-_VwipIcDL1tDA6_|xkadxy-@LcwmtGjI<``=g_ zWUF|DU;FG3wy^oCOrEZpdu5B5@70g}`3S_F#7xYMn|01+%A1mDl5N!Ssrs=O@g5r0 z5J^PxWju=^x9`8<-s9-|dn}Ye9krS7b7Sx8-1}%GaYX)oRc4p){od|K>v>=6(`U?@RU{%|K_Fu-qZRe6jD_Kp2 zJPY(WRO<6)VUabtDCg=&tf_7 z-+x$jvSu=0rZFvvD``^Z+pAa_VT+-!yS^&-Ju1^xg?~6#**jdR-fXZ{ z*duMa@*C7lEhFy>%F6n=>_JBOO(<_z@ATlpjSjJ5WnmY&P=!K~u3L1$nl9%n=ho--mQ?BnkG+BMf#WJ8yRU`gUAj(L(;&Iy zzqkkaQE=%_?8~QGkgyYI^eb)C;?UuEGAk~Vtqth~l5k|$K)e_c{bu6viKslYtZf(n zCrp&_Bc*^=(s3lv&q4gP|E7X$Q@5{(^-eZ+nPqP;p-w43c~lr*;nh7BNxZ}puli-H zCw%~p-S7w$O_ahRm2GDz>j+zzuu-}QXX#uSxE}iL>*424B7REgUz@Qu;b(L{8x?(# zYP9Lfm`Ll_JclVUQ*0>0DV=K%LPq85ew-y5a2-@MP48bhUv6*)iOEmoE9hAVa8oym*6b73@sY<{&Q}|C9uUw!MVz^2sC?@m>E}t>5JGc=0=u z;Cl4WL(b>)g>dpWy2*dZFI^Q(ek^&e%{}(i!gQ-NHsXIku9<$9^5T&~@yXg}?WX>LXd1_3DZ!vd3vpZb(sNmXg zXCAfqzJe#f1zDzScXUyP+oUX3BhGV*=^4#$)8QzKmEa`|9*b$D`a z^&%5in_KcJAsX(ci;$4%!U%^qP@&VEAD}us3qa`kQA5@9cI)}8LC;qQJ)cMwP$28~ zMv$)LFc9v7O?3M!8VJ6utZnNy=C-yk*-;MRcvE}CIse4G&p7?N7i;f{Tm6IJN(#bz zwi&Xov%=By+vM-IC0_luzhDI-rL6tzcX1@dE5gX-Urm=}bg{Qc7{1*?J>-O$Qh>k5m;|C9m17iC(9xJzbhQ&2Gyi`;U*7!>^yM;%9{(kM`EB?AjlLXjV$R3V7kL*Z|Al-?{<|Dq z`N39HE9gr7&afj#SANPgmpuwyS+LScP9JyP(8=Jiv0qkP13*R*1zYt`4FZI%$f!Nu zeGOQB=^Dwc>==xAVOOMx!;0EwD};7gt$oP$!7NCbJj-jmY?@uH!XGYTw&vD_)v{U)t$G!1GxF$ zTNZ+;C;KtyH}%}RBcQJ5Qvqtrs1Q)E=t}!v^wjbDoIiw};RL6;Y}3s)aAfFy#@}ms z3j0B>Xb1%3avxWI>$w~uYE2K5kPOu6S>>UeCZO~iYhg=}^*3fE|*nBnM#Lss}@z>EDvT>K%&v&y@=nWV_o 
z6DtQRN8jM)y7$}5Xs7GP>hLR#W^&5z6VOirFBOef+Dt6&4E2I!0B6A}#Fh3OgRn-Sr=6?IdnawR;z^~yo>9^H!4d60MMOm? z+SooAADO{L?Ssa=lKm0%0er3U_dE{mNsPd$S^MKvt*bKs0oJ2bP60c&iXvSS{qcC# z#1sg_<~JchRuR$9pU8QTainj|c3sG&fCf*4c>RI*mkPw+WyRrEcaA*XPT{&S)iflQ zKE-_SjlZbSJ1N%JBq%qY8#+8^_8OP-vQ&yegVeR6fmAz zB8V&69As_Pdy9B%HAnb!gO_FgAuM<;oRf2_aaH!WY9srLu%%(ag=#DJ2kHZx?CN81 zak{I3e>WaK<~|D}LOQ&&2>I|av;&Cr;gN-Nv~wP&qq=}R-6w)y@0MP`{>TNM+CPBy z!*reg^{irv;Ps_14kU#m%OK1rf_el;a%Y@5`8PE<8Lu3&t)&4Z!P8}R_OHS73p~z> z%AG%`F=Q4&r3(9dTx0SdcTJ*a^5ErQSm_Mu5#PWmkdBx6WzEbF*gi4&EM25-+Rb!A zxXq3m`di_gu|HoZ$N&3;n@KE=C;v(O`rq$5nyRHBNlGFd^5p`$gY}C*_TR-Rpap_^ zk(G`mAEzAqPJW74&B@Kp$#<|oc*%#rGt@I*XBTRDLc>Cbj@>2B0hNLkdoI|?#2#c`7Ubj7S1^D)u?v0x+Jf9sHtvilxH0J z_C)TPHbh;yU*(=@3)PkTYwnrL{NA)C2m46ZFZGhcfCT?AWaE^csq_7_7HXXgJx=Rg zyGB~?Tr;|>M>4v9FgoYYqi}{d#Bb6=f5@99I<1Ssk5pLU@2K$gR=9%~MQnFo3}JxY z4*j{R-rbe{tt(xp(%+}_{A~}rgi9sdxb@qUxdM-tklyJTsLJ_dqz-V$ED=Rg7D;G>24rYv0^9G*A;V9p;AaI0p_reJ zS>>EanMZ<24ke-vC4@40e0Y@V)@cQ%$DgWU{Q`g=Coye&z1hTkU~YeEd5~T%4m*uqfhWj;F;c}s51CO6Z2xO>3eM>3t_N6 zXGN9RqPHok(aNQ!Y@0+VHQ4LCtn4(k_Ia)n3~-kRHJRV~hpMSdpQcJLq*}t%r=ZBt zdyYe(@WEu;ix#zwim?k7!&jtnH7$u-4nWwi!kKa2#j|C)>j;GHv@rFD?!d5n7OAlh0?KP5vaQo`IP!-xcf#8oH@a}WY!{dB) zL^{2{N#no~qRcsDmt|Mbu_JgR%i)h9XUC3o)c=EqWUe zlmw7m&6cH1r~dIU3uBBtQw6C?6-w43) z%=?<{X8{|LIN;4@uXj>ZP}GbJ-gW|bLnnJDn339{%^Z7_?4d8=*;K+p0j}C=lerk& z83z>Osj~vy8TenK&Y-^*pw2tD33XPAe_U+)&fkz+J-8| zdglZIA#lmsFhjI(1w$E=`glxcF0bx^NMa6i)10crBKAVt?)F&uTEg=Oo^(-xKY#<0 zA&j{vqC5?V!yu=X#8ln|p253l*Lfd;qE{h_S`btFNnYwx`Rvf_G~)>+v#CgCl2=2?tS2v#nO));f2?pKIII*mc$Wq>J4g1G z34Hl&Qg9;3F!mq>kTXpYBrA1JDX&PtX*niGQHvw$t1_?PIMYu)4JC5G-nof|cX&T| zds2Pnx?Mm1qOGta6hlM_P+o>R-9@>v$5p!EK@DvD2)&Uhqd=2P+^1X4o|WYypRe|I zo;KSVH-j)&=AU0eD^nXgmrD0LD3Z`dl1{Mfn5jIrzvd3-rYod1aI|JYBHQm$FU^$&JCLly%ad!dBJt$(B6BV?jI58i${SeMUm9)?>kWq;;l%4CHW z;2&_|ROlcDmH&%dRW;9~G*$?p2H`x+i9mE`Ja^MgRpI3@E7J9hhSa%oN&$8ex#p$O zy|s;*W(S+YV-sU-ElUF*4Z+W7cae;>N_r>vwY@aD^w2eL5Z`|enm=Oa$UgmyZ+a62 z)=W^Jr<*JkyU_%au(p%WJ7$hAYCpSU=D4EvlRIXTq_#=k5AKt-2plqqD4!52uzPsc 
z-LzlD6TcYRL8jVp*073q$5JKTnPb!|Yx7IMH~Sag1Agwo;eVl{VNI9i)#1!=ww*)F zui!d2e|jh@vh9Dyk`G8ddsMoa3_;2P(|;i(Ecb}kg0%Sh@#nnd)`EO~SPKfTV}e?^ zF^r$03m&wFYhjumsK|6(L=vy)77CV*^JQ{KWghR5-(=kZ$iXW-7UZpvOZ@!L5?dI{ zf`#$_*!vdnsH&^&2__O-aDsxO#X4%#q=F0u$>EzOJKGI&4h#F zo4XwhMM`ExdecU55WRjpa zO~aVk4%;W8`wK(8zXUC6&oL$bT0FU*_)j!G1E$rupz;{2uEy7i``pCqcVJSzh<{1(F<0sEzlBs;KB?Ewj$&W zXKL?ceUgmOgwN59)Pkn(p@Aa!8^#RM)7=ZffMpBo#MyGZ?P6>&;achW#$+9z{NqKy z@H>oAdaX{=tY(JA?eqi&>KbHe?ifGn|q2f$cW>csVK2vU2In62= z*Mg*cZp2f+7h{I)M7!zJkljS^_@vK~kTO|>-G`2lB#FhufE2s|2*D_w z4|Lb_$@O@WBB9=BR3FekZDb(8S%@e9g!p*no(GXF2!LlE_>W~kZj0rY{3fwxQMcH3 zxKr2bzu{sjm{7f2Fe{6L=T%Dum!=6AsuhwMIxJZ!zu0gQ*>C=<_d}69u!cgWx zLBc>D0HK?O^t=dc(wcZ40AT~OFC$3OJ%%oI|m= z1)c!lxEw+dXPr-AFs0_zaMVt8qOk$tv<@TR6=1-XF`0dcV7=;5Yf2IZsaaFfHkK7G z<}P$=dz1bYu^JA1!0ZhhBwNF4?6t-n5kn|rcldp+@r>==W-MhqX$ybN_Wl{(Z3;cH z?@{`jBI<9Td!Y*_HO)c3rayJX&9MXvtSA>TU^uQdo`5%-07%6ij!3j%%DZL;tZA8HPia5JxSX+7n}g-wLP z(m6X9?2VMJ1?!uULU_~ANU-zSdRD5$$RH=#P@iYuoJ1}}2!;rk7F;5gI{!9L(93xe zI0{`~zpk(zd5JkztM~u2IZs|+k7@E1v>&Vj+!!#j3)%`7f~M)XAV}1RezUM%>3fT{ zpmdnUQa;pIfyU~uU+9={xwDD#G}e910N!_=Z9M_!Kk&w*$CFvZ^a*0^my%4Dw!S{x zim>F;8o*}CrGpohw)K+NHosGvJepf>Uw|NeL69o&P1}POah>QMW zlp5sh?OgvKgC2z{q<=EfZbn+=iGeHnKYhvgM8pf91y1y``@tIMEX~x@aofS_$|%ha z29LBL&KScorPuung$b%2Klv-g2TztNuV1J?2M%WRkfQTJN=-Fr0_ca`udd|HNVrrQUx*iccXo&@K!R;aoy zbx}t<@qqs<*AK~r$xKD(58Qj=2vv%Za!I}c$<_3f){KVnU@+zbu=ww=)<9;}0t^Oo z1T1I61Wi4sZ;SBxe|!S^4ds1`AS-1H`arSzKcNoA>YLi1s|(f4D>?vnFYd6G{Z69=Iw^d2LLT4Y=c78RB^{4q>E;2Zi&APP** zzyzCOva0YzHA$w~NYl4JhRkiOE$|w2GuzQcRMiL_7~K($z=5>j4N$A}ICHDHf$4F? zN|=-!?v^ezyYyLf6bgyO^Df|7zvoX9N8nH*_JPfH*=l1b_Q%y4E&4s*7QTUYauIH3 zoo}>zRh{?c1fhiC-0!nZVU4hLlpXw`dCzVGx*O*?NiYeI;=_#%y3f z?q8OGihTH_BoUTiyEASP!uiK{vU&dDwOL-M{7`~9Kl#p5`4^e*Ud0!;@v*LjCF{0G z#QRCIxKjc?X15*qE~56^yuha)wu;2vDQB?gD}3@1KjwWd=I5akEfay8ZpRln&TnV(Xyj;rbBKaGlmIhrZd>lb|~tgLARiZl6QQSdC! 
z&@(UnoCsbou-RIMNDf2dp;L?FV_N)+m=^z4T7hFwom>18qM}-SC*X-`@iD5!JrCXD zyYH43|6E$^Z2bAL#Qa$W_0LPqaIYBXiPG`;IGTZ0KFUveC4Cg1a8LS|C#cZ zim)QbY$T~)Wv0IyAI_zBGBsA?Ogk!Pt3NU4b@<2VuE@bw*~^GW>9W3Pw^`A-guXSk7%q280*3gGw2eh*T)1l~ zB$2v}O?WZs!-^~}2PHK970HQQ=fRp_!^CVxQZnuABNgPmo3XyR>1cu{05|YIG^Z1?&v0i!u_t|HPsI zOTiYj9Bco5cz|OzCcA_cLpx0V$Az*Lu|*QVL#m#J^cliokwkRLjKX`0nH=%MtoVF0 z3`_Cktw@JB9S>M>L-hu&OD5u{;V5Pfo(mh3xq>C&T=44`%gpn105TkOUEVmfK?zD?B_zFA!?xdOQ`)g&f$Z>5S?f>zQ#dKxg9D{#9fLw z94N51L4)Q=J?Mr#wF%rOVJ5`Z7UEa6ss0_Al>lX7+Kc^B%^Iu z)Wc*gW>~bbpirM}I4lr7EO04=(lJ6{xOyulx5F43E+CCixvYUfqTNXRV_*pVBa+k~ z9xYoXQU*oTcscmMkb%?H0I?4Ysw01FVC3A(zCW^oA)2ghdY}7Pgc=yX{*aUZ*AOMm zrG_qoKO)VA2F@2ln!vh24(uUZwpg7_u)0DajKbQY=~d{R4QQ0Gdce3k+*#stkc(}W z=!FgiE&QbUZVifvoh9~S*Tb46KK)r_miPqkVb;z04safNV)%3PfaqX_)_`!S>A;#8 z(!!;4BvX2tgw*x{!NTP5gl#~4h_FKrh({%(Z9q&$V`2tGl^PIlfkY2FAVA?94j9Qe z)p12Kwwe@g=G`deoGwW3LifRhybq{p#hE9izgl=wQv zKgD>9){4vFVtnPA0$3!?+ifqOvrKv51;iP*BLd|O054kpRlL+U9`C3x2M@i}tgojn zK1;>-Wqg5(w=aOle4uoB??I=>w$9PNybTJKm+@)JdE$AJGeNmOaKxJ8k%02EOSq z^nXbZ=56)8QFRHvkv@b%G4S+Oh#i~&jE99F@Ny&DhnT-oAPM!HfIV<)+vo9D1wn5j zNM9C`=v-TLNJTGY^dAsyUX$y^g4(<$cf5L~rmzgFIJqkKB8(yOr>A$Y-kthREd7lK z@e!t~V&ggcXE~;Km>REq!?+Et52wOOWUoVoKZVAbl|KoY!L0m`>Xnsqy(yJ%zC%@h zHqgh)Ygzfp_y`wUY@CZvAmZ9rA{6@o-a)Z9;-UZTCUmmNY>R7*MeMv#&mvoDze+uU zspmzf?vEA_*+Zb=!Ly@SL*o>ShZ%ZWm?67^y_3UI^o<{bUYYt5nGI5q1-9MB=xQY+msVqP?FvrOoiyB}q zBH)D_H+lZiEYB(M(L=#7mgvA|m=21K?x;XtjZvViXJQz@*+{HEc(B;rpg4^9&%xx? 
z&^;F|jbcKlAOK8g#!GBt&wKH(m{_6&<$bpSIQ9X@`Yj-F<5r09s;J+gP#N&+W6PY4 z7?gRUDpRwS`3wpJxj)1*T`Y4b%Y=2seOhK@NJj2Qj`yIX3*?ri#^HrO*=U z`Pljh_Fu<4Fu`k6^p6&xgMY@+DP~_Wggci_PzS}KGxNZnBox1%`~YhtOY}7AQ4?x7 zg* zfT7#+QgMAAIO3zdc9`3mAJj28=NjWU3eA4Q?LMIAOLf z5xsBx7<}9K2>T{_F$jkx-vz=Br%F)*x`6_W3X=V=%v7~VrJo`n>vS(alFW|@{5T39 zoQ5vMM?`6P;}lGP22CZx72uQ&Rsc5^w6g+4=r01N%-M^@Jl|w(;;c1hz`nyWx|?o$ zud%&X+1~H8y?bo$0o(hnw)giQi#-7i3wqgDZR(mngsnu5cL=F&E9>3Y&qUdSVO~ zAdvLXAez0^Bx!*=cuSlXybzQ1eJmY*FFb*C>=WwR%`CjChn#XJ^~gt zcPvKSx~9AEe-eSjo+T3pq?hn$=GVF=xZdJvBgBX3XI}pkX2s<{V?ZeDB5XXWYD%{;FbKjihi79DAbgXlG@~dn7lohCY3cQ-}O!M6_ggB&K#>w_Rf;Rudv`i6Z4|8 zn)wLaCoKxxw^3Eflxy20Aggq*I`G;p-MNv`z5C z5RwJcwKa{vo3E{DBm)p}7oKLWzGfPNZ^E`1L4fTwln0H+eQEM}-$p(H@3skcSjaL2 zfG+u(gI84%iy%f?@YENa;&A+03-vkm4GzS-yZSjCzoEO+*BV7?((l9s=Ls&6k=ei# z$T&?A!Z{Wmmy!c5Ja~NE4WzipZD6}uB~CL4q5kW7l#u(qzXcajNHM_`L>7p84xksJ zcJZ49y@<4+S82}B6~wi~OQRY8h>Dkf9=d|PF7b+<5Fb$S(vL$|uvc-t7bZ3g9EVkh z7io<17#Ld8j6m}B`_WosrYNZM0eO^#(>t7ROx`3%{dPQ;f%Z+r?(G!?lL~%QP%_Dw zhcXwTkh5Uhpe&uR8mSP ztKcKWf>Rqvb$iIHSI$NBIVyUD8T~UvgX&2RN5G+fKw)+c{XV~Gkqk`(dN`ItzasKw z!iQ_sRM*V5*g3R)a_f>q|8R{UuL!*m!=dM6P(^WQ#el2{@ZN(l>MsCN5iIUQJtyP% z>hfZRop@jeLwCwn@arw}qxtojphSyb+vxqNd7$@G5vRX>trEQ4(6&8>Jxt4L=QH!2 z;WNYUZN~SuG^fLGkm;Fbt@#eY#`waPX16U(i!DvJ2P1X;8tKjROFX6PTnTHW?j4hh zBU`7kZzQ0}Dc^OXfz2Lt}9U!*S_$$=6a=VTAi(}&5G4bg!@wfmps(jdw@f+=iFS>fv@96kM*(Y>t zRnKty6}Z^#tJqsrFiyQUgo6RSmD)e2Q%e6N69!wVOFE~*fmeal zma2c}RN?kC6AsGrhiH%BpY8D95H9UFVt#tKK8c+i9c$tlEtA+0(XqI-!-Q30lcQrX zgDIT54i~=s8bNe4eIP7~KA6T%9};h)4~e(Yhs4|HL*i}pA@N~+POdpT`cU)?XNR5W z8DUZIGL0QxiMPQk@iurR-UhG4+u)V>2)uoN0=)J0q*a3aP?%^j#BNo=pydVOVA8S* z-srzB+^<$jKW=WYsMJbzY3Ecpfg77Dt#hhAW~z>67G?3%!=dN;g~-rFdsJ|z_L!;d z?J-l@+he7&wZ}?jYmb%6)*dTWq&*BdRC{>ON~jN%2PTW0EbY0O4HM*{e=a~+tAAD) znfdr$@2`r9H{nIcsrWH5@!2u?&y9&cf#19rTsZY)i!UOi<73Ppd~S`*30?DdoJi|5 z9(YR4&fl5FJ{~0AHXbD2HXbD2HXbD2HXbBCG9D6t!g$C}>jGYcMZwE7c6cS;2Cu~1 z;FWkAyb^DNSK=e^ru`In=X3!t!lK}18auoaZ-ZCjZSYFG4PJ@2!7K3*c<1~ScvHH7 
z7hzHGGL0QxiMPQk@iurR-UhG4+u)V>2)rpj1>W*5;6+#zyi8+fVW7n*<^%`XrZ#V_z3#V;h@#xEq^#xEq^#xEq^#xEp3nqO##%P$II z_+5S%@FOe=etbv4FYz|`CEf8|7U30vPu8 zFvHG~285y*Ho{u{{s_ax_j>=#2*Z{5@iFm5G4WY3@dIMwWu6H)Ag5!i$nl4QA2;xr z^Nn-mi^A*cn+ZD^_rsA!c!Kj8?8XYtCa+T}U@#VTVL39A(myo}aX8fk6KGqiyw0gI zI;X=x?1 zmkRhT*eSAU-W!vlqJSun@XS)EkZjub>+lVgm7LqAQp3`|55pE>MEmZEImZoW%+AK& zLFeKQyi9Ms;nU^qy-mJ@vJ%1ZoQ*T6nc=x|p2BWnr-yL9md3poV^s@?-i0?tO9ze} zD1r$%cov+yn6wmPE_D{XSv8^>S;W{DpZufVMNCoUEV$SgVgK3_2j6hr^&zvRntv>j zhW5v4@aM%cmvusi;Y_6)KGCu8+^<<3L&XHU9>wc-&XFRDQ3L=tt$dSMG2Th-{3Fz` zC0Z=+*d+tuSO;6;1a(^Y*f&uj z8pAs)B(pB~Gj(UcgMKQO_)5isKBJ~qe0Bt`h%ZJ}FCaB>YITYJJAgn9c#0I8wHumn zXjHay8+oZuOn5LJ*jkc$*rwV)1@_b&4W!e+?kJOLq^G_-3B%ixuen)?+UMw?p5Sa( z!>e)nN#)SPz>_4>@#CZ@_-nL9 zD$kPn(gnTI2*TV$;<>{~RqcW4*zsHu5y!3bUHQ(XZ*f0QoF|SSE60|JLpX)F@aecN z8Oub31KiG@A$I>=NG8TJM2}?I%yX_>SAbh{LJ2+`F{GCWj96zXT#SMM-gyco28Y3W z6%@q42l|R08H3#mphd+wAd&%3>V^6|%bjY@Jk!|E!au?HFA)aRbY(beFRxyFkv;ETwsEO z@hc}FqF6D0xB)pdwp8~sB zel}x6SxqluFkv`;Fyn^~^80Q2NU;9pi7Ix78M_Lxc5cowe{gR8F)~NE`D%Vqan1wF zHg4X*jm_T!6yWBsPYQ16X8fScoJUr$tT3lC+5ed;nNMD9V^QSh7uPYIj10CS=xEc@ z*CX8g8+1byH(L!5=)j|j!!DF{?|MG+_T!=JRnhwK;dcHU=D+ZN0j_IgAyQvj7FKdw z{C9QxP>b76vR=#^Z`QU$NK^dN&LY2>qFAH>rr1^dCd3Q3-KKv|+5#!BL{jd9JS4Pc zHa)7{KLEwVg`JVFuU5J1k(;2ITviKKC(*lzv;DHjctV9Z+6dE|!W>QRx0EeCe6T0Z zBSdgug4n+1W*=S{b0%U|7PIF8WnU)4IVLe}!Ce5ul5uNH!#HYbt^lYVO=ueZLaLiexK{ys@G)BYW_RFyF}?8P z&cZjbP6&LBl>YbJ!OP%AU~H0qlh(AEn~im7<#~ED<~;aYcplLyh)(lA&1evFU5Yz6 z*QM91_Sfc=Meyi3v@mhj zFjxfPc)`b1QI0 zKK^#P)7KOQK6mehRm?6=;45vfKT*6a^9TUm-qqW=MTcJb{VI~DUP-wGyVC0jJ7)Mb zS|}f$)@L9sPKLPgAG&6rCramWlF(`n4F33f*u|f#O ziSxq3zHt%kH_gW+%B*%5a~AO`a^FzgDH@!udxsLWgJO3E{2zImyv7D z<2+!V@5bDglD=7AMLs~xknE5@pC37(Bushwd{ChHc&BArim_Kp1|Z*o6orW|Lz(<@ zQkYp#$EJni-(TBZZa<2_fiN-9@JN+~9a)E$1%5NkX8plUbAQ?^Ihz!IT-%Ij*ki` z-;uYkQHWY}|0Id3V${`Ul*{ObV%6NauDu^$Fh1L}Ia5H4MqX#rZPL3R=}i;O`G%s* z!}!owg9n*e^x0hhfiF3Et^o-aOtzLwtJ|Ma6|8Pg(2J3bg{QD^m+x2-nwsb?MTwDW zRg9YGh9L$;%3O!pLGO?6_S=|K#rMOvaXFgEYK&p#3;E*a%b63S^&}Hg?uygTOJp!v 
z``w|0EVL7@Zk4^GZF*4q7ql^YOABnEPb7zNHPDVuYQ($J&wx06-Hks3l4*-78}2a^U~%|LGDhCMK@K!V6y08K;R-h~S* zlTV_QNPxm?Z}>7G$CL!T-^`_yMUYQ1>=H&Gi^${n> zGou$Wnp!q=O)(~UN2#XK3a%jPe7Q9K$wo?^^ zv+-O!{95=!!j*rvEgl4X!HO?}6?NWS77tPs1=E`@6a0Pv{6cV6Vlhyo_l7LYQL9r> zz|6eY>=|GDWOT<;Bv)xw2kT9QA@I)fegrX;bu3yA&URo05cVjTb7TlpzmvlO0fG`B z;ah6LZ@?3VAW*+yg2jl|w%3`A$oMVFRPSqzKbY@ee8zo|aCR-%OsYgVxQBHT$erK> z6FPvJsMcjudb71oh)y-_JxLqZ^lU^4I^}Wl%Mx@hnKWdvooc00EYkh?moi@TK39-l zku>ulN&6uwS#EEMS41Uc9EAv@7)XfV>Y)6M1}}+r`<8?O(8Q``3%dnEFmx+(Cs4*J zmn%)!SkOB<{GRb_I#sKbOwd6P_$%Rr10EPp$DO6mqo8`Y7H&KP!!uZz7%J`vj87E# zmUQTbQVhopDmf>#`+i0L%XHBS5f-+M*eAeC!&;YAl!1zn$p3*#j`A_Y#UJ8M4c9#$ zFK}SyB9O9LlKDl=5_`t|&@mBc;_~J@yhm^%#iP^(?GFpTc4O=m126@I2oh>V2JH@7 z`vfwtbTJxz(u@z|SfE(V|DX(*1rAM^hbH@fjYj);DULLl4%3`XPSPp8?PI+dEbglf z?eioYF8j}hI?CWp6zm@nvUWLJ3ll~068`Mb{yeS{cm=7DyJKT}vu*taio6~L9bSL6 zkG1y~&TvHcR}bjvq`wAr>aTYs?T_>qyhGahYd$Lak^Y)0a2%w+MkA6`JqAzdx0Us% z9Gy1+0n$%qzd4((1ybAZ4bR7i=)dik*!piPJ`8%bi|)Txc1lOZcXJ)!AI;wjvINfwqA=`EhX*$|AKjbv4H;7(!WU-~%Eo)+Ll zjhh?yIcJ~GHKRWrcYin*j>U{KGO?g7J{}~AGQ#t<9}Dy~ebu^RwJB%9Nz118zZ37*k@yM7Y-1kCxk5^%OtDCC4M z|1sq&<7_Y|8@+;^h#&f>wKQ5vSs@_f5#%gx$^t;7C32%hs3-ZQ^P?D)=;n|zsN+<@MWi73BwAA9QtT9 z8<>udDKLTPUajxD_$QPnZ9kg8E9LbcvI&2Kq%@X_SW7R;n7|KZ|h@s<;kfD6@mb(;z&2Z$)T8fa%fXQb?en z_7&{99w4(GNbS&rpaZ}O4n247aYz?GmHO?3UQ0O}^c;eRrQY2mL3w|_ z9R<7#bm@K293@lW2+&63*=q!a(4#X@>61L@B;Fh-#F}GkC4^IH;#W{5qKQTI-(&Zq z6709}J?BEY>(`$JFgaVCH&^2W%zg_JUcghl7#=BM48Z9oh4F?pIX1#hN;wmlUm2S5A@#y4L0Q003w*4qrk`=xiE^fo5cf`$?y~ zgf7yJd%&Y3EDRlg{}OckiOd|z%2OR*kB2^QjQCT?%Y6d){Ra3oczVRzA4jFuJv(P| z5f5R7`%@GU_T7o&1Wi8!JaCs<-2;*V)i@Lci7{v67xKkjD!G9~<~zASWUe?~JymQS zq@*XMB$7Q!8i<1Kj%2+BuaKugJ$AcuUvoB!3(UYPGJl~5%==$(x#(KxT6g-frP6cZ z<8cn753+GSOl{+`EDutZs0l27$ovsU71VoKtvVgB_(!0(d&K!45it(E-^};`Jw7LD z!5+!&i!|HaH^%ws0~22mr{TWxOM=hj8>Il|cv64PV~wrV#*1^T_qE0=co+U}TXW0z zN|KGy?Lv$}`3~#fO8~?`3Yuqy`CC``pwzO>9mK`H5Hf!wH~vlpXMmJR`SH;U>{Hyq z@o^lSt@~5Dzz1;AewT$$(XRvFs>SUag^2)XkuFLQy##LH0wTXRQGc=)JTHRzhwng5 
zXQ`U7+c=cN&WY%W+z&PFeI}aoJ?k0T{(89mQXlkgvt#d?b_e#WdrrT`%mR%qw-9j= z3_ax^048vSoxAnGQM_O}@Pa4M;z`}7c*g44neds2tD)hAoc>|rzSmRG6YjtyUJ-2y zA3U&wW`RG3`E>5P&iU71NZ`6<=<0Ak^lDGsF6aD-WE)VWs{6{#TSUNy+@WEa*x&d7 zqz@k$9xSsH^%nGuCwD*h$P%56ZHRG)F58FIL?Z5(cjvBgHr|Rbb)oP=zIAb-FfLE> zB!JiT>JcvsQe0U6Wz2=;?ocMKlGe4{@P)!28hpqqB%}d^xUYFH^%|%y-=}H@7{^uh z&dL{XaSZQ)#sVN35x9s3XH3y0z(>q|zaHKrPBMJ7{caV%oB{&=poQIt>%?JX)_X15 z0xtNr2K6)a>bQns52hn6*gLrhn@5n~-0*N;B-d)iCz*=+_+;Zb4z}g{RB?0LS7NI8 z6AZrf3XHdC`uK_T*!MQj<8Vcf15J87o8%hm1Fj9gWS8E5f{fFN#;?z})i-hVL{hAb zhoSBNknt;b)0y;3dd0T=vu*#zcVHJYNA!2G{6lLP?(iM0=u?}T@D6rRLd9``J{$3Z z>*;OD6@Re>D0n&3*x;iY%l;jk{!rh$okHptbk0tTa_t zx+-fP%ld#wiCvEf4vxp{O@IO~2zf1hL-n`tLzF3Xo*u4KPsX5#PHv7@_*-J#zf^TE z!8_XFWZk8xTj(LZ<;UsmQ1qetH)E95zd+~#;{o_Kz&IhitEfISZso9*`p~`}k^VB3{tT=96T2w?7c4(hr9WGxA7rK9 z22|PTTir*rCv5J7g@)25Lwu(F3bLVADo&yGPJ;rL(}`U|H%%3T373Yz0m7>HV8So) z-IaCahc`3)pts}<#sU*^e2b*AZppceGo7jIewgo2b`vtC>|Vr>V8VE0p&EH1SqlDQ zEo9IH2)no^hZZs}l6WxT{+k3y-jnluay>HHRNcj>5vuNSY`dyX%x1(qQm@CTNB&&o zr-=jf+)_14NWzb>LW|vuSHKF5Gu7YxqFREbE#f<1+cNpU>|4jm-f3bd2Syiv&(n90(tWV6ga zvRpG+O%Ym{LI*PT(yy zD0s&vHC4%i4MDaLnrUwnwxw)lIn*#-m@k>{6{Lq1=$ihzbOuaNU{9FO?y$@W^HJ3- zq-K9uov0TI2#RhoyyHc4;%r*YTIQ`G^+#{lgy%!*)B97rFy}}125Ok2gQiS-Nn<$H z1TV=lAs=n;6#pXQ8xJ1;2;;dN;rhRe`~k-AG51NS*C*EZLoVKpnsoLz12PFEIj`a}PRPC0pBmjP-Ik5H~3R#6@}s2qYnQ?v*8g z3CWK}m)aEnnw}>^47~ypa7ovIhr|P63IF`98a4J_`5OdbP`W-H;K;H9cfH5yzh-+c z6r7v-5M`Gst6%5~UMD|~<>mVEm?b-{iiZ=NP{J+q(JHKpAq=Wj@xn=6Es7Jvi(-&G zR~%<^>yJIcDM8VU|5#25VPaXWpK~5yH|21-=rO$P*PA(zaE=^8Bv$FrOzIyB+WM># zEN8WVYjJyPOF^XfjEb9eVrtB2SP|wd(y_YkB&z<@%ppU(|Mf?t8wXDBtz9s78 zFdY)LJ&-ms;hXbeDB%u}5GHtq-$Tu}?!dnEmgi1JmanzPu(pYnw|?rkdBb^NL;8!x zFbX$efy9JQ_K&BawLi9hoH5Lr-!1zj-2Dudad`9icd#9 zhq1fa5y`}YPMXA8oJydz8v6K8^o^y4vvHy zF@nqu;%h-R%bm;|$~yBA6+NBNhrL1kF)~MO5U*zdO-om?*v=coISN7UMnMqfOma|x zzw-{UbwykYi{kt?BUiBdJg{ryq~-e8z?XdsHF|^i4kWiGICidHMi{U`Z0Y5o1B2ZC z!JiNnprCBcU!2cq3}xRR&cW$V78zQ(hZ*?qSN*;^bc&T;xE?ywV*fIn3`bYEtn*+F$HnSMKVj%md*0k)>Twj 
zm2m!q1ifdOw-yyqF$-}jr^-Kb3a`Q#MmCgkEMS0KHQb^9Nn{<^0>LyVyak>n@Qng| zTEq9SnS(SA@sKLsWz(u_XO{X(s!nGoZCy zY^k7q55<=J37V}Gj@07WOcSm1kkC(Sp%T$g|1zSPJXUH*ks8i{dYYb45KGPW$#|un znbL*{A4HEUfR`|Q8|w+*hIkV`XVdqP{Rks412y?}`I5pu!K+%2uPO(k^gSF7fft@5 zF>u}cHYf7nRM6R2ml!5DVMZ#bzb<*Q!*>P^go zvy_>%FO#M)X-*_*iX^=SNy)=xkE2kWKl_JEfYk%63q4@z%y-OX3yq`MLh8+Xsd&l* z3Qda`0NJd=hfNYTwFqiy+IluV2wHLnn@EHt35!l<0M`iKM{=I8&a>23I3}pWLZq`1eN+>1~12d{A=McoSSapydMt2{oOSZybmzrYU&)AM*#2r z+uXT3oQ>Vt9!@y?YtfzqdGPPlaDG>N573LGeH%%jc;BZrY)^!S^!hKf)7ES08?kTY z`$%DZOK=Y8v8G2uN1PAJa;yNMxE`8t0|v9RNfUZJHxBmV!89#63%2E1KU~k40Aj>D z4GjoeGeAULz#&G2;rRusRkzryto1?M-l^X+6cz^9e3E(k1{AT96iKo5DwYRUV0?>f zzpVU1|5{)~lJQI0exko;=Ab-H%Qefp5asb6Zy*b%ErKlH&4)nL@ev7A@O2pwc|`ac zBUssn4&K!ejjxxnF$w3(polQOewTh2_=>)Cu;GO`)o;#5+KnrB>Ef0xSd9KFm+Re5 z)q)r@aM4|rPh&g_qjENVfEJ)0NyU0z!M94zdY;5i4sIMla#hbP z*7K-j5!Pg@XJ1r3A4tXtk$TQo^++n#GdQZA(USb)a6QMep5Ce+axYsw!py>W^Gn79 z7^Y@>l2tvDiuJINNP9lKn(f(((KMzXAJqR<~bC#+{Qn5WJMAdVwBzJ}Dd20ab z`5M;S<`xpR*yEE6xL3#>r65))78iF#IHI8wM{3XM@G>j_5FHz3jx&BPwXN%TtIMJb zrRamj?nPdfBWN`KjLBcV*jVES>uVXmJ_l_}90}FP$7q!=l9O?_^|hW~-{4Ch1AJo} z`dlgBc!_&s8;$qWqnir+RDIor2a$w{g7-FJJM9Yr*&Z^%Y{X{cShKO4jqS*1EiO8K zqgEfX&)V>1T9kmDqB@s?qc2m0MhF4t@Djyf)`@el1bz|QX*WVEss8~pM8ljU+}X9- zm)3A??^AH^B3i_mh1MaK?(aQ@HLOXK9p~`+h^ICo+kTFz3lhyQ9T8F96G#?8Hc4c8 z3ageF-;bheBE@*UC&hFq-r5{-JVZ8+l_rTS)Nl?+stO2Hd@j14ezg%{K3os0u*CR8 zQ;a_a_P&tSBULyX)t#wUHT_skFI1!7+CRLOGg>D_-rvA`WIj}jxqV6{Q_$h!cdR~V zWnU7+3%|1TEm{zULCA6-p#l1Op8J7TDl$I6|dSxK*B5 zfE`Z&c*=0%B%=&viT>AwsWUzf4Ug*(@GM3T9xgm@q#PU`j>O2S>(KTXeSQC)Jx0N$ za_){7i^JVEw0v&h8f$P!s`|aaF^#5iObKRC+TO3-;5|5Rxe?16-@u|^GAu&hazh)P zrsclkOJ@LM`bZG-tY@Cww|rY+L;=&tT8H0CGt zBm`{e3HHr8PaTk~$Goo&NIr@;Pbl*PQX035_5fB-V7Ht?(;q?Cng=8?9yA!D6y<)2 z3)PSok~WOtcp3~0P0Y^m;v!wToh$n#6JJ~0fp@gMdx7S+ab^pZaO+0~-zY7JQE*jK zdp{ZP$7oG2`qCJ{Y`d5T9-vOua=-R}GHQrB(f__@XjyWi4~`XKlmR=p_Ir>|v=91d z!0SeLnuiRZ6J&9r%j1UT*-4yH#GJgaFi|fjN918tI7UB@;KR5ksVMFME$OIcNJDm< zsDrr)R&hsr0&roEQ-@N!e?G>b-9S*C@6(uL6}Tw>oUmZz39TgOf)NO}noPX@5%Z&k 
zSDx-0b2j_MpA>zt&vzwH`{BqRZ0*`lH09iwbAIRB&iN&|_kMNIbAC7gz;k{#0qOR0 zeipy5pZ7~K&-*2wNvOEc4s79}KZPK4h!(g$(P~ZvUrpk%S(q%p8vUghA0jT2UfQqYrPz&pUH(InJ%@xImztroq7uwSTM~_0pC+e*QwnsZi5}Ge|!cz z2M69#xX?uUMM%$u=ROPotLO(KtX=Gh7>=R(nYpJxqc3;h4H~J=`!a{)E3jk>8f6nr z#L0hAmFQ!`cR2RJx3z;nJ*{;l?M;}hMs^fJEhvgnTh)$&xVVQ!??CL8@oERU$<~5- z`S^r2>U%Q$*lodtGZ_?&-zdjCk@8GVl|h`wy@SxnC_KxwpW-;g{<0*98jH@RomiQ1 zJia?Bry|k+h9@Ao0l-5I(Ab8uR^dT7_7aXz{CnAk6P$u*{nsgUNIHr!6iS_!&4BHTj2OoK1&9-B&$!t)%H?SP?dy>#93e< z!~&%SfhFY(StmCIunK@s%n)$>)6kUyHUODyWmMKmo`MbzVA7E!|xb^tGh zLNQBJvqDV5OeiKFug=sZOiARniH~=vViDOm6ATAQ#CJFVIhiF8Dl_Y3m6g#j)PnKQ_6aNk`mnLLny;8mRooV|`Bsl5PFYXj5{X6-dH3ETHRXssoQB{|7TM>QsBhg5;_ zqJXi2@S0$-F=j0Q;aUih)icsq*)AI#X<;}qaICfd%UN0%PobBHeUH zK2~TH9U_SDHI(f~jx?2HWDXFD1qyl_+cWHKL6R1@kOFF)>R4>R!CQS$T}Z)EkwUJi z+_6X6BHr4`S1}4t!+s?vo2S6$m_Zg+lm@_KS_GozX(+1;y*+mJ9QhgN?7~j$*vxQG*Bel$W9qYd*9g-nbFbVgL7nBg_V#z>e>bPzek%7&;R|yM1344TKK=GbPoUb@A%Sf0R$(05`vlhIM zy^2eZEsyvy1fp*NIG%v9{CrB0p{#F%eB=?G`u?-HtQy6(oA)rp(&0H$VZt-$R+5K# zQ%*o`=`aCn0ob&HD{Mxi$0DJ+gTeF(xX!vb$9@lkjSf#188zV^uOMp)k#T4`R2-#C zI=oUL{|uD(AE3kQfFO$whbUS)3HT!&-hUd->GKqLhko?wAhkX!dWIQ2AJIoZYQLgL zZzDBa;HOB9TCmQf<^g=b`lPZWCADi$k^){9bVZR`oG^%=Mrt#OE|3~EsvxzSu~!dL z8;&F0`mGp_-0gn_+>p;S&NFH#nlq6sj;^9AE+i9`ZTgugq2^U2JM|M>;O2LeH(S?} z?9{t4>?@F>o^oMiWIa0Mpj`-s+FFdpf zOYr2zT*WfHFOUa`Kl>CSvFkJ^GmYY8`njNWO8iG7J?Ul#d(XU_L_Bvz4P_7f{z31L z0l{G&;rM!YE}&5Ds(z}W#xH=j@yr_>E-5Cw&X1Mpe1fhZ5`!fHn82sY!} z6YBpjJTSa=>YGm`j+7H6ETHuHz@sNL9xHf1FLx2C2b zq=BhYhY$YVPqaI+5l=PfZg>mIecN1<`c7a2CMCj047IS(!%-4jy-4fi)5g~W>)bV1 z=(I0{et^$-<5KhjO2uSLGm9KMxIX z2T$;X=D<3F{e%u&?Fr1m#^>lH@iyC&<32DJP+f}E{aCDZF2zCT(Mizcz>GxnChm2k z6PZ;sS4flOrnLpu?i9&V+D4|J0BDc((}l?2!g{ai7D?U%$eNMnj(Y_g)zkqv8(~%s zkdgi<2@3ReHvj=~P^bbmcEfpR)8Co2VNHtd?4`_m1dy)FoK0dT7aUIKWpZ4JqV^K~ zeaysiU8!EutTntQf?Dg9WSG9 zrF72I^08BY6yqc^1XbRz!~oQf=Be-AA1uanhdF$V%1o|TC(Y@AW%+Rk!4#i`e&-Bun@oj}hW6@xF$u_d4eVF} zb0N5-plw?{Gr&*`e3G^~{Nf;f7cUXFna}f!lC=z2XiCHhGNu;F)g=lCH4_B~reR|< 
z3zEgjk2^uWHAy<^pFkj{UNxPf{rbs}8#pR;Zl=PL8un~u@Rkzr#TTPrpGR_)wlTQ{ zHR(aZ0K>*ftTvS8Yv3bq1RnZg@Cr&c^yp=c2ii3@Gbtv=$tOZW8OBk4W3TY2Hm(|J zy{|SFhTqp3cbe~*`;2>R;on7BP>@ct#gBU~=EhpE&;7E$wqBi9Ptf#V7-u>|Tt4@MS*L+-;F#sgtpI_k_3p=lhH^ic{RKKB>sM;wGYm&it1F?@ zPt9BJ#}X=az$QG5CopM<91-ku625_bg-me*nkf1K|2u`v0LT9a0+Lo8Hre46nBIz0=S28(Q-Rl;Ayo@rDPw0%W1iI2B+LkHYYjTuu-dsmrm(no zapi(@;p0L;-+?ObW`jb#C%OaQ5%^b09iTV1VIKoFYYpp&`?neL9bR5j^{p2{sXuaa znG4>M=IoWf5x=eY?ZRmCf2akW%&di;BFVJB#CE{P>E5@r)*apP@PtPF3zFsM-Q5sy zQ@_I2wBRAs;}%xD+IUomrXHX=+{<2hS^a5yA<%7f@7>B{)%fs)dcR#1?B)*Dvl3=p zk1>8L?`nu^(ZB~ny)R*#Fo49NtfCWJc}N&jq!#M;Iq70J}jeC4s4YUV0S@e5JYyB8jP{#xRp3rYqbK>V41EiZqsg83@A^r7v zq?V7KPvc?SEE+w=13#??x*H!`<7N5uY3k3zEme5XB+ohztfCv*X3$lT+K&i)2>Lh+ zWr0%kb9MtOZwbe6!oPN>I;e_QM0uNksoH=7U_d~}8p?;q+B`3eJ z5E8K%;bwPI!}p2KhHd}~YmKAY1h#mBaSPIzusIu#Q$V+0f{0nn92EGpz{3Iw)^!Q` zhd+blx%8B6fC8&ZVE3Wukg#)9EZWn0LiEd!t4}6tfz2 z4a&rlSt?oQKx#)}<925w?-^LZFH~?DEBJ)+c5orng$m=eb@2vc-Q2q7DjA$+V?ztW zH!(bCi+{TfuNX{bf1A85bbTTyXx*p;3aj95+hEV4YpMG}J&sCmX*G_5fwb`^iw)-8 z4^&0^BM9Ny0)5pnRC1Fd!Y7!t(11s<$Hx`rR}ea2AaD){V9tdcB40ti7n)x107sMPz<>UmP|HcBR^x*c$mkC*PKNCdgO(F9A96G z;~*nhcPu9Y7QRf=mPPtdh+(*#;UnAmUt9h>Gk*=R{1uc)`gO;JNy0Y>f$uYLIs$>; zH(5!rUV19ndtjL;(1n1zn*37S;Ky(SILbY=`G12(uPE9o+%X zvo;7U`wiuRsUYx;+yw!506M6fh;}v}-;MM+yN@iXWXk#25&l=SwaT!+(w%n7YpsJ~ll{VZNPs{dQgrG+@1?N1?tna47MeX1)_ zOR#4>9}rZ|Ntnhrk;Aq)7p6;<6jr+PEfjyZ5C)v-C8_0U!y5ST{b$1`r!IUSv&frB z3c2+4NGw!b)G&FV~HiTGeNM=?K7ll5Aq>R3-Zls+72cX zIZP=MSzg@6^vw&B+>le;^Y)XE`0p)fJ6eX>Ud#=~%N(bWpE#XB27U|vKrkPE-d#)G z1;FNuYatbnS+?~H%#)kjw=B|Dh39i?{?+zwGpBUp5|*jx`Sl(+7J)IlRpuxFeXx3{ zsj~Bnp8i=lxj&aJ3T54acMx_B9{RmBcBO(<$uTc*_BY{r7>weRgb1U^hS{NlgZu&a zm*{0s;5t`+o!&9gaC3lI{EzWr|3BwZTUW%zm_{3_6j+#J-SEMzWE*^TIeJw$@Bzxv za<-V3P^1@Huo-_r6yaom&7w~HFZ|T{!gBFN2d^*lN9ywgyK^(Sw-%g@wf8ztYMYTD z$sxDcV3U3$KIl9Mi0EN-9K}Y|qbtIGcqE<&^yx3%@(DE8VX0hrTA?{upR(7dujDZbK4?$FGi?L@-yIH~okAJq|@zE9u z7d$>~!{hR;5-xb0AHgF8G2&56#f51%rIH@Rqje`7h3Os}792W(4f@YIzK6aBt5>L6 
zJgEr9W(N4J{`7(Gm8RCH5gw)FD@loD?QdHrOXh;qm1idCAYdfNeVA#Oc#DFW$wg2& zA*{t6x2`DlE8_(4P8{U{boiUw4p{R}nFLO_O-64$TIS$*bU$a!xv;bh?6TY?(r8$Q z;YAuoJ~YzUyEtb&>@Bg@!2$wf()qVd7(QBS2hk@Q04$9RI{F^OAK1IO>)Pe3>xszUZ1#p!|jEU+Cd z3VD)I(?)k{OHpp??EHVA0A93k%ltN$9cn( z`kmI$ny791q9`;B3+D}lBlo~8xb=F~cNZa=kILI`=4Sv_-f&^|wkNJbuSBTf2f4Ug zchHFNeN4mp-gvY&l((oFFhAz7)P3vp*S`P0qu_bzf!y~YGo$UIB)T9tY%48L^$(Gt z{h#_?$Rfh{;j-Z|yi?LyjdIXmLVwGIKyhX&G7WsAHr&KW`}`RCTMd%JD){R#?d^k9 z@~zge36_sogva5s#v+g&_SL9dg9?pH)lHt}&XbA)WQkhRV>n0Pe;?XSn)sy`r=0;y`C`aL0w`KM@By+s|Bj2&I#p zV7!Y2^oF-dM^S6h0KgGNV|(?*-*rJ_6A_4eDX_4GRdDcVk{3*NL7)k2f{+#TZoedH zfdVx}Ua-0Uy%pWYG@HW&rVDcdeL)@90{Sw5#-dQ>OG00JNMCKfJE=uK9Tm5S__;7l zUwic)2$e0~qTF?}67=h_I@9l>SQ5%hpNR&0zy%eltkY|7kTI6L2vwN8<{|;e>n?CV zi@ZE|CwY}w<;Z?5{DKjk|M&NQ82CR7{2vDX4+H<-V!&NBtF*GBY+UK|6vvd(x{9eK zb-vn)s_8?9l-AUEtI8Zx{dKID^2Yzp4AuWE_;pf4xAAVnEA}xM5 z;dcdo!|>~Y-}8eIhVrA^SW;9nwYmx|_D}V>s{ECeuBvLEOE6;g_`vEK-$0}(D;-!_ zF=b%Q)xPrTszK+brw^>F@OcN+lun&dI^A10&{thMwR~Vj)znIVnRj5pIBykl)p`e3 zpfJDp+;WS6(p>H_;|ebsQ{ZtGj2&Be$vD^O0=K7dl&jR|I{iOY+37>FvoiX-YAU^@ zbzWDkx2C$*=PGqg@lSX8s$Iobk0Y?>x*Xm)HQuQ{Z<(v2%IEM^yYYXa6;$J^Eg3Mp zqON3GWp$}9C6zg+cwIo1w?9hqxuyeC(@HDryyv=HDI=@>m1Qnpc}1PeTU%RQ>ze7U zs{>+~rDA4HZS^c~*?BIPdz!1Rx)WekE^k$NY1LHkOmCGB;9lkT*7@v^raBx)hMKAA zX{sB>UOjU(+Y#=F|586>Wu|p%Zs&fmpo?@!S+##kr5AlNO1*T^FH@?kEC1KKbVw{gOR6`|$dO&_%k1I7g>Qpc>H^dTXn^m9A<2s;NFWTyjkD1O%lzB@zl+3@Mny+LEIQ;C=+A=f$z}eNcGwQ(hy#q}?I<{h_ z?RAuQ7AClifm7>hrw-)wHt-_Mb(eUj7tZn3mYzGcrp7S~Jh!@b$Pi3EwcgUo%Ic{l z6;*ZKS{w5vfq5#x1T98u5*@=F<7z9wvcZlqEun?v?2>kNg|FP@nC`9e)?ym>RhN|3 z)|Osv!d+2kl?k5lKRyMaL5kSYvIZXxwH+yckIRqC2n0U{ev z2f@T8SaDTjCgPWgo~TYKt(xwIq~e`AV~DGyq{@{tyS!p*xeIgKnb-JJUBmNaq8l)r z1pu4T(N=-xbIc&jK^bY7U~Bz#6|*20VP5r>)=u~O3YkRWv&+1`(u&F)cB32PRAN*C z5}%$4BC0I~)u=%o8=uL+Z5A>p{j%E98fze7O0Tw$t?Jqmck$>k8Ku5#h$s-w3dd%a z@Hb{`N#=CFuP(=Tb&agu}2FSB)$`mvZ=qLnMeiR>40E-y~d`uZz_+vyc z`XI2TlZ^3C+8>5?kQqu6C$vb?Ro@|&6-(zGVgUb7;MYQAPRZEtKVO#cW%m@%%uHY~ z9G+8BS&6<^-!}RqnzMf#&9*X_zra>oRH-BX9sFBp28#;+i)l+ZJX5treMh$?ny77G 
zX=!PkKL({`re&pNrwvZaNy|-7OHWVFNFS7*nVyxNojy1{Cp|YKEh9Z6BV$lTW=2*< zcE;e0oQ&K-X@k-SWegfLD05KOpzJ||2jvXP%}mQo&&y*i+jxgg=N>goRs(Sa$t@HXy&aJEV*G~18jKxHXwJ9_m>RCN) z8lqLoBbM)qDyk}GmR5>{H>SGE>liC~6vv3l>M4%%l~5+)oB2MfVye$E+7FS|>nNCA zTH$jPUwzS(D`;(*2KjmD7 zMu&H1jqmDnXL@H=*Ir$s9J8u0QRtMRpi+p=fQhfQUdk>d4hW!i<)t-VrP6Ug8B&AH z5X)_^C8bqmOot4P5(ulMl_Eoz)|Qk(g77-*m^ERs2bsur%&PFt28t)V@k2*Geku4R z;kWPgA3C<;w;sR6_}ztHJ$~i*72}tKp9{Z4{B~{rp<@evE%-f*-y;0#@vFgaGJeJQ z<>TkTuLfnaM++ze19L+_6Qb9_v&=ik<(B{RTuj5iBU|rKzu@;(R1UPN!9sSZ%j2z@ z4pCeRr#R&j$sE!;guE(>?g*=Ml~t69wu!40s3yu>Q?VNGc`t($KhjD%#3jHTF?WISm|DICm zv0v=(c>kpR9iQU&41V9>7k~2p4j+DB;&+j2e@EG=`#Z*+vcKam_+`Soho!rXR-Y zFH8~;xMU$~TlAViz?u@sgo9cOXfP#j?JR1^R3z!NqR!0n-)(2kpu=b=;R`o5(jFJo zrJb6N{t^k44MtP0yAI3G|CN^I9+tW&wq0yf1(dnruC&@?Ys-JLhcYq`QxEaK4daR4p`;6rW1C6XKS& z6SM*nuCxO7tn!Zfu|mVfST!`D$j#~brvRbqNH>{#jPeLIbcP`s1c}d9+OzaQ=G3HB zSC)BeWo4-H4;~ARa;3LW>D1U1z&N%fhi?}`peoERQ#5XStSi z<*vl4&$e(vT#V8MOIieGdrGgyW~8jdmzBY_nhf`3$x zfkpcWuG_I_AAA0fts5;YN9k__89&mR^w{QP63}#8d#r$PgAjlQnZcb}WcL4m6#pRC zf4>V32iK^`hHjhA$igk3hr%y3B2myDDT>1cZLb{f|2BO?OCw|#Aqx_)n*2$VK!L3R zP-Gk~oeJ&_g0)B^qVNY;(v(KY4$UAtIMKZiby!zSF;e`I;ir@O5h$@kWCzHGoG!=o zA(TXl$PTUo8XW^m#v$+%1cHlt_aA7<`9=!{DPe*89%35` zVA}e*6MO|L$Rpq6A6N?{Y zmA@m?8$vM;YdU2^VeSMBGRdCF?NfDC<<+kLN8bCu$312L|99s7zWF!FbV_MZcFd?QD}0w!1>BinOwHDT*vziXtdmbZt;{DT1OX^1<4o2+DTl&-?d0 z_x+yCWHPB>_p?6VUml&l-gDmfocsUWbI<)V74svq%djGBwhOv`N(opbt2^OJv)3Rz zo4uss?-f`0tQqs?&hKa@UO1K-!c5%aSY+9u!e!*A=Ge6ut%u60DTC&)$){}AnP*O) zCp&)3Bz%xr2l-EU9eUIz*3k7{I^q`z#|yvlRgvqF9=ru+a>-ZNL9sR5II_83)6 zWlzfwAnCb1>jnzdCA-Mz={V8Zt~+<$JeJ(pmdEcE&J#5#o1ACSoGQWtrbqX}lH)kO zjvy@Icx?}r$NpXPY_$LLEi#uK!m=gMl-Iy;MWtv)Y5YW45n)R)J{3{Kvb&6=NTWYz zs;jD}hEJV3&opSm%zv{gCd!V5|3t%%EfU3VqM{`a8QSv(5#4Sz$lDZk%rW63{{8#R z;`(6OxpSt@p3Cmu*^ybY5iq(zU9{qmp*cPGQ~n_Q>_c;Um||JSk^TAqs|{gTZcld0 znx<;>!Vgg|4;#{xo!IIJsF#Ng>B%aZrzn0{dYSzvJ=DwSH)A7s>s|jD`zh{3Mr?=b z5j|2B*zNyObrw}$UCmsKyrDfx;t#z4<^G@Tf4Tp6`(KaNr9JDsY%Z3zOcZBg(=pjA 
zbC=3oV6)GeQ|Fggs&ccm@_qM%cykJ7q~1Jk;!KQqv7j)^+!I8z<`{FyjgNB`<8%15h9PS=m}

nHejM>S0<1F^|&zmt{jzs-5dXlRQE31-CAXt&CG|uNx8cXbm zWDR)XmH>|V&e(G6b z=yuNMG`CoICoQe>JF{ELi(-(R2AhSwJVoqs$!i!BSq`BM3Qv^NX0s~h9Msvh;rcDu zD-m5aG}BFXUYWKfn-Yw%?xld)DCgs3Cy6Y-itwpn*CKTHUVXRn>}WWysAx(kOIaO` zOWVoibI*~}kY;oLfP=kb2qVtm{Kv88got}G1nag+#Z9^0nMp->s5i4V=P#$Y^KljL zqL441eLQbQg^D^)!fK#7zZcuKVP+Sx?HZMH4MEMHBPSd?cG#S1j;_SGTw64r&85gz zMb3|wpF2e=(@YY}Vq9)@%Sa#38KdG1kS%&R0%SrqPmW`#%BdA+vBECL3|T9m&17l% z6xUPWMHJ22)N(Tpdi`$dv}qhf43|xR=_` z%^56{y4Zg!D%zZCJ~R20@X6&9=99*!d0v~dg--*Y)qHCCRPib0Q^+TWPc|RLXWOsa zoJKwkeCqks@u}rg%_omfJ@R6)#-0u_zEsyIcBFXuIF`%4hq1*IMvW|Y zO)vW9gzqw9Z;|Nx$nU;jVk=OGoR*k@+^C23d*|SA)5VHdZWwox_}QaJdhpPnaB}5Q z-nNukXBJI5iea*1>g?%L&MunhE>Gl7oK{{j@8r=_j`Tbb#rbmzDk9U$k1U%$t(={* zMH4x7A%DZg{aKJ7nZUM(f>SuIadPDm?hd>nS)86K|Fep4!jyt3iIpeq?3=$qcy(1hyH--yGK5gBn7;SDjO# zOfmFj3SG{jnzC~-?qy;ZVT10R%0Y4s$vH4{^!^6|4uz!5yzlaz6}rt}t&wJSnIMPn$Wbyo`9=9=bm% znc}8n%Kwf-F*-4lM#VwB9;%@4)(_*>XQKA{;8^_1tu?MDh_5WST%!B#J+1uTI`6{-Qr?|SV%8Kxq=0Si^?E^EiF z2U1#9l~XwNc361Gf3)d$pG11Vu%6U!`P{i@&5uZiMtlB~aI`&pdIB#JZm;R#`P^K@ z@WB56aM7u9)f39Z_%CrEO`BU@F6TMrZ&u|{*)8I_LZ+PUt?r@<=5brg^ol`Twe??Y zKSPK2bcF%qW00izT(8j&@>ab6i;2%2+LIj@XG~|JJ3a356s!T%y6Bq`>R55%bbP_J z$ds2|C$lQNJtw%sppiGEw4<}R_GiW*Jo57~g=E!|ofvW7{(G6|iT!Uz2%}!4&B-4oqM6}T&{@POJR;md z`ES?Dkw^AK@0ct_)2vV_a&sb^;6xh@IQU@2(gW>ax|>@WCuEoqqx7(2|GQ(tkep#X zX?M}XxZGytEfG%fZjLlr`fuGkbwm&6+T5=a^=8KJDi*h;HHV5 zp!?n(2@W@FJiB(m|G#$sh!i7NVtap0$FX1f{KQZ6t>DpIn~}I)XXxH4VRUIvR&BBM z+>z0{>Vc&ncE<7rqnOF6tUfWdtT5ug^S%oEaVdUi@P7eGck=}k&n8Z zCCgRFb*Ek70c91HNyCchiP6jQyp=#%JFj3ddp4WdS-DeT_ineSpUm6(;Dul<3^R?5 z31zYy-lWuhGi8uW(Mn1@UEsFIu(zb>8ig=2v z{sgwvS`wb!0F;Tr{$8=|Ar? 
zhI32NTYT=^M;0b~A-X93%U( zk3O3H-{#iL@>#R-*^Qtb9es410QO?wYX63={%vjsO(=orl5mAwAH{~ho=;T1Pq}xB zRk_(^l=7~gGCH>70wS}XBCSDslG|Us3vQ!bxwA^|7?)<1RmqK+IpSTH=Mi2Fxi=pC zw}+?i)owe=#9aDiG>O?G=QYxf*U`S13_G{)oJ3YBZPTqsp-7kybY0Lnch0n#^XAT( zb)HNs#OKUtJALYG(_71o0vyC7DjI0ITg&lw!w=46FvawJ+O<)LXc10|$!@mkWhY0K zyj)i1mB92WSs0qpQ6R}G2U@548FEClexhSag34_^uDg1gJ@wo(rWbUIlkHyL%a7z5 z`GZHY#i`si4hnHwinZD?%q5A{%V&4#>|87+iH4)Q@U;5T^EuEm$17Jj5=pz{r$Q3> z6G?`@R4&Lj#v9vM!j;?=ZS1!a*Va9yV)_u*#5m*Ja47?XG27x^b}ku&(UrtTaIc&% zRqE#)UUavYu^-T750`c?;QQEBUcGfIoR-Vz#JRV)5Tj%Oju|U*b9M=Li_fYUc{T&K zxSx3*HGdWxbUXjaq;}}x-L{d0_S>0PzPbEyDd}92-^?%5=8o}uh}VA-g~SXDR?pG#shg~&*5O}P0PGOSaGUFM-v{~sFF&t3V8%<(VTJuM~?hK5YLI$DxxxEL66rK3A{ zVKI5;+_JD@q3{e>wI9zKw7XGJ4B)sD=Z6|NN9mrgHS?2s)7c134BUPeC(+v}D;tJc zgMh{SeK$Kg&NIoyfrC3mG&nMA+F2+Z*G!wpe9D`Lc`Cx>qZ3E$cKqlTOk?8H`{d^- zm^e#slp-=kS5Dtm?pk=9M7K}RJ6D;d+%u+g964IL)5|($7tvhDaLn1AZ$LPS{XE0w z&zU=q3yxU1!(Uy*Ub!QunERf%RW<(iL5ijkcnHG;H+IkChR%wjY2x{rR!~rI#>th# zW#Pwce&d1`b}8d-`Gn{ybDu+j=&YD*=p6}*LifD>Vdm1t%E@!)JehK4GuGfEh z%r5?^A4ma2PgS!+82hBC4AaszRf@`FaYc#Y>0UGCj?iwnXt{kKd%%@sj5Jjyzb;~S zR?nG!&Xnk>#VJ!O%|-w0H`>Sg>Dt}-@m`!np>tcDMxIMAYjJjh)nKTPQVrL)IE%8C z+5)cFn{YR@a7QF_5O5>-3b-Mx)casR4vv2XR)Q%vwmAI{R4NBt0iFi7{zR$%buG?h z+VEsBJY1S@jF0GHz*c223Wsw-Wyu=t6Kazf4nldyA7dOQ{-g4Y&^M|0|{D z-PPihGkuNJw>X=?Irk!;-!xnVZU(Of7qCitJGcpa2ps-vrJe`ZfS-Z=`2mb;?`v_E zfM>03aa0B0?*|#E?)Wou<}3C4b(9mA!>)Uf@~h&G*oVmH0;Mi|xW#Dz*Q{@Gwu7(y zrNznqEq5F|(&DVGR%)NWwm1VXWb6hPfwf@6LViTLp~YEw5$OhN7gKMbeKEd=M=7s^ z`2INgI!>v&4J}SH`Dz6F%fR_Wi?ajF1GkfodNA}GrJBJ-#GChIi?i-`@Pow?-&3>? 
zM>xB+(0qZ`Zoc199;Bc@3ECd_D$zT&00h_@^pxRD*0z=>$Fbp<; zIpAh658MWpfURID7@}Mv;1;kB+yOQUopMqiQ?FnSSO}JaWne8>4K{!)!DetB82*HK zz!Gqklqc8(=6y;#+Y3H042GLqoJn9kSS#N@r+y0OoR zya%f6R%iH+kheF#X9b4)!4Kwu>%e+&o4o%~tCJlj{Qlf)3^pFXy~tqBfykpBl=g3R zHi4m^@cUP^htL4-P?mN9uBAQX9Ks#Q{fU2It1}Ud4CXhqz$Sh>YYW(LEO#sa7`cu=x`5 z{ZsOPHTOQ#pKHNtun}AZHi7HFW^fZ&vI02+2oLrFBj5nA7R&?dz=>eqHLcDfumM~P zHh~+3el6(-Yr)XL(81wg&ULNMB(MZ52kXFEupV3mHh}BEMsO3@1U7-q;0{o&&#~tOpyx25=|X2xcBiykHn?28V;{ zdg=`ffs?^77y)y@MPMGd0xSX7fTdsq7y&nf)!;U;7HkFUz!3FY50+A&P2d7BbOUmv zp24+X1gsWY?MBK4tOK`!4Ry!?n{Oh0gTX(wI!nO%TZkVlxt;pUAsu(n{sr$O--36; z4~Fg`e?yRSFXaVB)>02(_yOtFe^Fccl9b`TaZM1=oOTD{=*!XfL#X^&WDMLf$s`z{m%* z2eAG_+8G%7nDatS#;;qQGO(eQbb-pD{f!`> ziZeuDo}YWN!B7Tgj*dZYCihZC17|M=@C4f{DV11 zlh1tOJ*LfB0ER|$7x);`2W|r+U@KS+hQ^Xkus_%Y<`u$Uz1DNnHaHO`q)&$ZwNunyb;)`Q!@22kZ82kZkjfdfGGI`6>}aE;U> z*d+D%2I-;P>%mQwcjH#tQ33gVv(1?VmcGroIItdU0M-8>NAMl;Lp{`i6Ty=As7Fx! zopejRfD6FT``r5uHvfaOc3|TNZO%ICCFdjZ4K{)+sh{fY$OY>@rXI%e{gXDQ9IOYc z!P-x$2f=3I9Z&i`qaA|v;0mzzbMBZ2bH0GT7&%}e82%^i6jc8r9bhfEP2RWCPmZS? zz)4_z8~qIox6@w15{LGD0`aJJCl71@7l2K^c4rM3vf7=kV71-uglSiily)Zq)&<&~ z+!FGY+3wVVs&~7y1FYM<-C08W3GLbLYzCXM+nxTjqwwDCP8k>h)hW=yK48f{?al}= z(y!gA0>k^YI~%};AGJH7Q<1lSyE7SVKA_!ME!dxUz`UQ3-k%ZwLG4ZrSb7NQ1ZxMj zI|C*m=P>xdhC#>!Lpj7tJB|!(cP4_hha(TH%WZekP9xk=$OF|+Ne37j(e5;W%_B)C z?Ywa;`KF!M7m_cqx~Sc;e+mCM(hoL|Z+9xeP)WP95v-ov?({ny`ZV~!&~)MhOU`U} zwh7LHZwhiF?M^Kin%C|$f+ZE?yOjK#OL>8H=e0Ylz^31{JMV$D3)`LisqigocNT%6 z#pDkxxtR3O|00*RJL|#l(srj6tX|&k%$!DhxrTIu&DXa(o9LH0w^D!8$@iU<9~il( z-Dv`I?rnF5pGiLNZ+EJ}#&x7i@Nw#K26CSyKVWDh@qpD&6P|uux~bhM{}p`Cl3uX! 
zIqCzk1oR?@fU=x@(7y8TX&Pq^iA%79#d7bZI>DG2;KWtxmC*J6WS?Qm*zMdD)_#eMxCE^3 z<2Wn9@E(p+4>p1ugubWaYynI9I?i@^pY1qG@&on(Yxi=T0bt(Vjxz#m21~%YeH^C@ zEa~SsRbXge$Eg7u!8)*ZKgU@MMtcJ}=7M>-@PX=R$5{b3 zf$D7dz&>E!PYDOsfs=&}RtbHC<180CxDHguIL=nE39PE({YcVx4*3Tsf~DY0Ff__> zR)f`GBUqR3IE91{7dTG#xwH##AeaZ{fhFKXuoRpLM!*GNHCPMQf~&v=a2?nPZUURY zCa@XY0je>MlXf2QgZ;oTm;>g3`CuM62`mB2!D?^`*a)r$!(*u@FamA`Yr!306FA^E z!~>21RU!2chQKl~3|4_TU=5fD)`2D9TCfgm1RKHapeiDr3kVPP2TQ>`FmwXx{4L)n zzz;^iwO}>45v&Ebg7siC*Z|tULoS#NHh}{{6uy%OhQWzo4mcC6ok%@Y^S+dPU4Z7j|<9o-bu<=74q$qnVMGX!?y;# zAD`Um_&|2!As8AH%pTV(a86paI&SZy4;j|~0AUsWd_JY;;41*5e?p(kXD0MK;rISR zAYD}D;j__kv?V-`fEG=Ugj>m{Fo9MNZ6dVZp!|jTY~Uk4pkh#s35II?CkMkbGlHRl zVA{w4^#XYtpBlm(D`EJP{I&8~2kk5&czlJW!IG1MITNGYlDGbphr+k?KwP-IvS8uZ zU`bJM(%4|>*x<~u!N}NP)!1OQZ)|XhuP|8an-nL-O+z_ys*n?(?`mk33AE+VA_=tB z(8?2N>!Hm|pfy4(OQ5|6trS{N($Ali$4+RI6KI)!e259OFtmvYwBgW75@?0ciW6v) zp%uo_Fb!1%+7A4hs6nsH#|Fa_g9Gw|Ir+id{9xXgU}1i+BtJMQKUkU{oS7et!e<{|1-Fyqq=7}@A~s8hG(QtDf=0W1j>D>e^fBL#u^z6FSf@8n|*#K zBN!eT%pMgCjS8j}1w`&-KEp4*f6=uG|`belUBn z$X#grf0q%=9%%#?2axSPy1H;69aqY?1YWltRAdCx#s((P8YJ#w-sg7nUh-7Vd+Dz= z!e0;!E%p<^LMuNQp5aD1GMH8nDDQY4hwc)-Rq%;#yDQ&W9bt>&_(s832H#A|F8fvV zo0Q=nF}x_l%l$>c>}A%dV0fur5FAjWM+S2ir;H5dE=;BK`D-(z6Veq25Q5Tni?%pW z37wE5JJU&^=v>h`;VmuBFC?C0dKAxf$Sdk1Z>%d%bWR;>AZ2ti`?GSTNX6w+_g=b3 zNxDg;m+mpaI-h^GoA87Ljt+!LpXi+2g{%$nY(I)T7#5~3PN~sL?Pb>GexWZX%zE;G znWpSN5vEi9AYEDl^)ogQEA!Emv+y>;t5|c;lK(9>eR)3E}o)*RZc}D=@w#hJ~@Zid~Ehf-lsrMiV_GIw$`@)^})w_V8$ah$FtQ zm~vxyU6?vHSnIbkx=*$WBeM>fi-_|=i8EE=to6tD7tu+%8LTg|?j-uu?JKl3qsvb3 zXyc<3bdvD3B6|mXmq~i>n`O%1)9a>8LtiB>@RT=zPX*XSm_7n65i-mf?B<3 z_pVlzaM@|tORCj*M#7yH7tY`A1}hAh_L)bxQh%#+8a(!7!cCR*&XyF8W6VyI@{zG) z4SnxE2{VE)Mz6;7Ayw|5(^6YTDs?-uI!u z8M$B1=&{@#@ICfhPwtcOCCKfQj&6#|y{qSPi{U#jD);=Za?5$YoORZLQl>Maa>sUz zFVf!_(WJk*<&Yn6Z6x?=DC0YY?-=+hl8nLe`Z&*DPq!(0oI(9btb|Qy%4g}r&tXbE zA55DNK>ZLrVSFC-_>-HSeeU_fU|zg_DW$CYqhHortgl(RkDBu}2DdS5LS)+L9yf z{TRZbnU;w?vQ+G-8ko%l2P<|hXqB;b?2(B$a?)ZEzThlX9!tv 
zEKFIfYwV@gGUUtnxx5$l_`@yE5j?wn<#aQC=J0g{f+q6raY@UT>0bLo-!HZ%2J8J+ z0g-z(IY#0Xc@Z9Jqj5e04*{h~OWwq2-(a^ow^7cjnhh38USZfauE3U`jClhyS++}My-oD@0j|vWvPb4( zv0)dcFjKU4CwYlQUtsK`O5|+HXmw_#BIk=TuUtoGrHs1VDq3bQ)io)F3sbEN{ewdO zMHx6j#s>WN4RX^^53ky-)p=cb)4E9m3PtKaVqIl*O+$WQ5h8XXXT@Hvj(>0DSgxF1 zjz!Jd8e<1{ij9DYz9eN-ur_5(a79XCu#W7mhE~t>x|9jQ2A((aX-xTL5H6cEB&jB> z3b7TdvuKOMTAizD9QMJa!{}qmV$9{16vgp1DHO%|{y|}@w{MWLW~WGj6b7XIt%rBf z_*Q3-)ba6d9%}q2N&EX1Q!VQzY9Kq`%%4pA6P{*xs*i7Vej_}GCGgN;K8f+jlt-4V zRsSIMHKEn{Ncw+Pl&3`EIXzfyTQg%kCLS3-HsNPlaw73aTQ>UsH_{~r6Ex}yVr`bR zAiIgZQg2IrK0#YZumO#3VRqupFHQ?K`>i`ul9p(Zq&LiGGcr%DYH|L?v;8tMVT#u{ zBcPY>PXD;J#W|F6kT{pgTx_X@qoT$xly-bVFmG0{&_98ZG2#mj7#F8NWNgYsFYKJ# z>ik6FIFC4}w}t*vqZ`7|HX%PP!rly?-TX8CNbrL-6Q)05P($V~#Q%05@&JwEFn?11 z+0gnyV;R+JAJMsLRfm0-k5bPbZ3b{N_b-O0FiwuhE`^p4jp4-n@l{pwUmi5$Hz*8- z#*qUFTjTL-H>j~S>OB7CLNj%WZ*Nv`0P42fXp97(P$S{jB3s@jScnOzc$F5=QoXRS~AB zBTPw`Fx7-vMVK)iVUF(!CZ)}EUC(vxE zhy+?Tv{vLt>q62o5ZcZJS{}3=3ABmOniFU

0o~Er7NyfmRFcy#(4SXiW*UbsuNlrx?)GT zd}s{`;U+;_pFk^zwl0At`|{Q%(3V51PoS-awkCnL9@^>zS|c=RPw{Df4_aLUZ6~yq z3A9YcgB1z1Ftp_fwBgWd6KI9dY7%IZp)HA{g(YmR2XbHc7@Jk3kKQ4UEFM_!^uV8sB=2&GqaW{PlD?NDK5SlZeu@3<`7UJgFg3w&8}It?-b2r1GJfx% zgW50svPV9=Qy5|~qcG{>osZ)KVWJ;SpKbWH=fk@hc?mwe;e^>ldbcq?*6{4wG|@4u zd;hA`gXkO?kLq}~Cm`FjOJh^Ze0Fk&MT7k$dO`NM))Rg|A7A7mwyxR!mRUkhKh%NK7GCitVq2AeIbIJy5u+`cNzrxcl` z>_J_}v)we=@r?#t`*7SHTDQ&h zPBOcjvoRA$;%_btP*XrCV@qhY@&gDy8cjv>}3E#H!J-$sn=gXo0mR{iT-PUuy znec77(BoU&bG|zGN)~y1(|XRg1-^~!IX2~z*K@uQ{Wgz1%7$;>p7Y7x|8+||zOPT~ zaXnPSmvgDd_iE4i*1@;>GLLU<&-t3+3orHfe&2Jx0sAl>FZ1~3^o*~B_E`#FsMgg_ z_V`YGX6?+Ihv$rQyGMtsH9z2A;F3E0(6Yo~{9*&b*hk0|zlEz?e~w?!xZ8}**-V_< zd4CeG?GJzM`BGv&Im~iZ9!6J|w>Z;zR`g4mx`_S__2ct<_6qZCKLBrR?M0S9SbGwW zhFKglbAmj=HxT|!34cA|Jv+kqA9MWEI`lP~sXt*NgpXW-{Vcj}aa6vj_9;>KqmjFU za5MkV>f9xAmvxbQVuxIu6C!sb;kOY!!zTP$@p5s2N_z1hN&cD%*T|muzl+>+yU4}* z@Mg@O-0;4PN!PYI1u`a_5ifU?D;FP~#9v6bbt_w)fW%)G3s>sD&kaZIN&FGQ)v;H6 zr^H_#joz1(RfKp#VH`m zSjMNY8J>RFRW}RI-ccUczf!QwE?BAyYEtqSruqwEEfm(#0e{sX({@7pDRsfGIC~@M z`+O3HLrRFbv%e-~sa|GZZcT(foG``gbH7{Uzd{&O=Ozzk-4?rHKBFrH*Nr(o;NQV_ zCfM%xf&j);%*!e6B{|xlYLMFp8;5N>%Dxb}QD4Z|8f&3FZ&0XQ(mK+tEw3Z2k93L+ zV4v=#(wMaktvn!*^xa#@ZSOWYdelCXEcsFN3K;$$br}>t#;^~wSW0TPBjLw-ov&A`@XS)I!1dn=K^Pts) zDQoQ1fW+~OUBz)vVjT2tS2s2yV{|l*gSy7i%=@O>Tb%=>pZ1Ez5%V9LeB`rb$of%Y z99Yw?eK~x8?AyCqox`R5_i<%-Hl5MwqRpOgJ3yy(vvT-0-`(mQCw!lt>a~mbb&wMO zbMYO=^Lej=Z)<(4GeyefX_s%A_;TaRzN0Nz2b1tb$17UOZCB06YQDGCnI^JskIIVW z-?P_@54Y5skVMpI<9p0K0Ke4zt2dyUHzKR-zDk)9B&wyZ6I?yPp=N2chUQ`k>v)&a+m zUzp;3q_Q4Si&9?uSgSLZ_9{C5a=f8k*Rs9+Dkc6uB#n_;5m{T2wT`n$+eFqkuWjT| z3`v~k?%KxM{>-mAzx0&&jIZpbJQz(qpYgaL^>>~KOg5pFwUUxlZFV<81&%1-RKTq-^bIHtu@W1?{)ww}< ze-sz*@p0v_pR7vVBdqy>(+8r}&LeJsmMMYw6Plb^ZGc7+&7aV6p{<7&??Wwywl0BI3TVjsoGNYD_!rGuvGJF&roK=YVl@{e%t%KDeg*|y7B_9}N-?(mR+0a4 ztMf$Xbc|=_Io8djJ4XxiK5a1h`()Q;Hjprr2-6_)NUyhVlrmdr%S4KuPq7II+EdD@ z6rT6s`L;e(DaV!YF8OrV^(}Rh`-EPus(wm_?a_cO0-yNunjaV4M4)z-Di z2R~zfio5|dnh~F|b}b!>ynFIuV~WwYM*p9{9x7`eJL#BW)^|l-74llYY<12+-^R&H 
z7*D)$9?SSw$;T68=Zn0J$m{nnY|PH-OD1n@u+e9I(d`PkyLVR3;cvaF&Do9ev)`ED z*&^7i-R+$%@vluf+K=nrml-DAne@Zu)Ln#U`yAq#8w{7>Svj7y)ih|5w^c)!KMV5c1v%&&_RNR*cNVRj%-^lt&rs?uG5Mb)Xyq4+bPPmnfa z#$(fOW$wC?{(`ooznl;-|DuU*E+I?|NCvd!#6a&;bimt>z|p8v66VNtN;lwje=U>;k!Vm=h% zle>-9U*G23D)EdZp7{KP+2Rq*hC+UdWDy%B#bcPTp<%?wdG3)sN8^iM9|}PpPM8Q` z;@8j%p_N0E<|ls=ZZfo)(4uQZVQ3L(WxJ41Gge{xZarzgjA#4u=9tWh!AT>7B~(?S z+klx@7!lcIenks9Sklr!3`WO#eyRq`_)JC+=Bv}VXN2(f-Gm{1;xv)*eG|mhlr8QD zQ{Pl6<%L|nZsXmWB=5xkQH#vbA35_Z@lHc#bZm{5_xbK-A4Fj`?Nj$t^cCCq=qa^uIjPaxsm?_Cbm1)0n=B*{@?{fBUZ1#!8U4 zCuJhzjs47c%B1uDc8Rl+Fat^B9TMhl!o<>P{0C8=u%G`!gZpLn*XxM}Qk{zNpB+gA(Y&!0$pmpd)C zz&o2~i8r1P$9)G|To*U<;P>DQ-O}c~=JCb%)3~xyxCx z{~Hp4R;6#Se=??LWNeuUU*@fC&Y8j&)5X;Fbho7#Rg6yIb(n}s2)jV$V#gJEWlWp! zbP}~C`{wif?8)g=#>|2xuFb?%d%LHX;^WHkcZ!T|uhdcW-!*N{#qt~rOAqif40jzP zN(k0K`s(13vm5qpgI@X+%=ZtrHvTHaq7U6E1i%81ubF+Yk|&gjv`q(Z-J~&$yzR)l_hk2Mu>EC$`20GZy%N10qVh{0=1vnSm)8hm+MlttWAls7 z?a#Bd6ZH1jZftGzw)ER7;u)~M%^7SH&-_mDByZ=G*ZSo14&%d>c9eGvvKrc)8$H`R zUME}a6t*T_)@74r#@TdS{>l52sFH?~5UzL{fr`v1^q8e``cyY!P%Yn4IEsCC~H`cD!5HO8UAUcq}yO3;mP=%JSrm z{eJR9mv26fK1G|HBW?1V{oZqutAu=$wl$<}_+xF()5x;#9_5uC^Mm;FWd))S3f=Q% zCDujBhG3$OwhLe#2ku@1vtN+HGR6UVZ zYIVE*BC=K^Ys52cJzW=Cp0o;?fr+dg$jaNqomzgrn6YM8YfRQj$;+$2 zPrBsJw4Kjki}tJ@Qi}v4nPerc#-** zl-W16i&ATRa#>@|I&LMhN;mi1=9d06JP&^kJU3D{_7nyaQ*XvbNIrI#T6@LPfEm#- zloraV)TC`z`e6vOV8_d``b=_uDaZdsLO;Z>CS&Us(wO#Yn{%p^`Cl2RO&X1^Om4^E zcm0JVvg^QQ>{O9Ca3uD^Yi-Vc=rsExWPVfGBeN#C&0>6|ru~!0rZ-sk5k1*$yYR-Y zh~>n}G2Ub>2#C(h!$e%f*o8>*C-%SGL09`0a~Y|tw^4wUL%g2nl$4m{Vj;V85O3a) zJqrEJopff8!y{2X+HA*J_=xOmit&*nnv#;c1go3coFMTfZ5J_nrX*PEelim2c*yk}>X2-^{VjFd}r;V>iWNt_1=J!3Bha$7v@{aksk4!Gp*qqWv zMdy@ewmHAcB%KyAjXjlke(RTMgTnr4gOvYiSvZsi6^3^KyrYU*+`2gu-h{nY5~dH0 zq?$V^sYb8vn5M_=k(V@VK+YP%G$GgS)m<9wXd0e#7cbp3NTt0_8d9Wvbd!b^q+wwH z?$c088VZp!iMs}0;n}2NLdUv9v78YtmbuFKj=74sq~|BAEbkO5zR7s2mH3ex#{ICF zIK=*N+wtg_AC?_wUJLHnr?ZtbRc`m%{`nYBJ#71Z=s=UXC$UJ%Okti*K_DJ})`nZVc`uZMH+_)FCsEyBEu(eI40VX2RGp 
zR%X4E(vtQAGq03%?vVU(4`YM{WBaFFWp+ttlixaAcP~@)Odb6MpW<%uFFqGdh)1Y&_TL+At+wcOS@Q3m^BBrJ9mriHn5v?j|4334rzBzlyvVKO?SdPIp)JM7X|c->9=?T^#r+MD2aYH>uwTP-gxZP zA??o1qVwArjg#r0n4N-t_E}%X$|ycA9MViGlSC$kS4>=CI`TyBk6lk(-(b5KJr|0_ z6K^b50!6P!C8<5dX>%^Tp! z?Dzd;N*S+0WARSa~n@j*5Q}z&M22^;!}pqf!yDFhxEf6yUH}_kF^oA4^XsHlDUZKMe6Cvt60qcoN^Y^P*j(V-2xTHs5#Z?WR4TN>%kq=u^2*m~_Uqoe>R zu>O)f+pe!_63hGpCbTAX&~IP)jq=6? z>n!V&fU-B6AOkDbs=eN|lJb8XcnK}7-WUW4(JC?Cy+`gHt&3N;Q z((e5$m9VExq`#N8JF_HR_^7(*-1u?2)PF@Xolz9MYo#<^yISeV^WgeV<#jkpj$e|jy1@fcWjHZo3yDb zkQuD)v}ZY=cpC{58q(&x$g_PpVQ6>petp(V=XHnlq~~KS?r!^&(M#Md zmSVjYN~-EQ)(TpYSwp-JNqKIOcxy3iV|VdMIV?XFKkxYN=YFI_<)b&YPi=Qbi48A5 z3PIWuZT~0YeFfC0@GXGP=ALrno2W|8m*|_2vREte!8c3N{WF)ZOCK@1#VZT!l54ta z_{d|FQfc(xZ1Q>RZpb_enNgd+FuM1b15;L82AhAc;2M^{8si_IM;P9Gc-u#{IPH@E zo$w~i<3;{V!t9*f=6n(&%+~JYe;k#+&f92h#q~4cb;{*T&IWq0L;~-mz~|;tlK7EzYO4oFzErHgWKwAlI1$WlR^VUNfephGO2561Y;^W-{ZNc50 zY1^Rlp88qf(ZW;n@feCY1Rr;s4fC81IptFZSQ~UFj5WJ*n2Vf}*2|Btb_Te`bGVZXlir z@!%cx^Z+_v?!%Jff8)$v1LODKPPiq6yV(mD-+zn5S413(xxYMbX$sFWrWEy=?WbUZq z4CQ#&3E;mUeXv62`{HB_!&d^|C+LDbdA8@lm(W*5b|qno&-7%U8kOyy&m1e4K8+Cv zga5&(Y#9JW_8RzBM`aI5E<2xzanpnC?mlayht1v#cTKMj9~t>zzsCvj`#q$6EP$`- zp;-HnboQf9uAPAmCuQ)$P_M0F*L3Xt$@6paIM&vfH;OKr2;ZKhQ{>t=!57SnUi)kO zc$K_g!0%8D6}gx4KB{YrmU5j2oC%T-ElZ{I6cOCQb7>1FPNuT^3@$A8~J;bED)0i)9Er&3X z@^Q2osApW>1+m9v$;+D@EqCUE`G{Qj_0SWIn<8f1AWxFYz|n@#Fsp2}eIIo!{=f zB=IgMOth}Jt1bR4vGqKsar}=-wT%k+{|B#^8%HVp8RYE+iQ`iE6Z(VXZ4qI%@;f7$ zdlRODFeY!^u8(2cSk_6YN%LlWoMB{cLFP2#9V~6)Q)DLCipKt*BIglfeI;H8CHOXE zf8te1W2Eg!9twYf&fqso%o(c7qj^X+2Qc-6_rhmIlF!eL|2h|$E0MXG-%XZa9CQl)h(^Y=QR*@-O?;?YrPjDEDmSM@s2iC0%WX5btVum-`p$ zglG8^T3spQ9kh7cU@f#9XnlDve-dtEDfS4o-a_C{Xj@AuBWO{3fhoPwIr-4$$@sH- zH)R&zrt56~Uz1isk-6(QZz^)g553YWtC?m$g2X?Wcat9Noc0K`iO@16Jbz;AFM?JA z&D_D5xOWXBHrw61HbMH3gkMYe#>d*72Yq}=49{jP2_N0NCUyEA;Y&BrpLll1Zlh1> z!@Ic02j|29g0J}{ze`G;zR$D0M~;`rQ_WofOYLzrn$1cR28B&|7oR~NAfD)WABLwC z+Gc1Ic$T)b5FHt2vucR_o}!E-%p$_5C-}`9zzu_0!S%DzK+huJ^6Lv|RlJ&o~;Fbqd=@pC(dmq^h`Yfoof 
zc(UDjUBVwY*p#C;$HULnv3IM{$6oPHZe0eHFg!cppG`R)M4j3HL{)JJC3+5>*QWex zNXO2=D)F;qsZO#7~U5eRKf<;Gz zkoFxer9bc+OXu@!e|nggKJHK8Vpfa@Q@hdoh^0@L5WbZ9n_?0Eal*&ODe7jaJw|N1 znBQj%%g>XO#*Zx#MqVBAwlPN4@oXj|JZ7sCs1lmSu+o9opHh&VwR%qLx z={Q<5wD+J%JCHvKXP?P`q%S^PHnha_4TP4MzC37>zIfh=&?J5Fw3*N(eetvf&?J3X zk`VqREw#`jeevN|L2H5*?E}*1*FoC~?OBB~SQ zeKqgHKqj^?kLHX%wH2_)-w0(reNvI;~SJIIOeK~Za z-=K>e+3P7hlXGyPJPPCllT|eG!7k108!!{H4 z0rZKH+t7Vj(e3@FGoCdvKbG{aCM>$e^Rct*-5amRu#8mtD{L`s z+fBc_KT_JncHXUfp|gFZe#O`ZExwKQfmRPKzAX-bwg#HaQ{+$NjesWd;=`3d6M6Bp zvLy1VJbBVj|1^!bGAWEbARh z=2qN8{tlC^bC_(2ei#Ao<}bYU`J3Iec>2NY752_XCEdFsd08O*JKCKiu<7jcqj@pw zugP?*>|O6>zwnsAaO7=5p8Zz4bEfF4d{CtUreFVKgtsQ&4?Z?u z#sAJif8Gzo|LhaFa#=`B51M>(;f&A!k}KG4N+Sbd(lZ&pXOWA?nl->5Be%Na_n#n{ zHGmy+sM8e3JtKwO_%j=F-+*T?A0O|plDrO_OF#OSe4+!&;0vcZ?!B#i7g`mx5zzMH zS^k7p1Fh=Ic4s$3VJrmepe=${C^Xi^qw^lA%R>6}I^LZr?ed@WSko?(&HugelHCh( zaHgx_U0}*?1@bojw8eP_+4l3u!)Y0B_*Oxe-{)LK8C*h~-?&a#(p^7bFa2)5WC@X- z{^kcEI!zadO7fdqPJYuJ=eN>!k0gEF?xpnlVTu2Rq}j!BD)QvFP`3s6txu5`h{{VS zznCvT{4vRkk2zkxr196-5dY-2X;b)O_GFIt&Z=7np%0{4VWy4^SU-(@2O8gEd>?1lWX{5g2|#D)IA}I1V3i=b6HN>u>2H+rSBV@*(w0ZO$r`qcAMYv~QP`q0wrY!mANV~o7szxV5x zY_-~wkc+e{BPsE~<$k>>RekE$kEf_7E&X^Zzb&OVrKlz?l=Z1z{5=cSd#6wHGfqn^ z{g$N`+g`lMh?3e+DZlQfF7fHBY_-J3h=8`|udG$D`t+Ta`p~a=zR@Z$Z}ymGU1q8C zH9x&b-g#W?y%08itiILMk1f^GBSOY;_GNx`tK61C`1sF0GoXFp)Bn(_&8Oec>TkCG zLaP_NaMXT6?GHY_To}+-2GymR8n)XqH9R!Lz z*h}4NSxb7U$1VN)Uh19{Qu$G;zBZ_Kq>-4XGd1sC%KFk3ahvZUAJkj?)(3;s6Mns8 zu=-5ya(Kqp*9=zo>tFq8u)0akDLfa_FAi24dT)g9#@*%R*M0OogVmLL>K6v7Tld`e z%R%bizWT|*>aA>j!C>`8wmiSFw_Z9}?bt{EWw2V(Z^)a3+*dz1NWHM1ervG0ZU2XS zPYhDd0f!?~`oe_+G|7155DD`9VGuqzY&;2gFGo(%Q1p|p@u4}d^IP{1RuA~~9fQ>S zmaZP6uGKaPzav$*4OUO3wFjQgQ49C*^W}lP+S6_t#N~j0@DbpPL;bwDZlJvRaG-u< zu)6dxji}28$qk~&4?Q7;@w`u8ZL8b7>I|)sMnr7y+3Kq(jV8>L#HTk~k^;k9dItEF zPc!?d_UkW#>P=hU-Ag&bvM@zI)l2;~ML*n2y_N#u%~btTFZKI0{g0qpE!A>ux_%?5 zR;BCbgX+D2zO0wJC=>F9S^DZ;>VYhMGm_{C+4p!VMZWbLdnW_kKU4MAfO;oQZwsg; z8EC0j1MT%JcuWEfy 
zs=75r2!BqI=Rc&*H(Y!C%-Y~nOC$lP9Uhy!wEW~UdAQN1w_B=39%&ZuTIv}|*4uvl zx}{#W+*AyVMtoB&Vq$nnBL0Uy^o~?@qlt1wDux9Kl~2YrKZEZ_B8+dL>H2C5CBnDW zW~hN?XJVsyq3w~nLY||8jh&-QJguQGgZCf*X`POt9|STsp=zJzmTeK(E6qnwLC>%lcJs%Z9tbZf8UEYHp*o#@@cLR zUS;WvUAtjlcl;6h7v$X1j%70B(F=~F``+ri&393%`pU1DrK&3}&GQYG{#S~+*_P+~ zY<+#Ix=tU)bEDQ*r}9IRr2Mj!y&zmH8jKR_H6B49G2;6Cy|W1(&E>6&bEtvBtZZjglBWa);zB-yQd zsqNBdnr+>@msn){K*&RqzDH6tcldvjqPOm?mWXjylcpQ@R_oLBhP~A*Qp&HVYi_~6 zJwvbATRoWp;p2d=+gtrJpqKBhUS(|GTYVw}$Eu*N-AmmS)Qk2~OM2;*d#M|HL4G8p z`4NPTA-z3Y-PBt*Wvd5!L*B8Q-kh!4cheiQ)t24$`fT;NSjAuU(W|o6EqmztY_(wz z2s`)GE3(yvef8SC)XRPK>b<0nbcj(<9_9}r$jSP~fO_56_wG!!(685J(sA{pnd&)P z|2<2!rRYzx)D5Zn<)FGdO(@T#N%-w)`l^6hmadlu)HCV(5&ohK89Z;zkl}A_hJ<`M zLq^YkX2?*uBw(ZkWaBLn@10A;Gp$E#^_->ex7AkKkgZ<#My~KNyggknWfbz+i&E4r zT3?l-)@sf3y(#+{3H$$!FznKQORGj(zpT|Kwit-5wuaK82b(}6(NBNKP_JR9r>o6= z|4ZqrP3x`c>W?YtxeaDq^ox`+zv7PWZj&i zp7rbZQ>F2vAU0Z6n^V;$E%cAIekN62o}!;gQTM0p3H?cF2IMh!H27P+HkD-Rt5Vfp z{kkDleQ4{YsbXV5xfuysJ)tGxo3$kTMNuyBj{U@Ie#NhgfO=+4RW&Au%|4+KQCtVX}Yr0g;?HL!jMLp&jX}I^<*2NiWaf<#j zU2RFx_hqO*rdqFLs12!ld%C(NO<$0q9!k>}1=NRWx;8`Iny%kXSFfdO!rUb`ib<9v zE&kK%2o`rP_UVUx>SO6tyiPw>cQswI4&Y_+UejeFVM6u!CHU(wvj&tcGlDc#W9|taGPa0ryo#?+X~e8vMAwiDu8JB}Im5k)LL$`_kPJD{4bYs9rMbJGI*1 zPFz!(WqW%BoQxEubgHOgS^CCW9DKGg~*FR${o^(51d_;Sga?tnr)ureozpC@;>)c%L6HC>8 z+kE=6H1%eRz9@}q(@m-B&Qzf=>!3{%+u-Xg+UkwAev!F`t)I-ozSa+AsmHXwD@(nZ zu0PCDU#9D~vecrCXI#TvV>5Tjp~wn+;Xi!~mYt`433J{4zCEw<=?84zyVl8QPFsIv ztBd4)bQp|gY0qh*RI#t?ZS{pE&o6iiHsf-M?i#Oo@BSmcoJtb-f%hkU>ME}^v4uYH zsk3#HPt^-a;y&27e~k|<rj`dl76-cR*Sa?`_4t&yHpeI-5BeI-5oXc_5Q(O1&L|LuWZ^?lV-LA|-Jdc4;! 
zH$n;Nn!f6_-6rt=2fOD>_&sj&@#V!mHD5a7*w?pvHgW!#%8>S^)D3Y-h}K~@brZVI z$G)$(eEMF$y3FsIl*Q)_z9R+a$dAy}8=2gt$gJQ+^q{40Oi`~``fn*R35W8wts7F* z1xAlwqW2@@qgua_qTbbdZHoF@ngw-zif>##PYmI=_7yYN>Fc9+F3I*?ouWVPs~%3% z5ACU*$kdNhL_z&kUvlqh9sZ`hBbX z`Zm8>CvDc{f~VgzKD|-PxbcEkPuu$|^)GBm8Jh^@Tz8^x44#NTrTK0bt`GeBo;3BE zwP$;(dM4#{gfKn)W0q=3(~oAUPtx_hLAvffZ)d66O#N_BZOzmhf@*u_;a-IOibUov z(zB~~j_qi3WjTGIGp;o0G3WdQR%1h}I z=0WLz-zV;)R!M8Zi2NI~X&OaPeW0KA^M(G4nOEz7mRgvqpUF}WrV8b9(`cI0^olHX zQ$R1yQcnhSO8}R>{ygBCnaDceq%+@6Q!BOR|8*()l{EE0n!X}kZAuf$QYm&q={aww zc77;6aYkcrwO;FGy(R9ed#f$!`XeZTY0y8&)bI3Gf6vmd^;Rue8p_u}p}fbY@LVrdKSYkN3(WgJoP`EO0$#fkx3$?}yy*3qh*1A;vUWRl{+L+kX{GTr2E(kn9 z$-#Zl>H6vvagyAYqL%n|U5a=*?&VoC(^y9$Q?RCZB&f*_y^Sx|P4}Hv!ox51lY20O z(mVQySK{qH>TXM~>f`&XrSI(H!)df|clEN?HM^@<({=mq{Qj@LxQ}X-gkP|m{-}@o zaJLMY{v|f|3w`t}eblvLZId$hk4Ufb{rXey%rHW9ch;tLM_?xhY*g06ksbPDy15<;@KJZg2HvhJKa*1KIH6;l81_+L|SD zKFkt1tyvWiR$Hl*(DCC?jr=|@9qYcGAjafa_p?J||NWcKt3UQb4Q zTQ@OZqrV&Lqt_@ImZ$577>G@=(Q*%hg>3EO)kFT%3A5PJj|~-{MEfxHg{_|(s-8sc z4_EJK{r)huBSqgnRMn>H2Zo9n`}$C|B28a9Tsr&b!_)=6tQ&@^I~in$s<%Tze|C54 zxnWYypA1pY^wnSIsO$H(77tZV?5%GfrnZWe^kP4K#W2;{PcI&(ZrE49K1^M+pT1$3 zy8cJ{{h?~xkM!$9R9jd-H$+`JK$7#=fT2Wt&B6MyVd~+7H761NbcjUt{2}`GA?mY3 z^bJGQzxaQcy5dm8-*c!YzQ#i}Df{$LO==bolyuY&)NJs6aiD%+h}r>VsJi|ziSVhz z^c6#JgX+aY)J20bi2U9`Mx#klxz@!ZRjxs|CRnricFDl-z{QluEjacqQQ2wMT%Ogb)!~yo1yk@ z&jS0(cj_8i>m&aCZs~7ag#Fd^mVW3!G21>oP$qw09Y|Nw7xtHV&a(chCPm-dUtN-R zf$yyY)nn=Ut^?I28T-6=fXr;4JV0&CIDqFr1oVmn)TaSmbAWm|Q-64XdJUz0fXrn7 zKlZK!KCY_jzZTN6N?6NI3do{@Gg({Q(q-C)ZlNhup2M)>{qry>=trw69zMdmud3qfM|fjZ^5umy zkHjx;ohdnfW~Th|2Qw>xJZ6x`4I=HF)OMa|f*;M4Civz|X@cKXNfUfCV1d5X@YwXlP36Jl{5kHckfYw zmpt9PN5u~g^X{swc%aJrBbwmOD(`#yR@_wOU9jKOE33SVXY4S1n0L*L;i-7B)RhNS z+<2IG`#}{Ty60e&`Ns!WAmir`ekE3pL!_YJMf1$`{(MNqvsK>Uq4MROhaQPv{^U@} z@mGh+FE2i<0?6fu33BCOg7})?wbxCp_~dl&-W@8gnC|^#hgsK5_kOcu#mA<5mrSep zpXuHc(<*@cW11k(OrLA!|7$Oq`em$34`REzv-jg^72nz!){i58xU=`G=@s|y>|L~T z#XUQFx9%)|U$_fS_Ecz`b`mDilw=uuLr=KxCKA_#FU<2=-#SQ*&Oh;Tc-TH!u$GEIrsVZ4)2!__f{N%f2Dl8 
z!dD*Y8{a+U+fyFc-TUQ^6*odvoK}HX!M}If3Gdw9`{MM9;oU_~!wcU3X_j1wYaqYf z<;(Je@9%aT{uNI1(}q*^jZ+4ueq(nrBz<9b@2cth;qM2gSKP6??Bj0O-Men*8>ayJ z$Zi$*_piGhjDPX#oke5+3We}PQ$9WA`W=2+gC+6{Q-^B2TXw5>s>b`nZWXuHc#rHc z4aog_Roo2;wRgp6jd#Vq?LdCIf5kwp_qhYleou|}9A*`S-k}v=t?_<**ekJjuUZF^ ze^t!@&xdCw@b{yKUp5sVet+bF`0)8x9*MvI`O14GvkxD;5&ypb)fM>ne_maIFP?sN z1(=STzb|;rXN2f0$G>|DuusP1n}5V=@bCBH@{6CukHg>BRUeJNAE;i4UrXf{p>j7# zd|6j7H`uUx`?@q1w{I1@O*g3$OQ2`xv$`_|qd}Y@u7w-g1&6JyV-UWYucIRC{ z`Hh|B+b4Dw{N`OMZihibOl%MDTJgxPb~Q#mE!RN3+}pbsV|#Cyi;n!j-op1&d&^S% z#l5{Jcd2;S-rnza6XcfN@%zueqT;uEd5`Z=alzi+GkYBQzkA6R>lb@_ckNa2!@a#9 z?|C;*mIc3_wlm!n>0n;hxd8guXp6K*@yjD&K`YLdy{?>}; zM8%kL-s|PNpe<&`ZBq}z^`A$exlhB|-T>iVQ2_vCyB%9`j`yaDbskq3`_os!aDUBC zVxhfaC-3$h-tyz=vSR;oIuxfg{(fk>{C&wz?@@)TGE;H&RI#FuPl3r;cYs@i9e4~{ z`VAGZ2!4A?#q(3-eE-i=ylbY$P1TM!#^Wc*6n%TdS;V#)LCbnVYfor>3}e`|-SFt}pW!ASeqZWY%Y;Qej) ziYpHAu71Uw3l8u;y$9?F-s6=O*X{59V$X`7Bm2E7zPX?G!M%_C(SF`X_YqsegZoz8 zx1aak{e_E7Q}%>81bfLTw@tww0XsC)TD#%o;>S}xi1Y8L zh&vFr4b*ynilq-88{mJ%#Kq&I~8NjdveE$-|px=Ct2IZF*GA$^q=yPsm}tvf5(akcJv+z z(BL5FS}FS{VXxg$thS~sJ9?_5A1EF49M+=>?{4|(3fQhHyid>!WsY{VEc=LEb$!KW zc9>#}L^yG}ZwK*tJ5M?ukJy0@si@dzhdnCJ+TpZ{o2N(v;1un1k~cQxrqq^LfAaFb zFSo$UE%0&+yxamWx4_FS@Nx^h+yehaEl`zGEw%MjO$SfcbodNS$Cy^0sh^vfjxhD! 
zOq}T;)1kBY`&LbDlMScgDDTXBSqXZ#LLD{s+sOfC!JspRjyJN5Gr4sGQb`?RK2Ok3~b z_n**o;Io>>$ak2(kCAWlr}X>5+cj-m(g+e1m(zl*aB;VH+M>kd^-N@^U<5%^@;^0%Ld&7>P9 zT~!2~tEe@nW5Qs?*3Yq>7gx-&v+8Y7*p_wecAboTpF)8QKBH^Q{g?-#K=8PX3# zlsiMZR?=l6=&mJQGwE6*=tfA_NV?_-x~E7NBVA(zU1h7be--Is5p=UiS4q062)b6% zd8Dh1pu3E8Dz<*{$BgLS%R?T zd76Kj|NG2gs^7LQ4eGg7@!;oEnQ!H|@*Dh1e!e3G@*O=w`41(7`FPdN=R9=_`>N9J z58y5;&*(K<$Is1U`10`n8oDAJ-^~5O5DfP6j5EbGy8!D!;=}fPemn54aK5AD+x)2V zX=NHxN#?<9k8G*ZW?&nti zuG#s0BR^ODO7nGp7xlU7n(%tW&enR2{hIaoji!xEEx!!EA7?tibdc!~(-EelOzrQB zO^4naUd}?lf0XqYd``(M;$ z|1eMNPnudh_F4UG%PH?emS<;8ObiP76H)L@`A0vkU;Kxk?dTup=T8so_kZW-y==!S z{PMpqx4?fz3pB2`Gd5VA80AMJ(`KfvOfyUem<}=>Vmi!pgy|^LF{a~8y*+heRWhw& z8e`hXw3%ru(+txArh`m}m<}@?VLHlmjOjR2j}yC+X%*8L(?+JvOk0^|m<})9b-Dq)Z2&oGp%A8 zW7^2HnQ1H24ATLogG`5*4l^BLI?8m6={QqwU*^xWifN2#BhzN4txPja2bc~r9b!7n zbcE?B(=n#wOuhY>Khr9vF{X`7o0+yU%`hEcI>>a0=`hm~rlU;9n2s~`W-x!IRZL?{ z8<{pUZDpEaI>2<0=@8RlrXx&8nT|0XXX@?G{FzoUjWKOx+RU_-X@=W;()jl<64Lai-pZ%%5o$(-_l6 zrp-)SnP!*{Fdbw%#B`YH2-8ufV@$`HdIvFord3R1OdFXtGi_y>VLHHckm(T9VWuNY zN12W>9cStt%>0>FF^w^8WZKNMm1&0Q0MkLHLrjO6jxZf%I>vOIsdotTXIjNH#2B({ZNWq0FCY71J2gMyAb7TbX8<4lo^LI>dCC=?K$N zrejRUnRRWX02RZL?{8<{pU zZDpEaI>2<0=@8RlrXx&8nT|0XXX?#l{!FWw#+WuTZD!iaG{ba&=^)b~ro&7}n2s_X zV>-^%JDmA5tzsHu+Q_t-X)9B!Ej4Zl*Prb7xLj@iY-YN(pPLcENuJoPnr0iM4N zGj%@?U99g%tH$B(Z)K6YHZ-+YXpUtN8CtoHcoWBG+;5|SQ;jo^;npa0{2CHYPDUg5=3n)7B7 z+)jL$_$kC|KqmB+tCfBZ@plt%C2sef@FzF)8RGJ86L}>VMm*E0^ef2!P;88ae^s3V^GTl~ zJ}NH`!85>(ySNJoeLLwdB)ynV1aHnL{awTtz__@nv78%;5B4j)#O0LdyTnI{-%mbO zXm8=u=<*>xv|0JcbxwJ{O8jinKd7)b0|QC;SU;IplYay7DTs(T_EwcYcHEv%{L%9i z*IYbYAXWNS;+FnCi#vRFf#JsRKR3vK8Sz#Jf0w0yOOXEhy~F3-PeCvB8miO^?ZNuq z1Ot)a!^Gzh|0i)zUdn+7#Wv4E*qMaBlDMTGB;M-aKO;U&+>Yle7|Mi?x0mv_d@ds1 zOx))CPvWh_*RnkC1PkFaO1z(V11czZ^WMt;`^4=$jqRiOUujUY^R%`>d(h6)%gLvc zc$$0=uFmj}9jJV6(4Tn?81Rzs_eg&X>2D=Ic97D`I|}4EdQYYQJL&CwxtVzD!Ajq$ ztL0wDcP}o<~W~YG=p!0i-{Hd`{U*^Bq4_`R}TUcRleIhyEGj-b^K2hyIYK z5#vqj`!3Qy$#%Ggxb-u$>i1`W%QznrL4n843pp;le?1JMLOFbl4TE$TzT2USTfaGL#~mTw%6u)K z4TmYciyv5}c!u 
z1ebi9iI0%p%AZR5DSn^$mq>5@7j3;RI#&6_zN!5G%<{~774dH=F4wu_`4jQR?Ejh|L_}D5Fg(ojzSk<= z>fm3C$+)fbhI%wZkeSEI+X2#F%JNuwdlU1u^7c32BklP(@hb9>*KwJ4{-u_ui}*i? z+jDG-cT_9=zsbicqQ4>@ds_LdWIOCzqxAN?{;h1!e&8}6$2cFY{P{ryeJvVL%4yHv zEuX7^3!hWh1m&K*9yN^qL|pL}+@9Zmj^+6{aqCZD@##>I44;2!y{ubng81OS6;H6; zt|UH2{Cup_@|@nF<=p=aEvN05DfmP9|NC{?VAh=RF!AACwSZ$haJmlXL8je!u4VZw z#sVaK4t#_1na1`%>~)G)y-@)>FItIP{|9;hw>*CZF5|6{ejsxc_GZ0a`B=XNOaCBn zp+~GGLuvW%f^#$}PmBj_@(y5m_6C9AXR$ncvHfShDV+YeM#Zf^#>0k-=iNa(!wm$c zp?T_2VBv55C@lSFi2sj6KYOmykJ0Z!-mNRo?}$G@dU-#tJVW!8zLNeTwtZGYa7n(_ zk3`-fE6;8i7=mAzm|{NvTNCd=;@1BIku}V7{sN^RSQwo2oE>4yytdBXlWllUHsl-~NeSo-w_N8#w_a~kQtLEQTP{E_(GO*$T| zAJC79zxPDNtv}Gy#4lP*oPI%%5P$O$Esyngx8<3>ROzkX(8o#tDTB|*`ng+rc?Y?S zPwP)~Bk32c3fE`;L;5}F&tlti#&T`{^P3dek9_VX{vHQ^W3$rVLY&>}T~7QS2Y-%u zrdKB-x1-+aE0oVSNe{KkJa%0eqd$&a7w&6O`k#~jaMJ&W_>>d1{GT9x;!2j2`1gq4 zLHyjsO8;6m#8beHp2_jXY3;S1toR7=S>*F0;4*G+S)vTMUGW;w{({^45$1pZdwQce$19p$m*yovZu#3}Z??*NzbT(?}yc{J;L@hRcsF|FQ=q2AO z;&vR)1upy_XjaBUrc|ESPu$)MSwr0F?X$>-#qn;AD9;DT$KEHwG&hgsbArR?C`{xq zJJRRLXAdfFXhHLIoT}x2HE}EFXMm5?%idd=!h+g-U+3ieCenA1Pd)h$oe^H1PS8tx zj$fhuXxsBWvwrqsN`$!DOHnxDi#Q7p8>XyE$3|DVSHW% z+_cY`S`esV<~h;uIoO-+jEA>G@L5kj&2LsdwmjEF(0_#VnN~Ic^Bp2S7Tx6_-2&%U1ZPqKJa`HwdAdwI2o2KmeH!rHkG z^wQ4O?`?Nx&}aA@)k0S>%-e6Tqb% z?7beHm7dqsM!Z`2Sp3(-hrXf}{UyKnN4wIGT&CrDjQPGgE%`!UqW|3y#8&{9e64?` z<$o!0>sN2_6V@ob^?SGYr-_fZXgPT-;>}n~dit|NqczVpz(udK{^wR69qJR6A*|3vw}jrbpdn{}4`V&&5Ovz3qa zFSY#71}^k>U#R2U%9F2<-rg6yh5Vm0_#U41`y=fAoAfdIz3##v2x@>{=u+s@s@?R^eA9%g2gkN37<`TL1me_`8h zcM*5T=W*vKpJCR^_TyWLS6!_2ZPUcNs7L#^)#=|K0hjqQFsSYJUbg4!dzF6Jp?@E6 zp>I7`%l~nr_ui=eW$zu@ez_C4kDPYivfXZ`CB)oi_gkYO z@)??`Ihyx>;Wz&zKDI0Zb{rjaYNTABo$0&(^K&$@7$ty$5RfeENLF-S#{Rf>Fv7bL7Ami8sDY z`-R7T-d6@lPycnx{|~@LPCh{Uyp@xQ3zWXmk$dL@mv*-Iwd{D?=R&0)VZY2|`>Z8C z{ylAnorzyz@c|vr*oK=YbCL2HJxAxuEaE#}tavN56nT0$PMU}hIO8FHiPGEq5cA0A zY~WIUdymhS{~e?sifg+yk^Zc=E1&VRHQ$}|YwvmBB3JEwZL2@*d#U2~-m~2=JPBO- z%ibHX_S?fQ6Z(Vk${iH|w*7m63%$KxVaNY1hJJ=;@5kHne~a|?KD%9ir(Ul77jb`K 
z`OgO~{2MRU_P70Yt)Yiqj`ut3*$xjAxA$XDB7XE0te2y|olD%_Q?T-X1h~|9> zAnR4B0L`!7>#tJW#yPP4zKM9NBbS~bKDJ!>*cSk=yqe|b_+P;CA9ank+W_y|TuZze zxQw67f?&NqMI3MEF_hNM{2ck%dpmaA9{wKX)6Dh7j>EHnOMgA;$n(2MZ{sq2ljZp< z>GyBfadiyw{`V?>S8w0VCvLJPRY@;CHoH)kE$>qAN(;{R-SH-=Z}0!=y9IqtRpAfsqJ9nfnd2b z&nR%=Gd`jjevSAi?@~VYey5%9j{rCHj=gBbrxnle-sW!Pa~W{yM;jmKY2ur(&FghWM*1=KubtP^zo>i$KcofuCF^wx@e#)!awG9dyjxD5HKbqqCFNt|WLbHBIq_k} zD}kD1o*#W#>216$OTW*E;+4+6U@h^{CT$oy-tGr36>ur% zI%i)0n)KFBdnU{I>aPkt_9>1&a|!V(NB;ku_&P`K9S4CR^=f6DgkO<=6L8@_?${qb zU~y-@JWW1h2Wh?Rda=(pl>cC}R@C;(8jCZ|ft4p80WR%e<7inq@Ojd^_TMrLUs>p@Mv3#9cdi;+sl8&VwdI@;1*I4=Fx& zlh*fDI&HiUd`s(P<7(M@t^2mpkF;w!?fUpu;L;A89ev~dq>u6bz16$!BVOg~E8p-P z;d8L}oYRl*050t_;>h`fzpJ<_pI=A3YEiHshky(JQD;2-o%A*?g>BCxMwS0C^>gdL z)B)Vg)3bHH*!CGDedCH?zP~o~&^Int`kmNcv%aVNs~*$#Tub~q;KJX=U9&&1ZNhxC<>J>f}14||9+{*U{K=G*GvR{}S3;|tm_x3hhoG4xnRT7%>B%Ac~F z_bK6Yt+w}`pD8}Pr?!}thu?l&@`WDGcxUIazn1-6@PoXY+0KN$(+!UK<;eNlflE0n zpU@27#BwekBhHSscA@Kln|bQ!=YJ!;jT>X-$;@A9c{V%x^BaLnzBYc8o!2A8M{d-D zWY@tbl-|an#PBxH$;1Z+g5^Ksms(D|1jNwUc|8nV^0o0RZ8^VBe0Ylv5ZhnJKB@H0 zrzn5B?p#9LJx{rfxGVo3vGk4}SoJIA@1B!31DAGgbjI@sNI(1wEhpWQy!$PF=Z>bn zR?m6H@WFo28E+eYt^BK;_W26&n3M0`ztM7L=4(6rOS{Kg4_wBfjYs$~?G|sJr&zB` zm2f@jW5jJdGu!^R5g(YT<-bF-@ecm2@)`c9;(yp#GuTW#cBJAbQO?Z%oz`o_sn_Me zO}{(#^Bu9#5WJc8%q=Yc8-B0#vT*{hCH|Pf5B8d!b*kzQLO;VBcjVXy4Gup6XTCg5 ze1PSwB>#DT)bfux>&5lJrJOcCE7S(_JVLy=OB*c1b{_ka(vLgq%j$8(8=ZFeJ8>JI zHbee@`m@$+|Fd)+9Yg%eXIM_&8`_EZqXx%#pj_HWe8peb&a^9@O1ulWjPq97?Pn4H zq``6S@U9Ajyps4Mq#ttR*sK1k{4?|$--YxS5+Aul=L?o0^L&E%nA1KB{+I3ZMjcm! zT5azR;4;qf0%r63e)2iyS*0IX9Fz~20XK5e8He8^eHF_&hkOqG8_VykYtx=1exH{A zD9(#M;x=xTt?y01CEo#OeEz}Um7a|wdN%Vt$2jx! 
zRsYd^y%V&fo@aku4P5daap-?XdK+hXPnPEmpf>I7oC}N)A9wWm6Q?NsQ`BcRlTR=4 z&5m69BXHB6j^3V`s(fr*Pust@0~h*1>bG`0{G9mMRa#MN_fGGi^y3$5zuWd7BR=qT z#XBgscdgKTA8_h*32@1Gi1vhiS+CC#Z*}BYYMRow($C+{uOZ@96>Tej`o zwF~*s@91mfzh+nB)E{hneiXQjtIM2m^*7SH`+|pd(|i|kf3%ML{|elU2lnqm;tjhi z?wzR(u!8udN4aq3sLUOyzgyDwgY_&UPBxkJap`&i!_flEJDviz3M{~8?jCED$0lfHT% z<>PHoKK8u+E{i+s$ZPji{>_wocD;U!!4LATb@cNafJ-@T{P$pLg3|=x2D(rGom& zalobgL(V#T266WsVGD8hJXhk>NWY9RU)wL!5hq6Y+qmp!s5tYEAYOTkwuAMDJoP}O zckL=aAnwkiBn&u4KCBMbtNLK&W8*H^_B_|%&;l^t{m%uYxA7_Mxc#`roqg`YL)dPN zYiRZJZsMcdki9~y=N)>e(vRPv6D!W_F9dG-m+>H9$wJ*q`q(SABOnnS^n!@sqM4QX&>(>s`E6%@3EX$5_kQ$eh6IJf1G;09sh?M zt^5b%f+U_{^0}J0YyZEO_~5IxJU3|Kz3mv~f2t$5A2B%efBNUV)8qoNbFAj;>J?8A zw{cLdK781#lzy1|4!iEp2QKsL0Y@+TvZdce`-N`YUhAusPvhl^U#W>#V-Cvq^eW$P zXO!*T*ZL!7nTR3HIZP*DCJX^Sg-K_c>TTZ;UCuYkyt=T;za#|G_WW z4znRBq#gEm>h(V0QZE~K#mF6~_DwDaP6cwk_!k`r8Sak`f?-z1RpVJfec*S%g?{Kj9f!!yJbTYk zyy`c~r+qiY+lkxv|5*8OjivvhGI*AJo+du{b}c`Tqr8XbvYgZtY=1pZ+_ksNoTue) zrT+hb7RQ@sa7a7G)o&zzDd{~&ul_P{sjn-4-aTLOaq87}zwi+8v9z{N2j|xw38f$6 ze(Ff(dm8a7%EPt9?_$>x9?f7cKMb-GPgzczf};{v-~3Cf19A#X6#D`DI50{Y8czaSu2zW|97b zzz@eb$vmq4K1}?6;#F7c0Jr*pw}a{v-17*62L_73=%g2>RK}!uekufqxCS)YltU{!6ub-k&1qcU!LX z!`ExQpw5{mPW;y^6@c1ip0j|5)pu-C7*)_05V`-LBZITDcf5;gpKOsIipbcZ!U+<*w^2{baM#GiWds~QC-mK;S zsCJuoF7UAOe;@+)R)o`^0o;s#9=uz7_$Ppe)%T~QAE3WeI~(I)#K&&Xa=w=1>WCJR z1K8KokHyN#dBDTUpN_yEi@>L1{7Lz%-l^^UWzF7O4m_;<8-a)A`@snO8S)=uzcjMG z8&1}ID<#k$o+0xu;;<1P#;MYNZ4#~U3@jOWWaz9+^JLcSj z`yb+!@6dLr;C%eH;RF9qN1xg4l<@XB1o&RyUwMe;+sk~<05179^B(91;$O1(ceTSm z&wNLT54~LhyYGA6@aKO$GBeaP5ws$597Z$0`CVN*3Rz( zZrb@e&G4<{|C0zl)6NRd_h{f@{1->iuaCgbkHD{rz&{;24{QG~N8pb|;E7g_D>{(w$#o(Whkdg^!^-p92z&i3Qkz$M?IOSPgOV|hLt!RPJ>{14#w_4tQrP|IN;R|8U@DTycMG_eYnK z{sD*nHsE3Hv)lS`e02o=&Io+3&Tu~I2>i+j{1d>XUIUACoZrp%8IPc!u^~L)<0J4E z;L=}>&OE&bxa3>;cg@%8Y3Fvaot<&=e&Av4`K<_ipKhVYx!A!ff9!eXn}AEczd0LtSUIl%F6}VnjFX!p=pQHlF~+;lT6o?+BIpl1 zN6XXrW*z5n*ECN(aIs^I)1Tb>Ei4BfR?hwi{I&@EKM{CsPdNXRfSZ12|6a;+9?~05 
zzXiCIzl!^ab)^3laN(1=Nad|PAN+R&pSgY7{_C7^_(|Yl?eJ&>{x{$z-;FB35-iW& z{o#BXfJ?tO^FFVY!)Hd&e;By%AL2pW&Xki!ZPa!c;(7Ej%y$KFL(f1FA12opU&=`Zeo?L1u^LH`!gkG)$9V%u{tg8mjm&qDZ; z2O{YI6oDWAmhkfTMc|(UF7smO4JsMhnD?RQYJFX~bO!LSe!nUL{~GWxz2!ILKje(J zDd&arKOVT)0je(6a#}mUnh5&aBJiEg*Ltp2^eG~91U%T5sPXjLFaQOSWzMRP~ z&Ly8gt~*x#JY?wYuP4&SNgumQ$Agu_eHyo&t)3_PrV_qjmj&oJ%MR{ks@evGpotphIQaqS6r z8hZHoGhWse%(wAE(lg%9LB!9r_{BP}aZSKHL%_qz`6b|CnZDH_5b~_ z9!U8I&>r&G{%QgqmhX>AKX$r~lLeGB7hSCLE5rI)ef~qh!}vT1T=E@q?2t1q38!y} zz)uEl>gDLA>m%qd0WSHvdenlqE8fWaV#k`+@VxVYhn4@=5%@oWhxJPh70un*f|G=k6HBJhK+2rvKf z5qJ}DDgUrDPu~Sx#$hA%7CXQ01YX6y@F(9PAMXQN@L?Vt{tkFpzx;#rL-%WatsH(7 z_FpNFtDnc-skl2%P696DWQ^yGR=#ZjF7n@_{BL4m-%UPM|ImSB_oqYe3NPo6BJjTh zH{;*gH#J_VU9^C=h=}eE4p~EuYzg;q$!iYX{OJgMpR2T-L)@3q z+~PG7Z@p8?S;hLk2Y6U{?geh-2Jd|w%65L7eB5)WX|U@_`;2}q*bdde!}u?az`ths za9q)><~@2%IG>jHgyTIC_*KBAoK>II26&R~a2xT;hqe5#ApS6LX@>#c6S4N&KahT$ z=d%fpw`uPUFV7s{!l#jTb!*@22QKAt&&xg(!RK4Rt3t;APsnG0@rdkxeE0W-muJ5S zyb-v_s|OrC=TzWQPIupNSp=UuNI(9#$_H!b`9%c%LGKSQ=bL~_yTvG%>{vP(xRmGW z71~k9vcKLH!RO%!{P*PJ-5VUY3$G0?&uIpS9gOnu6!yiY2>M%qOF4&mK6oh0`5^JZ z-BpnAYww4U`$9k9=mY;G?)pEx@`K^!yb!qX8KeEv`g`9TL4P0VtCHGZR13W4fXjGj zrk|BvzYhP9@^7ZTaW328b--;O`G@I3(pTQC4e$`#p*4d365x{Wh;#4z!x8jfkHDXZ zz+d-2Y-gU&TK)C{;9=!|7`R!-X*afZgWnmv67k)Y!%S9sj}I&VR@#B>_>Th*eCzTZc}%X3r&el~D39-RHoB@y)B0WRZk)RE`E zfPo^c9i|K^y}M65koW-ieRdp<0T1J|<8@L_+>fKYwfl~Pfrshgvw%xED;;~Zd`4?L`#=aAm@Tlf<3N@w5kzr;tlk8b6kW;d^JS9(WCKsqe^pbR4ExuiGN{JVg32>LGUg{|$Ip`_KHC@^|+; zrvVS6zZrN~dp;O}x7;N3=wIF=_%X}h3*7W0<0xAFdHKf`cl{jh1RhqNuaMsLbND^* zuyP)7v+{TCW3MIdt{2OI8~wrYL+B#C>+kVV;9>dROFrY|W9P*W$Y+4(GFE>$<`dz3 zRsffFYn?;An&W>9a2Y@2&c6F&5qw4>@O^IKICSnMy#~1S%Pz z_)jD7|Bb+B-m3Wy(jVp?&bx)c&G`Ag&I`LQ_^zS%6@-5td7IL&J4qYtShm|a#49%| zpJzz_8{lF6xZCaF_z4mCnGtwT1pZs#VdMOePlo6Fh6wz8;4*Fp_#PH(H@=zp*j2%G z^V`6sJ%=ght^ApFhvLm&)b_cQ?RG11*Z=b=;F9mydzAm>Y|s8-9arnPjt>%lCveF( z!}WS+;22R|KEOBJf@ARQ|4fJ_5M3 zgU5C7wXD}F(hpNUYb$x)Wx&Jgb*tsi^P1%>|AP_qzly*Q$GRo$;OeD|h!3#+_hUKN 
z6R&dQ^TJPumoo)i_{ZL>?RXl@O*dutmf<5lbeByJlx;WV>^I{ zwe!1xhmD6XN6`N@0^k30TAtxsv>m!AuWlvo(Qan@{fEHA%JWPF{yOOKB8Qvle`xo` zDd1+_F>ZjJzvls$ere@>WNWXx$I#=x58KD;bqhbQ^aBrS`482^d%MM*eeT`B!^;0C z>0@tJJ?$xuhk5s~eH{DYdBDT?+)jFTz577~{qH02sxO4|pC5s50dCsu{n~-H{J%2v z(5tx)+A{Bdua38Iwr3UR(M`aG&tOX1VT|pt*B2Fc{ZdW^F8$a_zqLciXA5wf7a^0} z7J)wmJZwDw3AmZ3&UxiAUkc~H9JtWCcFr}od7=DY^W|_p?Gg9@aN$4B^G&O-ewes> z&h-c2(mo@!kJ)j)^N5_&;r_nkXL}-WQ+~&v%1fTUtUA@#(49$l^`zIvtCI_tET21P zNiw-C8LzM1)RW3&(ml!Ewp3@jJ=xyfm(28ZxA(X8CD-)#0!j7trMvn%y1SCy8`C{K z9qsAV>dtg>b*gRsrc_URV#VTREy?8{@HoQe3YD?4VB^zoJfv-9{ z+gFpWZ&r6kxTiXs+7o%pT32t&T54g#%D7^2i!bYz>SUsQ_3DHIZ}*IC}F)xF8?{=Oi;6)0ui7OKwv zo_*Kbo=LW?>+Wbv2RUimhc$t=!0No&)Mh&(+XPyDUrQuYYu0phrIWpV{i}m{OD)&YY5tPsheoU;5Lk$H$7L%QE#%?drLDiWuX{r>#gUO* z)0IqhcBFbcy4EIqaxL$0Ety*p1)OaqK}Z36(tZ6sU6>f@&NamsvK)?u3mTv07IbxQ z(Pb+1<81aTf-|(XV?(C1sd;f*^8#7gvWrh$GLcBEXo)3PCYP^_B}HU3Bv+)@cJxAG zEbm#B?&(;wCE=T8+k$Vl(EeFjR}-9-6H>`eh%nhyUwD$#))ivx7k1U#fnjL=3WZsY`vVKoRUu!)J$-#%Ngax!s%7?TgbrUFpvF>P=r9_`qEuIVXFAc`(pSGR-3G0pN!AEm%@ZxDrk2?v zf$(f>7U7fqot7-o(qLp#?fj13OscPK-Qx6?mULgDWnO-@E?#ZRwy`6Li{LsM5DO5yZYa1E@^Dp|ehzGQEAe@|OFsR~vSQ*c73hniSO@60X& z$x zrdyqDc`3oVR3@$3L6A+}K4_U$pp;ovMC*vQ_iw23^Y-mvr03(+OVb;=d$ug-?%9yS z0La@s)p0wj(i<{;TcBpk-mX32Lp_R|^F>RbA%tFhGRem9v(ZRzYz zwPP#Yn{F}{%ud}|$)z&lARIOxpGo!f>Cb%{MQ|=p4E2=km$V~G7)P^`fz5D3GJjW1 zes*C>L$Hdq*?Hmwt>Wn&Sj7s6CzUvctid1~lc zZ=w!1%g!xLP@GN_a}rb?lyr8w%UC8m)-;_uy907CwIQ9{*t9YUvP5G3%C^qlo|Z+) z2COJzxLh*7E76;2>uhLGw{>oW`4KxtFiIq9`@~Q;KeN20DVaFA6M`H@&E{kxnTW?A zmN%rjAaY>pLypNgU0agvslF75S|CH?klXro5}GSiMyQ^cgDaczwS~0cB(z06+QKZt zs>4Z5@-|BSM5D|rrBUXyQL-v9lwPa)wk$(4iO`7{H-X)HJehxp#3p5k%$6a7tO~S- zDDE60iBei)0b2t(VEE)6AkF{J1_+uomTxd@dm&Qm3XwV1wD{5pDAS zL3R|;4BLBX_yq^n793%9_jmQl;bgiq4NGoQv#VJ&%*$k!b+=;|Rd6xQQ{P1M0R?eUNrB z$5J&NU41?My|8j5GkxZ)C?2B=3l6a5m^?oF5>MXgOv@RD*Tk#~C%CKJD!gABUF?B1E|psNiI+XH*|iOqkF+5C&qTXxv7 zWL*-bk6sLT@zQ}SS-L&f_)%S-g56|es;9#kYLdmI6TU)xfH(S6Yd}$_X2Kz&)jg)FhEJuz39-+3cKs#)$4I_^n 
zw>GZrx(O*bu1jxNnDLiFgU2~Y51a~{nsH3B5nb9Kof)GoF4&DZhbHkCxo#aVC?`=@ zOpN}nwf(R>Vkgsy8IR+mj;`KxPhY{gSH^HoOjuW>sbQJLp-ym$)+XVFmh7~%$aD+l z*u)Hg5w9khOtqg4Q*3fwI+fX;1?@zEeewh+j3Q5RO53t9xiX$ySsg5PJsd7f(~8?F z{4!stCJrri%K{N)fo3-4g+e1m#Swb<_HF4x&8!7RGf$NHzssJzY_T!(Pb_MVMZ2vt z0n0Z2!|{#(Jon{Rh-FXQA}szBVkod~{+EcY3S|8(QEP)N2nsorAQxhElHzY@2%x*;%4~tl1OM66}`wH+S)d9@s|VN0Qa5 zdv$MIeujBe)5zR%t*^DSO6`H5|C5}()+S|-HE~ZwPJFYMHKLj}Uel7=m~LOiO@>kL z{VUj3n2_^#1x>;$S8cN#%wR9SV+$0YmcjD*=NUMY(KdP5Z)L;e;XHqvwhGN}#lYzS5-5>W5Oj3Ijv+<@4OJp+)rxGH0}{i`Q5KI}p4uhmt}=k+Sb>+x#1 z?RBQ$Ln@B5O(*J^yBxj4lVYV$b#$kN&nqNXd<1>Ks`K=a-7asx!z!40R<1&my~sDG zVvTY})=Rd@qlA4a70N$X{)>iD{)(1y!nKtnDSvskf5T@7Q;67!$;HF<=01UOuME9< zm}RionO~Eq*42pVwnHxFrh3-)s;6iBmaf!>?yL?Oub)R6Iklc6WFnbb3Lk~%?GWkg zt*BQXNgdy&m5R3I*($83^gw5Ys@lVvL4G4>;;hWKEHOw#RKf~Y}JLNW44hV9V`Tb+GW6V$|`#AOl} z>T#oen@UYcLx@cyA9Hpfa!?pDbY9#A-j>b0RZ^&omM&p-JoY)|L~{ozO6>7VZSw{&(kGaUG##yx5|)_zumY0_ES$lvVtw-7R}0*|H2Ae=kBen zT%NJ=U)Yr|!rEs0v1wbFK=V)5@@!WtTV66H-6n0CXAr%Z8g$Z&X*LT>;??VY_mItI zA;j)C`^6A@Agk_$6tHHy1#8GxirMAg7bIjBT023>VSNsal)#T)v@IK^qN$EXP*tGFB zYT@ia4iN;ib3^So@p1C}y0*kXLU~xlW0MtfByT49LipGpE?i|IojMXGUft$IbTYpA z+1N~BF`Zzt_Z@E=&)Mn#o8)RtzxAC6EC;L83zKW;0mWogmCYI4{`!McAR3m&N^47- zP&OuF1wk9)p_gGSq)(*1C&E0mn~*?yr!%oJQD-i9npP~1E-yR(Q1G*>uE#YN3TT#e zVY*9$dL>ec+BKc2wJ5`q_$=JUFj9Q2{km7|ipBE^Z9O+jZJb~WlV(wO_j=oSfo$Wm z;Vh383rj%)&Ox%#^>FDl(LZ#C@-Ba)<)zwmqKr8ev??UJ>a{)L{!DgG-J5mSe z^vS+X+#-@Y7Ve~V8ZBPe*5A_urB9+|W&0#gHqqq^@1q9D8|{q+h+)^<-QAh!KwwD| z1vq@hz?IUJ&3!!*P%jvy%xLJfaDDE?su_=UbYb~Q z%3Vx5)J;^}nz%purG!uH@A8GhG!ZKiSVwcNiJyY#?{>*Xt-4nr&;qt927>f%LV#Ta zg4Fos&RQd3p73Su+Wx-YSyB$shO&Ra0Z=?0@c7Cd8WhunshbFuNp(!_M!In_qr+}4 z=kcFR;CL;XldU)pK_sOiv$5agj$*P_zOAlp!QKtysk>`=PYYuC;RdEQy#}!|a!1YZ zY^`|xN(mFrDB(>QKsNe=$#xd9^>v`ldLqyqa?Zy0L%S4H5Zr*A5P;jJe_Q(M`uZ`i znojg95-s&f{m)luxkqMUaXpa_bL2lcpYwTVBe0ZsXU~;T`U#8zsc7UNF${!FA}K#Y zIHm11W>#^`_|6ukAN`8I__e~Z9y-j%bYeqbo>iNWr*sihzah*QJJ&GgN}6OrEzaD0 z^G$wVABg|RL{sAnH*O_k%tkjjIuWgnAYSsI)e|aGLZ_zicPF8Y0>c2!p?YztVhk39 
zf?})$0fhiHeubrWq_DzSVH+$1WH#Y z(;+L4q$m>tLLyN@Xd)zO-V)ZyH0%T=C9IV~jxENl`SEJ3TRox)EW=8efH$_N?-8pqvp<4lfA!%fKz=-F8!aq8ut zY6+zm!qnQCpKeJxr9%^?8<*~4&~ls?Yt<+Hf#&xKEWx~qLD?1fNs_!@^S zRXwSUs?-E6MRaQ@mJ*->ib$e)V=5p#N*O)w;0}x!e3)2L3QC6h{NoBEAFk{YD>2&~ zrZGw#@c!|RKEeeatO$ja1vBkT&WxyJ)>L;XFI2@c7u6E_!?d`$f0m;T)#Ptpy?;Gl z?_$7%pd49})3ni}p_mpIhdVqvPyB`h*yOcmN5%}ht&iJbD-q9SWu-=S%t{Sw zX%fj{T7uen?L_#HKDd9II4o+M>gw;r5e3v$6FuGC#%?kk&trJ6)l4!ViMB3je`yi^r;p1*Swizj&sSRH|N(qo?Pos;{1)%H8A? z=up2Z!{x269S6G%vp>5Vmc^v88D>{4z2LbrUuh zz&JOrT6)O7)UVcb8UnnUc=32YM<6e=n#O&vxW2!| z=*d=AOWCaAmDL#oHuZR1or2m@ZR^q#_QHC-lM%0x`+=S5&5LmOknSPdvO2__=Dq>s zXo_3`XU*?G$S>qh{m`t=!8(`fihw>TM$44R>3b`ZZ<`!l=u#K72-7JhqWRA(7Wm5; zYWT17N>txh=9Q@W*|+$K1+~Ch=i?!nz&7VYd|U$?`!d(4?O|&xa=lhFfnOEL-DV1_ zGF9{!ay6c?Sz&~kGb-`YYHFU<-J{Fpk_|P8J-uWDEUBWu$SDCVX-jcDtJ_0)qc4`I zti?JWwowQOKnPpzo3Jim?i4&C$vv4UQ|Xn`7t&%Sv#qm9y(Fh^P9PjJy$Y`>)ioM|6k#^|H}rSHp`aZ{<*7cORui$lSOZt!)tp9Io)E}l z=|&{EXYUKWDaQrWLYVSUFZSyd>n;bJ=|+IZ77Dju*R6`IracnLH#Q8Tf9wl_5xHZ*lfgiP&d zwHlgjNRA;488*BBv^aaVGqaRpAA7s%?x`s7kL?b2-k=iM8Z zg9COn^@xrM*Cpd2VqPxRBprF88OZT?3xeZzuClLytj($yOU|xCjr>N4M8fwjH{AJw zgKZ)SKdpw1${q*hP|1xY_C)*?=Na|J?-qyqy|QD)PPJ*niAy#ZZ(exsCO3BUN`xK_ zK8yoAywXT&o!w`YAcqS`MQK;QOA<NUo&fUU4^7R$GCj5|lqe$DGHqs=aA znZUpE&DpOB_M5PpVmbBkw>!pPvFnQO#GzCb(4S*r^G(45sog@&@;PUY+hqe;-2PE! ztzT|#>+Pc%-Pui0xG&CDn4s+oQv<@)5!Hpv$Wd14a?L^8aAajwfKa{A%z8cHDoz!Y zLAOwj!^2-em$jA@GVUW-ys_=e@uu{a zErxkfZnpbMSY-TtgH8!5fxfjPx5MDCA%iii|M7EH{gXgB;_o66aMR&V;qOw1qM1mr6FGTQQOb?VHm9Jt%K#|dKXRC967 z*&Th=HCUT`9zEimyh*%Q#j$p_R!ofgSzPiJ(HG`g8r=01WjnD8=E{}@LFce|QWYKF zn~JL)fe8iM4BiEffvQ@6Iw4(`VS{b8ciC3>y#{M?#bvgNVWuKTk*v$wR_Fu~*6Zq? 
zbQ^B0B-0!R@W&FVI^hmdhOIidzxQpK9j&f(*6n`1cf#sjIwSwO7|yd)>i}9UI2r?U z)2X=`X{Aa(f0>cn#Ap+PPGE>RCE+T)UDtA2h}A_SS{7xONVK<9owZ*`nxNMIRopm2?7);DX#z;VI%TRxFl~tdwOh6s619*1Uq?l+f|3XEc9-f@* zIA-$kD(&(G3tI33!{0jq;Ok?}cxWT)}!B^zp?Klr}#`GojX_DT0N z$tNl4)u3f7PBvPNw~!q^?u+!0yDp2IFRu&CLJO$u*1UZDO&Rt3f?L z9%0j|mUXCEZUf_X(uVGyEm?(6dE{E>}IJOe~9Z67T6-y`>Mz@VZ0-=9`r=S;`VBS(}GXSmhk+ zo61_P3|FbwqI2 zu298r46LXyl>~;SWrTJOMk`y)nH1^CWqKOrjJ}9w^GxtF)5u2ZmZjVVtVgBlaef`c?R>wv2j2zv2`_-1EW1uyH7j>r z%5g=QTKtDxw?n1fnEy3 z2?ra286ex)3DqLUR$Q*<9~LcaR`6@*_DRl&jxveSTH+MfoWbX)LxIy2`04o8ZW*(Z zXL0(9+kpWiy&WidkGbOh{jXa25@tV?chfwpA17{U7=GGYB2KTv&C@7vc2Gm?7Pfub z<1NF^NX|+3Tnbpp)IomYfJ#3{AA6(o)bG&04sUZ0CaV>jh@-ioWMHyE+ivY`71QKa zjz=7~)vIG~CY@UEdi2H{QoXRj+j|o&I2M&sO-$XuSimKkPK^38EiHZlAv!evm%$Yo zC)1c#zoOD;3tmiojGcqKdsSRrAx^oBl_H;c=M19r{L|lns{43qG4}9 z0KC3##A0qwuIa)FRbPKkS1mL==4%?tKM?7r|v=mn1|V1L@$ z!*leoSu>mgyA9&GOHSLUP2IJ{*yXSR@DBrM_lN{GtC-+nSu@{U2_80&_pHTlw+#OT zWo9`^BUl|WXrs%00jTxiHAgvCa>Rq@y2i{pVHrViEtvO2FD`oK`aQ}fwg{ms|3vpq z`c}ZNx1i(9kzK_6E@4*qYO&3B1}6oyy;2vP;1(!&QaLH-STYdG)YzOpF`9iupU822 zu`WYj4n4))k`nZUG_ZP)W>T7;k&rtI#Yr&?G77ra#3(%;AERu^>|k*crs-T zXp6dik$&Uxp428>NtTekJ?f88o@y0w`eN?b8`F!K_Wl#j7t`Tqm_Mz5c@h21s-VT4 zgvw4hd*7y3_nQ8V2yR5m*OOptG=pRN=u7#Bd(d|+9`Ej%C#I|gczKJ2@`xqR#{aTH zPAq8d;mL&OoY<$ZYY|T`;}VCvl!)-!*Cf|3H!hovd!k+a$+pd#L4&uc^x&rF#Mwf# zgq)NP$hzL;5cXW469PO^`NpIm2>e7;#M zg2=+=WEdZC&1kM0CK&=%uFB3cS7l{s1CO|S+-v4}P}oN7mS?vi)^)Nyg#e|^8=Dtz z%m#ljipAs}*aR056Q3k&e$CZAWQ8vf!^D_mJ7xc+cVCSUaH%S@pNYtV2yHp!P_!{9ffVGy2E^f**Gb=9&AY3nWg1%R3#714y-wVWO z7)F!m-k9#`ft-<75e8lFpitsJ6vuCMLaT~;P$%kvojUm&Hz1XZ63e19nyo^4rq!41 z#w@oB;fsi6p|`ug2Z2F({W9NsC($F5aPC=Pe^s1e#0Zth5mMrUU&a}jPh12RXvchC z9%n8i!NWC&b1_PVe`?7MPTx_T!%dYy-H}X$FVrn{>2;8RP0>dhrIy?;_kxVG+EA_& zp(0x+ z!1>oV--3gcK3Bmhylumlje#X;b#k0D3f!W~?%U-(Y{^Y{8x>wx(sZI2bzsvl|1p5- z+&VGI#KqMbF!p-pKWQYp&x`P^@oWE9!H)2yx1`P_(y{W=f0Ih}R=r6@R-&?N^ZCCt!r=W&5RY`Rf_B|@I<(RwS9m8ekVUBxM&cltIk zA|%{%0%K%ylx=gwyH#Q9Cd9Q7;HxDrp-;7@oHOylZ3G%4R9&j 
z%7Rfe(HPjv7I3pOQdO2d`$nWvb(H)XkEwI1D_TxXqGv)e^<#KF?H?^r$RP@&!Ar}B zTGIR)rL3KS;FrZ|4t8@}Y7_h0qp&WUGs3dbe`Fxq+n{z4E!MTb>|o- z$`M9RT#z#5pnn|5p;V!{E#nQ3)GTnMkbAIG9oLvV^KqPjmys^-^53#FYe|7y ztR~LGRxd$;(IggT^oy^V@pW)9K%)nZ%%O1%3(j603fe0sW}(bwyJIoSRPMXR5U6;| z2Du|QIdUqy)hcO~hz><=VoDYeMFox&$`qY}6HL{PpaT|R&=->_o_woAoc6;>YV(jO z5KFD@<*R2+toQgV_2_THDW-(_nJYIWQN)6B9_Q#1dM(lHgv?7}5YIMo`EW*#tG}~M zY_?>FyjFeN+dUx{ikS^Yk_37q+M1wN12NT@Q)R0FWH$%4KewT3D$>p;CUB@!vUf3O zd#U=O6SB@Vcm^k64WmctvR)6fOggnew=%k0v)Wyhg)2KZwx%tMS7*3IDqZ)8h^1dT zBN6z{4a9;5<`AtiXNxFT8*9pY2jpZ(MHPm5{5mjz3kggU5y$tc;^5erbVHIW)lK#V zoloGe!>~Qu!}MVdM~;mVUSGO-dHJ@#u|Sy_6c&)JOqb-q&~S!!al@D^O>*T@iD@@K zImr=9vOAvK^}ep5)br)!YQ$>Wl)}m$i17lcI`N;geeiu{6n*{dTd;h`(1o_{+aZc& zP{j_&+iXjUQk-3_tbHx8bvKL0{8NtaNv}Cx%#oOsH9d${-?1T`#A$tZPq})*F`>3| zWa2yRFK&!kBWKuFKJdCjpTRXdX~Rzst<5s1xYpa(Y4=0!$=<8F2Cjap#!5J+g`M52 zQ=QG--TIDgnw`uRG{=WKy9fup#>;+pqT_Oql~ASzbm8~4?CeZ`?>ZyH6U(6lO3x@| z2db4jCgL&kD%FTM>ee(_#7%3z2r1{tK-9afpAjKuz~Y!X*FJn=AUptAs9irWS7F^L zZj@UmfHJj{oQ4mS0v{$k@+M&$nbpGGEr*?0lVWs*ye{du$+fS><9*%YAKR{nGv&lV zHH)D6`AVz|%dB@N!&a&lCs*CV>&-L$$u+neB3hJaP{z7e3;ANKD~nQH?Qr31_DKz_ ziT1vR?;2&4xh!Vl@3!MANN>DOqUt7lQyay5P@MNAyGimoH|5Tj%Q~P-bZx%_x_tII zpc_9^8TLibWxrL!*;KoxhH?g$t4bNcRE)~b8*A>knq0fRf93`ztr&)JuI^Ky@pVvZ zwd2vc#M;mHL_|L06s*uD8@U8MjivVz@%1^#2p&Ub)&-2eN z!)01D-71ziFlB<3S^-%J>}Uy99m`;tYrEQt1vqdgJUH3|<#h!^UiBv8Wib;6IoobZ zo2^G+UP~8b-Q&f--#5d>a9zxF#jmnrHc@7D0LufTXrduX(DmHW>z|+n>!5};NEy2w zDMzsTMW>lNdD^+e69HQ+bPnA2jbkA0b@Gjya<6mkOZiox5Mj7H8u=>FGR2u23d5W? 
zqF?;u8=E=AwZQphdt!A$Zu&)Ee$nGf{5wZNVa#PHj1rWJ=pZ#&xn16Ua(CpHA-Bzp zv%A#U6Y=!KFcR=WDdSC3TyQP-?3K2F;i^3C%rCnF;@GyRnUcYPE|@KPlcl#kgL`w` z5}Z23sOSt7b9)7fiwqQzP(EE*L`7C9<>ei}p{vwmBey&Kf-LtkY8i(XR+vP%NQK-d z<$Jj0)IV!U2noM3(I_kZ(O1gUxN^j)oN5$u2{N-(#7TBf7`iF;3jrBKRtG6=b8JmO z#qQ!KlnBR}FyAnpeNI1K)+}+&u}w?jT6Zs^Pz44myo$|6&hM#L?&|4pZ0ThiGEpy( zPE-Kj1+}2!oIyzCs%FJ>r_VvezSNSIwTa{76)}R?5Cnxh z%yMm3rK+yZskR*(PB2$f)nNDF`hagu+qxIe7p!K4@jPI?1ci2|e)3r<#q7zsY_L=a+_U65Zji>v3z@|$T!xOE|VQw zb}2i#;w|zbsI!B7_L}YWrVqO-JlIyf0+QoFIj!y}^K`ma>k7zCJ=^j|pHl#9G zd`o|G(&}uO2E6D1Qeq-ZgM6N_k}ak}Tki7V>MSLype6VLxr{8bX6xZix>@xv5<oI7cvPLw{V=yQev8XmhCqXR3t(> zGOr9MN5(+eKUz03`5Kt9MDPQtF06}%3Em%^(d*{rO=mx)I2%_ghJB_5&#m*DMh z4dR_AbrdW;GRnjWFeicY)BG{*4e$n_#>yC=ccQ+3YdJXkbkvxreCL z4IzQ6&4rDQuC;P5Yc)4fp)mCtlJ-Sc47JkJgqUkl0beBKjOhma6T|Ha*QR6~fttm# zE^l=0=-SwATx44MW+&%#X_PcbR@_{-3fxwRkC0Ub?O2{Lk(kU`VN!gV0>e|gFwbF| zSXWBcwcg|V@MWXTv#RqbfjfCG6=lL%*@W0et2Gl-dmbn)6vzQT%E>-G>zDx?Rb|qBv3p# z@up}i;d3oBb*u6E?e%b=RHNj?waIceGu8kb*ad`bX0~SikRcr~Y(iuPEN5leHj7zf zkde=hDf61HNnF2eD~GS6MX8a+z78q(Qe8E4%)Xxf-j0ptHI2zWyfio6XV$c>x&>_J z$=39jSrNBx0$UHTe6}QGY1LH#!y)8+wc3Ba+O%;w7Wh~)UY}f*f&r&XRGjXf)Y^2D z=zS-ubcnYg%x>o@2)b^>;lw`Kp0O`EyAHkY7gM;6W@7guU{_*e;^fZduzw{_gaN35 z=etcO%Gz#J04)D4$&Gy`K-r_9;v~_eZ&3UVit(Q>{69{ji_s*EV}*#E?czsnUjk6xa0)7Z!IuCO ztZUYZBfCAxas76OXmRI#)){s}pym23bL$H0Qm_tjjmYR%U#p!}2>VT|wr{nx3{(&w ztk^;C7gl?WyAPHop0DLfkm`8SL}gwB_wyvYM&;%@PAG!{1^J*ly2Zd@rzZ(h{hiIcOasUNepoYlRq=|)CX?gbVGztRPueE)s2DOEN`d5!pH zKsFGUwP2Zg$n}~RSVQqkP_w=iVf(PIpcusgIke4AoM}we?v+NfrmF35$H?wbvK_WH zd2iiJwK`0B(Ubj}Ye2Zl&4Cwir=4AXLp{N8wHJjgc(ar*5a{UY=o589I?FDDxt5Ww z=~DNA0u5&_jQSK2C|FzPwd$7<*;<6Wl+{mXN9AO%v;p~`2a{#4yotcdn``FCQ6jb` z9)%MX)?UV5waiMP;Wdb=4LXFkjt@7V?;NUvqQ*#6>DWnbDr}RX=vz7HFEVj8l3qCT1bj1_IP>`|*N z7zd@lHPEjRhaX~Dr8r`Rsh5p=2i`gJf9+jMY$RD$HGdETGk}l~AZVcx0wgR)X8I?A z)b(+9d+KME-8~})6rD~-*;$pDk;+WF$|I154I4JFKp=LA4X}a52)469njN#8y#2!Zzi91#>u(DbgG)xTVlNSe%Oh zb8>g3e1hGJ?(}HVA9g3#53mDS@n$A^S!|u{o(ZZ9;2QCDIgNY4=WZA{C4Bi7kgOk9 z8`9f`Y>IIrUnr606}-( 
z9bc655(sm{E^&`Cz_IzePv6fE-pk*4e(%Xa{@%mq#4~O8qSIiRssj0Mk#s%=g()5uQq&Mxn1NYVECY;1COY$>mZw> zX2}{7mVd5K$fTmf7dPym-;2C@mRHopU;HlBhGoc_4MIglz zt(a!{%`+>=?=-tjUZBntA4U+`Id?qn0m)FYO{YUbGBGp3D08x?soqodcGbGy4@JEt zV@u3FG3KzVh59=0K=ejS0*fkaJkBL-Z>192(EQ09Yxd(xo?I9Ls#vQRAo6$0Te6J|1>f;IRvV*rr-? zbM{R(J-f+9s=W6#m;It}(FXL#b{K{v@j;?3iHE{&kOH=V3n7NdOp!Pt{AR%+p`M|d z<&6@;)w?&a4Q+(lguUH4z7jT={0sj;!SjuBRe?hr;bQ|$9)lyA4%F;BHSQm2;&op~ zwmZEZ^)4pm2!S9j1~d%_l_7TeWMdZEFezh>*_AYwl{K1hnbw%ZLZ(H%GJmf-y^x?f zCCtx$Y_BnaX!4rv@}_;(S5qTv94^S0Y(gWbV>F?$0o3~OuR~Brr27(G3wV5uc%Z4p z28#Rc6jQgO^3tU(c}7(HZ1-d;B%jDi|3Fu|vv+Tp!&Vn2?LA+gLTgh5Wh(OS3&fQd zz~e4YV?o1U)3bqRJS#`ZNMy=0LMZAPrw|6q3w?J7U*|k2&??{@w z+lfb@0N)eS(wcc`w*9;qmy_AO(b+?|b7w3s4T@Ps?N0g9LZL-Y`oF*`Z_g(kH?3PV zOEe>}&wzK@&hx<-eQb)-SCoCOGKC4}@hmB5X5DC-1Bb^MS|pBnYPQe7lCaww!kSB9 zl7PW}a^||MxkNp~t`dS&IJoKmw|{27mc)syPe*A^Ura~~D=#$UtXHO<&X5ZTFtCa# zE~qw)ZJeH>aN+srGUx8cC|rmVC&-k^`);SF!y{l@Q2rsMB}gPR9ooyW@=!}iIcl;z zV_*Yh9jkebIJCQ}#=dT>Q+&zk6w*zTElt}@nnJ&1nB8R|7pTWJaA6^yICxa3TRC)5 zOfdY1VUZOV%%?@mB$YI9@e+g@!{-I*x+=k^kwe*})V1Sm57}J#w7i@k+)K6ATy{1y z6&8afX;&+h)RuLLojeRmImMb1mk1IsPbBL$vhwOC`s(5KGSv-RIc2_9_@iPnDs;nv zJID=mk<`A{I58P4+oj$Y@&_{_)+MBnMPL>dxl0+Zyc*WZq8U|Ar&}@!oG#1mX2@6M zRE6~;?Otkg0#$cmdN;CK!hs4c)w&J?Asy7J0_8WuDCTO1ivqm3Dn-Tbj_a7>E()$t z2XMMuLG;nk$+)e8#Cd;oOlEf*0u(h3yb-My`$PyYFJ@)TC>S zmE&|GU{~^Ztg_tH9yRZVth=Bjz6sHJyZOyA(UkLb;BjQq80r%~D96_*_PP=~qR(wr zJ1^Bj5_lqV#d;8p%fWZdVvVEvxw3G|$x(?^?qc$8F}pWIFx2VgteA?l#}dW3q2Mi} zAW%^56sj2V<^o-;l|M#0e?Be;{oeIKt75R(*3--3X))oza0eO-eIJ0(FvU zN3yEK6xqkVZ)3H>9oiSv9ot4@`aOx>8l9hBzBufmwOJ1j&S#Tue>Oer@%El)uz1V9 z-qSB}vVQf!c5hI0Cq#N$`C&Qbzgc%yy#4Ou!}|}u@|C>3*|9}n=ywMpB)DkA9d?k@ zw$sBtkuc!#E@-J#RJpN(y{q_Mj?=pQu zo#q_#tka(!U!KnJ@!5+5#J_-tY~@GCC3}}fe=)MEwKMKcrv+o>&^wOrXgbCWj%I^v znOJFJJHtGqLg9<<g+b(lcnX-F{x2_orAT z2pzh@AiO^|H8ib2O0bZECf zy?@ysaHPx06G`OUKqr{csq>V#Hd~w9c-`6H=xP|WU~^x`V!wFPE5`4hlo6bt7$6P%V?Zms_Pxtlc|Pf$xyU-BB#uJX(X3>SL@*kdw#pK^$)-b@Hv~9g zFcuF^rJ8Z}un#5$(tlK3`(YEkztvGcj1R2UwLR%hG`=?5((`O3Dh?~HB)3${UP-wk 
zto1oR+f6U7(_>+WW6CK#x~KiC3>9}EUqwRaGqea3imZfSHN_2ZhF-9FTHvIw@W7Z4 z+MAtx-hW!oNO_W4gmptPB10&YunQJ@9~;PJTjvZ`&QH0S99E?F(0ExT1W&*2`eOoK zy40Yf{lv}-b+xFz_+)Z%CP(wAqm*U&$t5}_A)SYhhb+B9@-rV1U?lk~&onHSZe=^+ zXDOiCR;#u-fO@g`P8qU2lS)~pWA;KsR5)nf(~apx>Go;wq8N5tTf^=vaJOYR6JvEQjF1`*~GRRr_zhZudP~QQ9NM*=BCwl#at`??_|P~%|O?;XB8csxWx zy2%z}kmyN5nOPHMdSw=LmGE8>Nv)lkl>7(7U~Pq7FDElbr3EM|C=;|T#VgQ62lc5C z@oUH%3vF2A4=yKE@LUetv2K7ac?t2}=yG=@1*u7e^+S_ZX4_Wa@MwZ@E=Bd~c%4z) zUn+HtnTN)`6MD{YZatjKK04Og}S4Gq#gtxQ22A%!K1z zK-hh3iy2dNN72j_Q?qGDYBW9BH5kGa3@%pFsIr6j0)eo)9d}TF6kmE`b#$bE*E?!}q#{NJb3w=_I zRN=n>Q^dFsbqjQwa^R2*!#P>iWUV}DEFR1i3y=v9s<}6 z;i;WN#rNh@v7BnAVmz*qib=sr$q;FJ7y(S3zy0YHGa>S_Zz zK6t2~5;cSBd?*Tj-2PfIDG@*5EP-ck-UG$6(!s3oCAGADR@YR4bfrb+TEj~TOw+wI zSsw0+!OTY|ps15Ass={xbtHy(#hG>orZBm0M6Vg`In|C7sm?km-qX8ya8dMLy#q4j zng{8=#6P}=n~AqV^DTTDA9a%fry@{|H69|=?zVDiwV`kzoZ_TZrA|bFXs~7$!=^0I zM_BQ1`m;ch-rV&q4#yyD@uv14Dr2_u;OKpiRB;|UQ&c7up>ftdibib%Yj04DQgXmk zI(a#swozhFAz4vdO&RqAtzNW_ux^2Bblkkj5h}1)g9H{E!~AhC&#fQmZ`6`HbU0jz zu|jxb0I+DTse+P$O@b1nEs%u@*X}7MFqjz|3L|@AXof0=y1Fyq>UA6<>lo}Nw@jrD zEGt*WQcGBR1Z#9T7)Un`LCY96r|B4gvk!F;^GZ=m%Li};j5dWUrq@7Xr&jCr1xiJ{1}?%?$-R9=JyB)nMshiX9J8GnOjK4> zJ1JH6aIC8`Yitb{&-Rq%Em+oKBrtKdOYD%v;rb&WH?(A=MG1Lvh|OM^PhlU-qp%Qa zm{-@*Z8VNX?upGk>S7D^XX7rKYaChm%Jbp{wMJ(-dR@^OjoCg*77tOhm{!B6IEUqR zMZsXR;SkH6S^2mG1({;l&hKAI`A@~#AyVPA%~;Y zNgeR%ngPgBA3*pYmns9Wcz_2B%SqT}0%3#W;f~+f1cHkj%2R0nf#)J9j z|NP#bHI&9=e*IfOEkB`M=nuiQ7{;D(|m_ny~T7l1lP3 zctr(L!E^-?_EDKs5EJ9aChJ#q;yuU|ynqgjh13?BO_r}<30a%COdnD%FV4=T6l$=_ zeAAqe*b}c7M^rd}`aqJM;Ya}Ve4AIdtlQ9I5hL?j}@%si%6-f>SPGfbJwoyd$Ye!h zA3pIX2n2w)dve*E9Sj?QolpgaxpO|dP!%{qf%tw>yR#o%Mcs2&$%0===BPrq;n9R& zHOr9#WQAhIyVD80JI7LY9w>NNBp_xzI9X>NPEDkc%rf}1r^T_L?i8r&50Gy0+q5{d zHVmTv`6ySPIaYBwQThG=LVi}zNeL7Kw!-XMC#O+;NFJTF8@AVAioE9q$qfQXc1GP{ z5#T`Yk6v>#hv&r1hogr(Jius9Zl7gdHQgb3K&0XSF zoXUgPkAjtfs{pptDivbAGHF{j*ml(FvSPzYH6;PRnMrW?&d05<%qjSi<sA*6z*aCJ2;Z9KKz8(#vGnZ+@*^0=_DdOPr?F#~_cX`0+x-nkbf{%J|zljz}U 
z3~7rw06^u7EMpsr5gh=!4|po2unt5}$&8@kb!dqjmlE!U@#kc%At4lgcAs(7tx;Ky z!&IWAL0j-iSbIdHQ20#6J75jx-xW#&^urdlsisyenDihwh0SQxp%Th(wRY0! zHNL>kA=6iR@uujhXIomdKKrFn=%TNl!_*66w!%Omq;*{nPZ_O-fDdJ7t&H_qbBqa` zdn3dtPjdiRPS>Iw&f2PuK0AFG<4#-=m8azJGwH}NVd2(Gwp&$1%cs{v+AoV%BbT4K z$+33gv{)=9lBQ%0Y|rR+KWz54OMyBF1C-A9f8#<3)$biGsG`>e4r@eRk}74wLSxL( zDZ09CmlnWC4PFBP2CS+22B_F8C^!)bWqa(%QgL^O9l?Z%>9u>+N14}d6F`hXu`a7s z(K@;Ykpe$L)w!AR*GF8x!9H2r+IL~{+eoLpx02YLS8TZL{z@%a}p|cT1 z(-5gjn8^h@5oorY!ljBVEneF5RD2ulc{#I!H7_LtAdSY@L5JNvjy8G~zKr_VLRO)A zDZNe@z|d5ZNBFc*Sgz&<7d(QUM4$@>$dfO)PP12S=zVHK+;D%MNA3Q6kD8onSR4Ad zMvof3Y2Kq|L`TBPA8M>d-o1PyyBxvaX7L*?s3HA1Og2#MDQ*>L%we~NfwiL@h*4;N8!=G)M2 z*;0$trN$@;QI}StV)*Zk0S5~pk(qIDrck};Pc9}p1TkQrX{aLw)qyVs4d+K-D)T-B zYzma~zU{1My^Ai*cAb|E-Zo83a(sgvK_kc%TXb5Z1jMSoL#F~=BLNt>`?!CK7`tnu z5nWTd3rZ+hg#^Zz*jw&O3t%S3zc92teeoe?o4beHhud{e{Gf1Jia)0AY)yj`g>sNe zI6~+SOdUqM^s6Y1=Q<14=xvRdf+_oqEG&C49$O>}4R5d@3lM%b`2dFOiz?=jz!T%H zx&z-!+_4geQ-<^j*;wl;ye()3PT>_AtW5N558BkU57o2~7c_&aRgKs+jGgqBnwYH} z@tEb8qv_?CE?k6T&p^nCIPD%FNQR?9e{y?*bOeU3$mNN=0;7zyvj1nwp2c9*I!nu~ zQH>Q#Qw+reMu}Vr9DJ$;zrpp=9Q@`1$Q6+#YVHbCRn(u4+RmY83!=Dox89Y&EC+s^BY9YZRCe13^~vgfRiT=2IZ(YA+s>PX;LBU z9K4|&ab7qK+id5?T(?j)g7QEQzxeMl_LWZrst%O^YCOU^0mAo&V=tLpY>9BwNbz;@ z0o-HdXhaPszr8O4^H@0}xFVZ2HvYJ=8g6tKjw8Qg9KCPpp5iq}$0%zBmhmJXc1M7d z!KuO>lF^lCDE7nPBt4CY4ayuabk046(E{Z-%lgxMR|t!#1j|C{ZnBX~EDWQR+3lFb zaWEw=s)bGpmAIKl)3QwE0tjj=qTS@T?%@><<-l#5MDik4W;M*(ay~iCn zrAiWUWUF>6vjPOZt;mi+SF-8e?4mzyNWJq0b93m(TBGQ3?eyNSr(Ohq%R1U5W zU?gzw3{|L4$3VopGpKoV6UNkz>#|jdohhvxo7;?jqw$DEk0yn|#ZW)Yh!))%&dr+` zmkyr@eDTtOYI2tWmPvQl$H|1eX<3@#A*^Lb&4G8svFp{;036g9G!&tlGWCNlDaSw} zM!>u&wosnArEcK(X+cPQMHt4DK`ftMDBsL2Vfetg;ozIl^bn)Ro1J1?I;$|$05~FR zMAh8P1I3{iRa^suIt=xaX$rW<$|cy8(&AOXUQ{s*9>SN-DTO>}!By`e!N(6rSSLY} z9ISkBS9+=V_=KZoTk18H%u9IvED6(7R8w%aBV)y>7f51xp$v{Ci{kBW-@k|DB~&>b zDX}i=kik}uDxVsGvMku=-7j}1BUriMeprZp<7LT`$dDmAapz$nf_>~gL?TR~(OGta zfFZg4T?B85sSWDPxCPNNAP!>+76(8^BNhkcIg+Wxo@(&cvag-j=&Lj0to43kB7A{L 
zBz4sBwbxNIjY(g!bUH`}t*OqBo(I&$a5-Z=0Znhq{<3vDh9iPr!XVLJGO8DDw6d3i zoTJwf+*|8c%EQb*D~xU>OP*v;tkTH!P*K&nd<5i5lLOmPE-*S$At?`kfea=;@Xk1( zGz&cyjzK#Ng)Ss$Z)_rfB}h|m_|3~f!DZfDkz3$|YtBLG%*Pt#R5R9y$IZnWeGZE0 zP3&B#D<@oKeaEEZ^rXASfJU#L#=AB{X#g+_&CbZDvI))Mb*6Dm&czkY35I=L!^stkM#)D~WY+ZkyoMFTux< znA!Mfl)YQ!=`VwR0&|R09I%~vel_ebNpo=S@w&Si*r>Ui0oM1=FUw1U>SOgda>rdLkX0F64QZda-92g3QX`z>P)H=X6Xa@Xh+&Fu zxXyqImI`w8*$Z-f=5F9~1bKVDAeU3kf*g-)6yz$zT{00!o4GwV#1)@a;8i9yNrV<6;d30<>lt z?J5jY^t&!*xrc_N;+$R%$MB6!kbz8%g#vJGcqw5pN%DfzhJ}g+8EW^+*V#5>aZvTe z`}4#`3S39|`H;mBkgqk%B?N-2B2WSERLorz6{#ot^MNo-N&PVXQeirF<% z+&VDcvVC6@9>99;&}o@+s6@pba3NQsr`75X|fP zTjV!7HM*}H1(n7(b{NW|CbacTC_jpJX1~PpNTdj|yq;muc{<|jPr75x6~oWBFd%k- zb5m|es0pCXIT0od>GA>tpDC%D4Zr!~f@hkg#R6er5hlX*`)@E7K29^jX*qNuLgk%c zp|EYxtZi`$OJ1DWOwwF`sCDAPRu~PK?x9V{X9z$P4t#bt@fx5M?W%}6)PC%CQFt3{ z1D75u?6p?;$m5otlex-v+fpV++#7+W?-!lRwT>i9%IYJP9~x-HL>mzBeZhX2Oy1h^ zA5R`1lLbzvs6fOAIKif1BUU{+P$5E`JcT~-y1RnabmHok-^0=3N_^r{JVOntQYr$- zx?NI$5ZXt7c5npQRpsu$0j|{zp&uoT2Zo7oic}(jRj-g@A{v4)&uj;dXBNJ(vUo1a zN3K|)S&>RLC|Rc#?M4Wt7%*_juTm*e)c+thv6W3P&(9GymIFYnGR-uKNctL9w7~n7 zCgjYvrv2wAtu{e@KiZu&^I+`F;w?`ZH7|V}N(WH_jeZpBg4m9YU zfER8f%pgjI!84d?6qqN93pu@Pea<0tguV?}V{kac>m;rV?Fk`0rG(IF_*parvRfDe z`T3hS+)if7NGoL0chYKNkL{ycUJepIG&qC{~vwl~lhWBXy!3J0K(c$||j;vb_UOF&si?_Z~hju8IMQz#uJ{ z^5FD3@1MORasjFG- zoZ+bLr7rGR1KFdo9eIqL3KpqWa}+MGukP7erj48|<63O1;&!x~yQLQO;N)me)2Yku zjY@2uan0PMJ)Z|eEhm_LBr>D-7M{J>jeBwu>ch7#XClpLWpp>1!?ZZFuu+*)$X3lt z-_oq~LH||Jkd+*P{Z4t1TEv2rQFNn^JHal}_Ji81Skm|bR+0O;Da(vc^_E{ttv;=m ztz)0C&i@la+AO8Y8euO3^rgl<*RT1ftH3N5nJ=e?*NnY5D#rjNQ7ju`#DV@1TMB@8 z(7mn@i}bcf2*spEcxFbp#HJmkU6plcF-a-&n^)1%DTY3?8>qXO`UgV&bUwU2E4)Lwp@>4Fxo#wz)Zr#X2OLq5C^)qmYgVtX zI!k1+dda9Zhl8sA4`)Hu$2DBoRaq7UaiqmBAtM`tajE?)h|tQEN8!3Y@<|{g=cTzr zk-J%@yBoGiAwdoeI~x-&hbt)M{yp za)5WQWiQ$gZ z$b5XAYt@E;gf^M?sF;f{`;4McSm;*lrc$SNp|xiI^|+L4#Gj^S3bOjFP78~yF#)LR}`K&V23qbhH{ zhq5aq41sow_xW%(-tng>Xxt#Ww9aM#zICFcJqNwXKqtzFDPAU@#huWg_a-Fx6bh^< z^2Q4~zSE?xroaUGS=V$D(wLF@s(--+97 
zUrTSjhWi(;%V9nR63A^a;8~WEx}vwp4e(eEE(?cq8|k2-YeJZb3=|CzMHc(09cjsv zNyL)OeN_>@iJhjnC~OQSoP@Znn^G;;AQ7{EEW9VR)kc{vBqK_&k+@W+ng=SOl2NLs z6CGYkQiE~&o;esyF%D>VH=^BRgf#shwtK`V$tC?5v9dkXhzJBFsWoAgBKa3>GD01h zgG<&I2cFlIk!p~muC=!sCLLp6XHY=fh-JE z{)B!+$rUs?nQ=CX=hIEvbUPxmx}^MwhqCSy)(SG$_#OSj;o$g72(#D^O09&#Tt-0O z7TI^uX`DsHmW28ej2`(b55~!p9JLvMgtFB-6&^o<<1{s=FjKL`DnpvETXdx!LLg?1 zV-|{;CdCYrmkSWvWYD2L+7f3_$_qI^>PJR_OrZ|9wGzC|NRZ4??M8*aty?x_o*lf6 zva5xQ2nlH5CQ2D5m~mar@?T0AER8GVTyoPvE5fi$4Oc&zVL^2PfhiaAC($ga`N4W^ zBh+p7NN7?!MhbtZZjZdb5u&l1ETdCeMoEMOs1jauONvvb$&~2%KG&OSAo#vbJdHEeh396e#leaz6zXOBLoCG$`R=T&>M?$ul?5 zOVvD_tC>6PqbUP8i5J{Q$$nSQt6on)d59OqqOGLRit->K%zfGKEVjIa+(IVZ+Eshc zVyd_~2)yc=@*zSS!Cg%Ami@LQQ6jMlWx45dLx=P72d8q;lu^?w?sSGYScb^9xFKDI zVPK~_>_gi{h59|Kzo)C~Qo#10>jtl7t#zp`8tqdM3QJd`KN!e`4e&TsU|-cm3i9CK-Etwx zO%C`bz34NTQxo#srebL#dTYhf^0qpgSnITyplTS*j+)Y?B9MHFS@G|WV9@Z3w;1lG zLV|XnAp}O`G3oK$51)o|o>!wCHVYBJBWQk(tuR)g{Hm#%6JmznGw2 zXnt{qr`b2W21BD^qA1?SA{d*q)9F;7op&iW^hx%aPh(^7U}<<4J7M7|<&5pXFY+>Z zLa)UBf=DL0^`D=^6MhK)zSlkJyY7P>-*Ug(;fL*mJ8t3_-oNpE?u0vjF}eQxlkfNN z{l7l?sI>S0io4H_U-=Qe!R|}n{|Kjl6#wl0uYXRjx8qT!-+usy@4-L2|7Y~)jbj}7 zO1uBtzo{qe_%9Q8p*!#y;h(SK%SS=`zwvi^){Z~#8gA{k_t^0Qm;ZNg|F^%P_uKJ9 z_d>Rx&#?CERT~?x-S76it-seZcHG$4@AiJX-}?Xi?*99F*~Zqtx%=JmLlOx7@cy60 zzx4h;#BXfk58|Kv`GRY(JG$HLtUSkm!G}Mjg2q3uXg<7HF>YR(yNy{y$Id|08$59p7H^{=ZD_|8_^;Z^z&GPe;(M9oBAZ)?X+0f9}^f(;FLh z{4x1f{b%!Nzx*ve@QpVA8(Vt69Y23=SL*St`0K_$;Cp!g7yeZ5x8pZ{D!w6o|34=8 zf9r4bemjn*@%=_W_Uk_-_wW6JQj#5i`=C}0zH(%0YI^OB~ z{}uQCIPb?l{z>ouP=2lcv-|D%A2{`S{ImPNUtM2Fs!%>A_zxJFzN#6hUq2BlP-_he6PguARf2`eh z{As*DZ2!hs@7wrLk9&@P>)rMHe<8X5?Q6a7?Y801ei7%{7VG~vzoz&7 z#Y;W@yLHsv_m(@_;aA=Jx9kUZwEKTO`Q4fm9+M%GJHGAS|E=HE2mB%qzl=uOxEcTD p@L1dRO8x)O{#D!md*8G->F@Ekjg6m}bN^5OmAm_T_#b#)A{_t# From 174e10f1a6ae5ff1d8701b8b7d81745564c09555 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 23 Jan 2026 10:55:25 +0000 Subject: [PATCH 090/194] [Device] Rename deivce PyTorchSimDevice2 to PyTorchSimDevice --- Dockerfile | 2 +- 
.../CMakeLists.txt | 0 {PyTorchSimDevice2 => PyTorchSimDevice}/README.md | 0 .../cmake/TorchPythonTargets.cmake | 0 .../csrc/CMakeLists.txt | 0 .../csrc/amp/OpenRegAmp.h | 0 .../csrc/amp/auto_cast_mode.cpp | 0 .../csrc/aten/OpenRegExtra.cpp | 0 .../csrc/aten/OpenRegMinimal.cpp | 0 .../csrc/aten/native/Common.h | 0 .../csrc/aten/native/Extra.cpp | 0 .../csrc/aten/native/Extra.h | 0 .../csrc/aten/native/Minimal.cpp | 0 .../csrc/aten/native/Minimal.h | 0 .../csrc/runtime/OpenRegDeviceAllocator.cpp | 0 .../csrc/runtime/OpenRegDeviceAllocator.h | 0 .../csrc/runtime/OpenRegEvent.h | 0 .../csrc/runtime/OpenRegException.cpp | 0 .../csrc/runtime/OpenRegException.h | 0 .../csrc/runtime/OpenRegFunctions.cpp | 0 .../csrc/runtime/OpenRegFunctions.h | 0 .../csrc/runtime/OpenRegGenerator.cpp | 0 .../csrc/runtime/OpenRegGenerator.h | 0 .../csrc/runtime/OpenRegGuard.cpp | 0 .../csrc/runtime/OpenRegGuard.h | 0 .../csrc/runtime/OpenRegHooks.cpp | 0 .../csrc/runtime/OpenRegHooks.h | 0 .../csrc/runtime/OpenRegHostAllocator.cpp | 0 .../csrc/runtime/OpenRegHostAllocator.h | 0 .../csrc/runtime/OpenRegSerialization.cpp | 0 .../csrc/runtime/OpenRegSerialization.h | 0 .../csrc/runtime/OpenRegStream.cpp | 0 .../csrc/runtime/OpenRegStream.h | 0 .../include/Macros.h | 0 .../pyproject.toml | 0 {PyTorchSimDevice2 => PyTorchSimDevice}/setup.py | 0 .../third_party/openreg/CMakeLists.txt | 0 .../third_party/openreg/README.md | 0 .../third_party/openreg/cmake/GTestTargets.cmake | 0 .../third_party/openreg/csrc/device.cpp | 0 .../third_party/openreg/csrc/memory.cpp | 0 .../third_party/openreg/csrc/memory.h | 0 .../third_party/openreg/csrc/stream.cpp | 0 .../third_party/openreg/example/example.cpp | 0 .../third_party/openreg/include/openreg.h | 0 .../third_party/openreg/include/openreg.inl | 0 .../_C.cpython-311-x86_64-linux-gnu.so | Bin .../torch_openreg/__init__.py | 0 .../torch_openreg/_utils.py | 0 .../torch_openreg/csrc/CMakeLists.txt | 0 .../torch_openreg/csrc/Module.cpp | 0 
.../torch_openreg/csrc/stub.c | 0 .../torch_openreg/openreg/__init__.py | 0 .../torch_openreg/openreg/amp.py | 0 .../openreg/extension_device_interface.py | 0 .../openreg/extension_device_op_overrides.py | 0 .../torch_openreg/openreg/meta.py | 0 .../torch_openreg/openreg/random.py | 0 58 files changed, 1 insertion(+), 1 deletion(-) rename {PyTorchSimDevice2 => PyTorchSimDevice}/CMakeLists.txt (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/README.md (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/cmake/TorchPythonTargets.cmake (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/CMakeLists.txt (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/amp/OpenRegAmp.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/amp/auto_cast_mode.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/OpenRegExtra.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/OpenRegMinimal.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Common.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Extra.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Extra.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Minimal.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Minimal.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegDeviceAllocator.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegDeviceAllocator.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegEvent.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegException.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegException.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegFunctions.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegFunctions.h (100%) rename {PyTorchSimDevice2 => 
PyTorchSimDevice}/csrc/runtime/OpenRegGenerator.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGenerator.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGuard.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGuard.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHooks.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHooks.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHostAllocator.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHostAllocator.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegSerialization.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegSerialization.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegStream.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegStream.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/include/Macros.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/pyproject.toml (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/setup.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/CMakeLists.txt (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/README.md (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/cmake/GTestTargets.cmake (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/device.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/memory.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/memory.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/stream.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/example/example.cpp (100%) rename {PyTorchSimDevice2 => 
PyTorchSimDevice}/third_party/openreg/include/openreg.h (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/include/openreg.inl (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/__init__.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/_utils.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/CMakeLists.txt (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/Module.cpp (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/stub.c (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/__init__.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/amp.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/extension_device_interface.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/extension_device_op_overrides.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/meta.py (100%) rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/random.py (100%) diff --git a/Dockerfile b/Dockerfile index 1b4d08f3..1c52d32f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,5 +12,5 @@ RUN cd PyTorchSim/TOGSim && \ cmake .. && \ make -j$(nproc) -RUN cd PyTorchSim/PyTorchSimDevice2 && \ +RUN cd PyTorchSim/PyTorchSimDevice && \ python -m pip install --no-build-isolation -e . 
\ No newline at end of file diff --git a/PyTorchSimDevice2/CMakeLists.txt b/PyTorchSimDevice/CMakeLists.txt similarity index 100% rename from PyTorchSimDevice2/CMakeLists.txt rename to PyTorchSimDevice/CMakeLists.txt diff --git a/PyTorchSimDevice2/README.md b/PyTorchSimDevice/README.md similarity index 100% rename from PyTorchSimDevice2/README.md rename to PyTorchSimDevice/README.md diff --git a/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake b/PyTorchSimDevice/cmake/TorchPythonTargets.cmake similarity index 100% rename from PyTorchSimDevice2/cmake/TorchPythonTargets.cmake rename to PyTorchSimDevice/cmake/TorchPythonTargets.cmake diff --git a/PyTorchSimDevice2/csrc/CMakeLists.txt b/PyTorchSimDevice/csrc/CMakeLists.txt similarity index 100% rename from PyTorchSimDevice2/csrc/CMakeLists.txt rename to PyTorchSimDevice/csrc/CMakeLists.txt diff --git a/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h b/PyTorchSimDevice/csrc/amp/OpenRegAmp.h similarity index 100% rename from PyTorchSimDevice2/csrc/amp/OpenRegAmp.h rename to PyTorchSimDevice/csrc/amp/OpenRegAmp.h diff --git a/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp b/PyTorchSimDevice/csrc/amp/auto_cast_mode.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp rename to PyTorchSimDevice/csrc/amp/auto_cast_mode.cpp diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp rename to PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp rename to PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp diff --git a/PyTorchSimDevice2/csrc/aten/native/Common.h b/PyTorchSimDevice/csrc/aten/native/Common.h similarity index 100% rename from PyTorchSimDevice2/csrc/aten/native/Common.h rename to 
PyTorchSimDevice/csrc/aten/native/Common.h diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/aten/native/Extra.cpp rename to PyTorchSimDevice/csrc/aten/native/Extra.cpp diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.h b/PyTorchSimDevice/csrc/aten/native/Extra.h similarity index 100% rename from PyTorchSimDevice2/csrc/aten/native/Extra.h rename to PyTorchSimDevice/csrc/aten/native/Extra.h diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp b/PyTorchSimDevice/csrc/aten/native/Minimal.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/aten/native/Minimal.cpp rename to PyTorchSimDevice/csrc/aten/native/Minimal.cpp diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.h b/PyTorchSimDevice/csrc/aten/native/Minimal.h similarity index 100% rename from PyTorchSimDevice2/csrc/aten/native/Minimal.h rename to PyTorchSimDevice/csrc/aten/native/Minimal.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h b/PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h rename to PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h b/PyTorchSimDevice/csrc/runtime/OpenRegEvent.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h rename to PyTorchSimDevice/csrc/runtime/OpenRegEvent.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegException.cpp similarity index 100% rename from 
PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegException.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.h b/PyTorchSimDevice/csrc/runtime/OpenRegException.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegException.h rename to PyTorchSimDevice/csrc/runtime/OpenRegException.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegFunctions.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegFunctions.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h b/PyTorchSimDevice/csrc/runtime/OpenRegFunctions.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h rename to PyTorchSimDevice/csrc/runtime/OpenRegFunctions.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegGenerator.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegGenerator.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h b/PyTorchSimDevice/csrc/runtime/OpenRegGenerator.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h rename to PyTorchSimDevice/csrc/runtime/OpenRegGenerator.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegGuard.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegGuard.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h b/PyTorchSimDevice/csrc/runtime/OpenRegGuard.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h rename to PyTorchSimDevice/csrc/runtime/OpenRegGuard.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp 
b/PyTorchSimDevice/csrc/runtime/OpenRegHooks.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegHooks.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h b/PyTorchSimDevice/csrc/runtime/OpenRegHooks.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h rename to PyTorchSimDevice/csrc/runtime/OpenRegHooks.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h b/PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h rename to PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegSerialization.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegSerialization.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h b/PyTorchSimDevice/csrc/runtime/OpenRegSerialization.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h rename to PyTorchSimDevice/csrc/runtime/OpenRegSerialization.h diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegStream.cpp similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp rename to PyTorchSimDevice/csrc/runtime/OpenRegStream.cpp diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h b/PyTorchSimDevice/csrc/runtime/OpenRegStream.h similarity index 100% rename from PyTorchSimDevice2/csrc/runtime/OpenRegStream.h rename to 
PyTorchSimDevice/csrc/runtime/OpenRegStream.h diff --git a/PyTorchSimDevice2/include/Macros.h b/PyTorchSimDevice/include/Macros.h similarity index 100% rename from PyTorchSimDevice2/include/Macros.h rename to PyTorchSimDevice/include/Macros.h diff --git a/PyTorchSimDevice2/pyproject.toml b/PyTorchSimDevice/pyproject.toml similarity index 100% rename from PyTorchSimDevice2/pyproject.toml rename to PyTorchSimDevice/pyproject.toml diff --git a/PyTorchSimDevice2/setup.py b/PyTorchSimDevice/setup.py similarity index 100% rename from PyTorchSimDevice2/setup.py rename to PyTorchSimDevice/setup.py diff --git a/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt b/PyTorchSimDevice/third_party/openreg/CMakeLists.txt similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/CMakeLists.txt rename to PyTorchSimDevice/third_party/openreg/CMakeLists.txt diff --git a/PyTorchSimDevice2/third_party/openreg/README.md b/PyTorchSimDevice/third_party/openreg/README.md similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/README.md rename to PyTorchSimDevice/third_party/openreg/README.md diff --git a/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake b/PyTorchSimDevice/third_party/openreg/cmake/GTestTargets.cmake similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake rename to PyTorchSimDevice/third_party/openreg/cmake/GTestTargets.cmake diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp b/PyTorchSimDevice/third_party/openreg/csrc/device.cpp similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/csrc/device.cpp rename to PyTorchSimDevice/third_party/openreg/csrc/device.cpp diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp b/PyTorchSimDevice/third_party/openreg/csrc/memory.cpp similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp rename to PyTorchSimDevice/third_party/openreg/csrc/memory.cpp diff --git 
a/PyTorchSimDevice2/third_party/openreg/csrc/memory.h b/PyTorchSimDevice/third_party/openreg/csrc/memory.h similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/csrc/memory.h rename to PyTorchSimDevice/third_party/openreg/csrc/memory.h diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp b/PyTorchSimDevice/third_party/openreg/csrc/stream.cpp similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp rename to PyTorchSimDevice/third_party/openreg/csrc/stream.cpp diff --git a/PyTorchSimDevice2/third_party/openreg/example/example.cpp b/PyTorchSimDevice/third_party/openreg/example/example.cpp similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/example/example.cpp rename to PyTorchSimDevice/third_party/openreg/example/example.cpp diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.h b/PyTorchSimDevice/third_party/openreg/include/openreg.h similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/include/openreg.h rename to PyTorchSimDevice/third_party/openreg/include/openreg.h diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.inl b/PyTorchSimDevice/third_party/openreg/include/openreg.inl similarity index 100% rename from PyTorchSimDevice2/third_party/openreg/include/openreg.inl rename to PyTorchSimDevice/third_party/openreg/include/openreg.inl diff --git a/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so similarity index 100% rename from PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so rename to PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice/torch_openreg/__init__.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/__init__.py rename to PyTorchSimDevice/torch_openreg/__init__.py diff --git a/PyTorchSimDevice2/torch_openreg/_utils.py 
b/PyTorchSimDevice/torch_openreg/_utils.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/_utils.py rename to PyTorchSimDevice/torch_openreg/_utils.py diff --git a/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt similarity index 100% rename from PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt rename to PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp similarity index 100% rename from PyTorchSimDevice2/torch_openreg/csrc/Module.cpp rename to PyTorchSimDevice/torch_openreg/csrc/Module.cpp diff --git a/PyTorchSimDevice2/torch_openreg/csrc/stub.c b/PyTorchSimDevice/torch_openreg/csrc/stub.c similarity index 100% rename from PyTorchSimDevice2/torch_openreg/csrc/stub.c rename to PyTorchSimDevice/torch_openreg/csrc/stub.c diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/openreg/__init__.py rename to PyTorchSimDevice/torch_openreg/openreg/__init__.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/amp.py b/PyTorchSimDevice/torch_openreg/openreg/amp.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/openreg/amp.py rename to PyTorchSimDevice/torch_openreg/openreg/amp.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py b/PyTorchSimDevice/torch_openreg/openreg/extension_device_interface.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py rename to PyTorchSimDevice/torch_openreg/openreg/extension_device_interface.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py b/PyTorchSimDevice/torch_openreg/openreg/extension_device_op_overrides.py similarity index 100% rename from 
PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py rename to PyTorchSimDevice/torch_openreg/openreg/extension_device_op_overrides.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/meta.py b/PyTorchSimDevice/torch_openreg/openreg/meta.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/openreg/meta.py rename to PyTorchSimDevice/torch_openreg/openreg/meta.py diff --git a/PyTorchSimDevice2/torch_openreg/openreg/random.py b/PyTorchSimDevice/torch_openreg/openreg/random.py similarity index 100% rename from PyTorchSimDevice2/torch_openreg/openreg/random.py rename to PyTorchSimDevice/torch_openreg/openreg/random.py From 89546d788e599edf64ff08241c094b83d0d218d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= Date: Sat, 24 Jan 2026 14:18:36 +0900 Subject: [PATCH 091/194] [Test] Add YOLOv5 test file --- Dockerfile.base | 5 +++ tests/test_yolov5.py | 88 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tests/test_yolov5.py diff --git a/Dockerfile.base b/Dockerfile.base index c5f200bc..0fd950d2 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -45,6 +45,11 @@ RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2 # Install torchsim dependency RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 && pip install "transformers<4.44" && pip install diffusers==0.34.0 +# Extra Python deps for YOLO/vision tests +RUN python -m pip install -U pip setuptools wheel && \ + python -m pip install --no-cache-dir --no-deps ultralytics && \ + python -m pip install --no-cache-dir opencv-python-headless pandas seaborn + ENV RISCV=/workspace/riscv ENV PATH=$RISCV/bin:$PATH diff --git a/tests/test_yolov5.py b/tests/test_yolov5.py new file mode 100644 index 00000000..197b597a --- /dev/null +++ b/tests/test_yolov5.py @@ -0,0 +1,88 @@ +import torch +import torch._dynamo +import torch.utils.cpp_extension + +import 
argparse +import datetime + +import requests +from PIL import Image +from io import BytesIO +from torchvision import transforms + +import os +import shutil + + + +def run_yolo(batch, config): + from Scheduler.scheduler import PyTorchSimRunner + device = PyTorchSimRunner.setup_device().custom_device() + + torch._dynamo.config.recompile_limit = 64 + torch._dynamo.config.cache_size_limit = 128 + + model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval() + url = "https://ultralytics.com/images/zidane.jpg" + + response = requests.get(url) + img = Image.open(BytesIO(response.content)).convert("RGB") + + imgsz = 64 # 이미지 사이즈 줄여서 시뮬레이터 체크 가속 + transform = transforms.Compose([ + transforms.Resize((imgsz, imgsz)), + transforms.ToTensor(), + ]) + + x = transform(img).unsqueeze(0) # [1, 3, H, W] + x = x.to(device) + + + model.to(device) + x = x.to(device) + + # Compile and run the model with PyTorchSim + compiled_model = torch.compile(dynamic=False)(model) + y = compiled_model(x) + print("Yolo Simulation Done") + + +if __name__ == "__main__": + import sys + + base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") + config = os.environ.get( + "TORCHSIM_CONFIG", + default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml", + ) + config_prefix = config.split("/")[-1].split(".")[0][ + 9: + ] # extract config name from config path + sys.path.append(base_dir) + args = argparse.ArgumentParser() + args.add_argument("--batch", type=int, default=1) + args.add_argument("--dump_path", type=str, default="results") + args = args.parse_args() + batch = args.batch + result_path = os.path.join( + base_dir, + args.dump_path, + config_prefix, + f"yolo5s_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", + ) + + + # setting environment variables + os.environ["TORCHSIM_LOG_PATH"] = result_path + os.environ["TORCHSIM_USE_TIMING_POOLING"] = "1" + + # only timing simulation + os.environ["TORCHSIM_VALIDATION_MODE"] = "0" + if 
"pytorchsim_functional_mode" in os.environ: + del os.environ["pytorchsim_functional_mode"] + + # Clear extension/inductor caches to force rebuilds + shutil.rmtree("/tmp/torchinductor_root", ignore_errors=True) + shutil.rmtree(os.path.expanduser("~/.cache/torch_extensions/py311_cu126/npu"), ignore_errors=True) + + run_yolo(batch, config) From d5be66ec37c8cbd8306a324addc4908f734c158a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 27 Jan 2026 10:26:48 +0000 Subject: [PATCH 092/194] [Cleanup] Fix indent error --- tests/Fusion/test_addmm_residual.py | 3 ++- tests/Fusion/test_attention_fusion.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py index 917628e3..a2c17207 100644 --- a/tests/Fusion/test_addmm_residual.py +++ b/tests/Fusion/test_addmm_residual.py @@ -36,7 +36,8 @@ def addmm_residual(a, b, c, d): y = addmm_residual(b2, x2, w2, r2) test_result("Addmm + Residual Fusion Forward", res, y) -if __name__ == "__main__": device = torch.device("npu:0") +if __name__ == "__main__": + device = torch.device("npu:0") test_addmm_residual(device, 32, 32, 32) test_addmm_residual(device, 128, 128, 128) test_addmm_residual(device, 512, 512, 512) diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py index ebbd3037..93a17347 100644 --- a/tests/Fusion/test_attention_fusion.py +++ b/tests/Fusion/test_attention_fusion.py @@ -67,7 +67,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512): test_result("MHA Forward", res, cpu_res) -if __name__ == "__main__": device = torch.device("npu:0") +if __name__ == "__main__": + device = torch.device("npu:0") test_MHA(device) # test_Attention(device, head=16, seq=512, d_k=64) # test_MHA(device, num_heads=12, embed_dim=768) From 5ec144d92b62a993bbf20d5c57baf5d879aa1e47 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 27 Jan 2026 10:28:48 +0000 Subject: [PATCH 093/194] [Test #204] Add yolov5 
test ci --- .github/workflows/pytorchsim_test.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 8444f318..2d32ab5c 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -684,6 +684,27 @@ jobs: -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py + test_yolov5: + name: Run test_yolov5 + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_yolov5.py + run: | + echo "Running test_yolov5.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_yolov5.py + test_accuracy: name: Run test_accuracy runs-on: self-hosted From 730fce9bf81f6ea3a94778f2a3428728da3021d4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 27 Jan 2026 10:30:11 +0000 Subject: [PATCH 094/194] [Fix] Remove comments --- .github/workflows/pytorchsim_test.yml | 2 +- tests/{ => Yolov5}/test_yolov5.py | 30 ++------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) rename tests/{ => Yolov5}/test_yolov5.py (57%) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 2d32ab5c..9589384b 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -703,7 +703,7 @@ jobs: -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ - ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_yolov5.py + ${{ inputs.image_name }} 
python3 PyTorchSim/tests/Yolov5/test_yolov5.py test_accuracy: name: Run test_accuracy diff --git a/tests/test_yolov5.py b/tests/Yolov5/test_yolov5.py similarity index 57% rename from tests/test_yolov5.py rename to tests/Yolov5/test_yolov5.py index 197b597a..d9e6b261 100644 --- a/tests/test_yolov5.py +++ b/tests/Yolov5/test_yolov5.py @@ -16,8 +16,7 @@ def run_yolo(batch, config): - from Scheduler.scheduler import PyTorchSimRunner - device = PyTorchSimRunner.setup_device().custom_device() + device = torch.device("npu:0") torch._dynamo.config.recompile_limit = 64 torch._dynamo.config.cache_size_limit = 128 @@ -28,7 +27,7 @@ def run_yolo(batch, config): response = requests.get(url) img = Image.open(BytesIO(response.content)).convert("RGB") - imgsz = 64 # 이미지 사이즈 줄여서 시뮬레이터 체크 가속 + imgsz = 64 transform = transforms.Compose([ transforms.Resize((imgsz, imgsz)), transforms.ToTensor(), @@ -48,41 +47,16 @@ def run_yolo(batch, config): if __name__ == "__main__": - import sys base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") config = os.environ.get( "TORCHSIM_CONFIG", default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml", ) - config_prefix = config.split("/")[-1].split(".")[0][ - 9: - ] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument("--batch", type=int, default=1) args.add_argument("--dump_path", type=str, default="results") args = args.parse_args() batch = args.batch - result_path = os.path.join( - base_dir, - args.dump_path, - config_prefix, - f"yolo5s_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}", - ) - - - # setting environment variables - os.environ["TORCHSIM_LOG_PATH"] = result_path - os.environ["TORCHSIM_USE_TIMING_POOLING"] = "1" - - # only timing simulation - os.environ["TORCHSIM_VALIDATION_MODE"] = "0" - if "pytorchsim_functional_mode" in os.environ: - del os.environ["pytorchsim_functional_mode"] - - # Clear extension/inductor caches to 
force rebuilds - shutil.rmtree("/tmp/torchinductor_root", ignore_errors=True) - shutil.rmtree(os.path.expanduser("~/.cache/torch_extensions/py311_cu126/npu"), ignore_errors=True) run_yolo(batch, config) From 47c563e14530b1940072d53ea98154b0a4e137ee Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 2 Feb 2026 06:46:36 +0000 Subject: [PATCH 095/194] [Frontend] Fix Identity handling for index expr --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 ++- PyTorchSimFrontend/mlir/mlir_common.py | 9 +++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 1565a26b..c5da1f56 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -338,6 +338,7 @@ def convert_index(self, expr): expr_str = str(expr) if isinstance(expr, ModularIndexing): + dim = list(expr.args[0].free_symbols)[0] replace_str = f"({expr.args[0]} floordiv {expr.args[1]}) mod {expr.args[2]}" expr_str = re.sub(r"ModularIndexing\([^)]*\)", replace_str, expr_str) elif "//" in expr_str: @@ -1233,7 +1234,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe if isinstance(sub, ModularIndexing): if not str(sub.args[0]).startswith("index"): continue - dim_idx = int((str(sub.args[0])[5:])) + dim_idx = int((str(list(sub.args[0].free_symbols)[0])[5:])) floor_divisor = sub.args[1] # y: floorDiv divisor mod_divisor = sub.args[2] # z: modular divisor current_tile_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx] diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index be491925..f101b7cb 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -852,12 +852,9 @@ def rename_indexing(self, index) -> sympy.Expr: index = index.args[0] if index.args else index # Replace Identity arguments with Identity.args[0] - if 
hasattr(index, 'args') and len(index.args) > 0: - for arg in index.args: - if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity): - index = index.replace(arg.args[1], arg.args[1].args[0] if arg.args[1].args else arg.args[1]) - if isinstance(arg, Identity): - index = index.replace(arg, arg.args[0] if arg.args else arg) + Identity_args = [expr for expr in sympy.preorder_traversal(index) if isinstance(expr, Identity)] + for expr in Identity_args: + index = index.replace(expr, expr.args[0] if expr.args else expr) index = V.graph.sizevars.simplify(index) sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) From d3cf8633f463697ea2660abc8ce22e813532ff1f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 Feb 2026 04:33:22 +0000 Subject: [PATCH 096/194] [OpenReg] Add Python interface for device stream, event API --- .../torch_openreg/csrc/CMakeLists.txt | 4 + .../torch_openreg/csrc/Module.cpp | 292 ++++++++++++++++++ .../torch_openreg/openreg/__init__.py | 117 +++++++ tests/test_stream.py | 22 ++ 4 files changed, 435 insertions(+) create mode 100644 tests/test_stream.py diff --git a/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt index 4ff321c4..2a29a89c 100644 --- a/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt +++ b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt @@ -6,6 +6,10 @@ file(GLOB_RECURSE SOURCE_FILES add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES}) +target_include_directories(${LIBRARY_NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/third_party/openreg +) + target_link_libraries(${LIBRARY_NAME} PRIVATE torch_python_library torch_openreg) if(WIN32) diff --git a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp index 052a9ed4..31d0c6a8 100644 --- a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp +++ b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp @@ -10,6 +10,10 @@ #include #include +#include +#include +#include 
+#include static PyObject* _initExtension(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS @@ -135,6 +139,274 @@ PyObject* _getAmpSupportedDtype(PyObject* self, PyObject* noargs) { END_HANDLE_TH_ERRORS } +// Stream functions +PyObject* _streamCreate(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + torch::utils::device_lazy_init(at::kPrivateUse1); + orStream_t stream = nullptr; + orError_t err = orStreamCreate(&stream); + std::cerr << "[DEBUG] Stream created: " << stream << std::endl; + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to create stream"); + } + return THPUtils_packInt64(reinterpret_cast(stream)); + END_HANDLE_TH_ERRORS +} + +PyObject* _streamCreateWithPriority(PyObject* self, PyObject* args) { + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_Size(args) == 2, "stream_create_with_priority expects 2 arguments"); + PyObject* flags_obj = PyTuple_GetItem(args, 0); + PyObject* priority_obj = PyTuple_GetItem(args, 1); + TORCH_CHECK(THPUtils_checkLong(flags_obj), "flags must be an int"); + TORCH_CHECK(THPUtils_checkLong(priority_obj), "priority must be an int"); + unsigned int flags = static_cast(THPUtils_unpackLong(flags_obj)); + int priority = static_cast(THPUtils_unpackLong(priority_obj)); + + torch::utils::device_lazy_init(at::kPrivateUse1); + orStream_t stream = nullptr; + orError_t err = orStreamCreateWithPriority(&stream, flags, priority); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to create stream with priority"); + } + return THPUtils_packInt64(reinterpret_cast(stream)); + END_HANDLE_TH_ERRORS +} + +PyObject* _streamDestroy(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "stream_destroy expects an int"); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); + orError_t err = orStreamDestroy(stream); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to destroy stream"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _streamSynchronize(PyObject* self, 
PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "stream_synchronize expects an int"); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); + + orError_t err; + Py_BEGIN_ALLOW_THREADS + err = orStreamSynchronize(stream); + Py_END_ALLOW_THREADS + + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to synchronize stream"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _streamQuery(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "stream_query expects an int"); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); + orError_t err = orStreamQuery(stream); + if (err == orSuccess) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + +PyObject* _streamGetPriority(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "stream_get_priority expects an int"); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); + int priority = 0; + orError_t err = orStreamGetPriority(stream, &priority); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to get stream priority"); + } + return THPUtils_packInt32(priority); + END_HANDLE_TH_ERRORS +} + +PyObject* _streamWaitEvent(PyObject* self, PyObject* args) { + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_Size(args) == 2, "stream_wait_event expects 2 arguments"); + PyObject* stream_obj = PyTuple_GetItem(args, 0); + PyObject* event_obj = PyTuple_GetItem(args, 1); + TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int"); + TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int"); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(stream_obj)); + orEvent_t event = reinterpret_cast(THPUtils_unpackLong(event_obj)); + orError_t err = orStreamWaitEvent(stream, event, 0); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to wait for event"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +// Event functions +PyObject* 
_eventCreate(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + torch::utils::device_lazy_init(at::kPrivateUse1); + orEvent_t event = nullptr; + orError_t err = orEventCreate(&event); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to create event"); + } + return THPUtils_packInt64(reinterpret_cast(event)); + END_HANDLE_TH_ERRORS +} + +PyObject* _eventCreateWithFlags(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "event_create_with_flags expects an int"); + unsigned int flags = static_cast(THPUtils_unpackLong(arg)); + + torch::utils::device_lazy_init(at::kPrivateUse1); + orEvent_t event = nullptr; + orError_t err = orEventCreateWithFlags(&event, flags); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to create event with flags"); + } + return THPUtils_packInt64(reinterpret_cast(event)); + END_HANDLE_TH_ERRORS +} + +PyObject* _eventDestroy(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "event_destroy expects an int"); + orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); + orError_t err = orEventDestroy(event); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to destroy event"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _eventRecord(PyObject* self, PyObject* args) { + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_Size(args) == 2, "event_record expects 2 arguments"); + PyObject* event_obj = PyTuple_GetItem(args, 0); + PyObject* stream_obj = PyTuple_GetItem(args, 1); + TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int"); + TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int"); + orEvent_t event = reinterpret_cast(THPUtils_unpackLong(event_obj)); + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(stream_obj)); + orError_t err = orEventRecord(event, stream); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to record event"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* 
_eventSynchronize(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "event_synchronize expects an int"); + orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); + + orError_t err; + Py_BEGIN_ALLOW_THREADS + err = orEventSynchronize(event); + Py_END_ALLOW_THREADS + + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to synchronize event"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _eventQuery(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "event_query expects an int"); + orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); + orError_t err = orEventQuery(event); + if (err == orSuccess) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + +PyObject* _eventElapsedTime(PyObject* self, PyObject* args) { + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_Size(args) == 2, "event_elapsed_time expects 2 arguments"); + PyObject* start_obj = PyTuple_GetItem(args, 0); + PyObject* end_obj = PyTuple_GetItem(args, 1); + TORCH_CHECK(THPUtils_checkLong(start_obj), "start event must be an int"); + TORCH_CHECK(THPUtils_checkLong(end_obj), "end event must be an int"); + orEvent_t start = reinterpret_cast(THPUtils_unpackLong(start_obj)); + orEvent_t end = reinterpret_cast(THPUtils_unpackLong(end_obj)); + float ms = 0.0f; + orError_t err = orEventElapsedTime(&ms, start, end); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to get elapsed time"); + } + return PyFloat_FromDouble(static_cast(ms)); + END_HANDLE_TH_ERRORS +} + +PyObject* _deviceSynchronize(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + torch::utils::device_lazy_init(at::kPrivateUse1); + + orError_t err; + Py_BEGIN_ALLOW_THREADS + err = orDeviceSynchronize(); + Py_END_ALLOW_THREADS + + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to synchronize device"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* _addTaskToStream(PyObject* self, PyObject* args) 
{ + HANDLE_TH_ERRORS + TORCH_CHECK(PyTuple_Size(args) == 2, "add_task_to_stream expects 2 arguments"); + PyObject* stream_obj = PyTuple_GetItem(args, 0); + PyObject* callable_obj = PyTuple_GetItem(args, 1); + + TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int"); + TORCH_CHECK(PyCallable_Check(callable_obj), "task must be callable"); + + orStream_t stream = reinterpret_cast(THPUtils_unpackLong(stream_obj)); + + Py_INCREF(callable_obj); + auto py_callable = std::shared_ptr(callable_obj, [](PyObject* obj) { + PyGILState_STATE gstate = PyGILState_Ensure(); + Py_DECREF(obj); + PyGILState_Release(gstate); + }); + + auto task = [py_callable]() { + PyGILState_STATE gstate = PyGILState_Ensure(); + try { + PyObject* result = PyObject_CallObject(py_callable.get(), nullptr); + if (result == nullptr) { + PyErr_Print(); + PyErr_Clear(); + } else { + Py_DECREF(result); + } + } catch (...) { + } + + PyGILState_Release(gstate); + }; + orError_t err = openreg::addTaskToStream(stream, task); + if (err != orSuccess) { + TORCH_CHECK(false, "Failed to add task to stream"); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static PyMethodDef methods[] = { {"_init", _initExtension, METH_NOARGS, nullptr}, {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr}, @@ -147,6 +419,26 @@ static PyMethodDef methods[] = { {"get_autocast_dtype", _getAutocastDtype, METH_NOARGS, nullptr}, {"set_autocast_dtype", _setAutocastDtype, METH_O, nullptr}, {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr}, + // Stream functions + {"_stream_create", _streamCreate, METH_NOARGS, nullptr}, + {"_stream_create_with_priority", _streamCreateWithPriority, METH_VARARGS, nullptr}, + {"_stream_destroy", _streamDestroy, METH_O, nullptr}, + {"_stream_synchronize", _streamSynchronize, METH_O, nullptr}, + {"_stream_query", _streamQuery, METH_O, nullptr}, + {"_stream_get_priority", _streamGetPriority, METH_O, nullptr}, + {"_stream_wait_event", _streamWaitEvent, METH_VARARGS, 
nullptr}, + // Event functions + {"_event_create", _eventCreate, METH_NOARGS, nullptr}, + {"_event_create_with_flags", _eventCreateWithFlags, METH_O, nullptr}, + {"_event_destroy", _eventDestroy, METH_O, nullptr}, + {"_event_record", _eventRecord, METH_VARARGS, nullptr}, + {"_event_synchronize", _eventSynchronize, METH_O, nullptr}, + {"_event_query", _eventQuery, METH_O, nullptr}, + {"_event_elapsed_time", _eventElapsedTime, METH_VARARGS, nullptr}, + // Device functions + {"_device_synchronize", _deviceSynchronize, METH_NOARGS, nullptr}, + // Stream task functions + {"_add_task_to_stream", _addTaskToStream, METH_VARARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; /* diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index 81c2fc60..b7d28291 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -65,6 +65,118 @@ def _lazy_init(): _initialized = True +class Stream: + """Wrapper for OpenReg stream.""" + + def __init__(self, priority=None, flags=0): + if priority is not None: + self._stream = torch_openreg._C._stream_create_with_priority(flags, priority) + else: + self._stream = torch_openreg._C._stream_create() + + def __del__(self): + if hasattr(self, '_stream'): + torch_openreg._C._stream_destroy(self._stream) + + def synchronize(self): + """Wait for all operations in the stream to complete.""" + torch_openreg._C._stream_synchronize(self._stream) + + def query(self): + """Check if all operations in the stream have completed.""" + return torch_openreg._C._stream_query(self._stream) + + def wait_event(self, event): + """Make this stream wait for an event.""" + torch_openreg._C._stream_wait_event(self._stream, event._event) + + def get_priority(self): + """Get the priority of the stream.""" + return torch_openreg._C._stream_get_priority(self._stream) + + def launch_kernel(self, task): + """Add a Python callable kernel to this stream. 
+ + Args: + task: A Python callable (function) to be executed in the stream + """ + torch_openreg._C._add_task_to_stream(self._stream, task) + + @property + def cdata(self): + """Get the underlying stream pointer (for internal use).""" + return self._stream + + +class Event: + """Wrapper for OpenReg event.""" + + def __init__(self, enable_timing=False): + if enable_timing: + # orEventEnableTiming = 0x1 + self._event = torch_openreg._C._event_create_with_flags(0x1) + else: + self._event = torch_openreg._C._event_create() + + def __del__(self): + if hasattr(self, '_event'): + torch_openreg._C._event_destroy(self._event) + + def record(self, stream=None): + """Record the event in a stream.""" + if stream is None: + # Use default stream (stream 0) + stream = Stream() + torch_openreg._C._event_record(self._event, stream._stream) + + def synchronize(self): + """Wait for the event to complete.""" + torch_openreg._C._event_synchronize(self._event) + + def query(self): + """Check if the event has completed.""" + return torch_openreg._C._event_query(self._event) + + def elapsed_time(self, start_event): + """Get the elapsed time between two events in milliseconds.""" + return torch_openreg._C._event_elapsed_time(start_event._event, self._event) + + @property + def cdata(self): + """Get the underlying event pointer (for internal use).""" + return self._event + + +def synchronize(): + """Synchronize all streams on the current device.""" + torch_openreg._C._device_synchronize() + + +def stream(priority=None, flags=0): + """Create a new stream. + + Args: + priority: Stream priority (optional) + flags: Stream flags (optional) + + Returns: + Stream: A new stream object + """ + return Stream(priority=priority, flags=flags) + + +def event(enable_timing=False): + """Create a new event. 
+ + Args: + enable_timing: Whether to enable timing for the event + + Returns: + Event: A new event object + """ + return Event(enable_timing=enable_timing) + + from .random import * # noqa: F403 from .amp import * @@ -88,4 +200,9 @@ def _lazy_init(): "get_autocast_dtype", "set_autocast_dtype", "get_amp_supported_dtype", + "Stream", + "Event", + "stream", + "event", + "synchronize", ] diff --git a/tests/test_stream.py b/tests/test_stream.py new file mode 100644 index 00000000..70077abe --- /dev/null +++ b/tests/test_stream.py @@ -0,0 +1,22 @@ +import torch +import time + +start_event = torch.npu.event(enable_timing=True) +end_event = torch.npu.event(enable_timing=True) +stream = torch.npu.stream() + +def my_kernel(): + print("Task is running...") + result = sum(range(1000)) + time.sleep(2.5) + print(f"Task completed with result: {result}") + +start_event.record(stream) +stream.launch_kernel(my_kernel) +end_event.record(stream) + + +stream.synchronize() + +elapsed_time = end_event.elapsed_time(start_event) +print("Event has completed! 
", elapsed_time) \ No newline at end of file From 5224cc965421172e48b3b1607cd0183f1b5e3c33 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 Feb 2026 11:45:22 +0000 Subject: [PATCH 097/194] [Scheduler] Reimplement Scheduling mechanism --- .../torch_openreg/csrc/Module.cpp | 180 +------- .../torch_openreg/openreg/__init__.py | 206 ++++++---- .../torch_openreg/openreg/random.py | 6 + PyTorchSimFrontend/extension_codecache.py | 79 ++-- PyTorchSimFrontend/extension_config.py | 3 - PyTorchSimFrontend/extension_op.py | 7 +- PyTorchSimFrontend/mlir/mlir_autotune.py | 26 +- .../mlir/mlir_codegen_backend.py | 3 +- PyTorchSimFrontend/mlir/mlir_conv_common.py | 2 - .../mlir/mlir_conv_mt_template.py | 3 - .../mlir/mlir_conv_sb_template.py | 3 - .../mlir/mlir_conv_sbs_template.py | 3 - PyTorchSimFrontend/mlir/mlir_conv_template.py | 3 - PyTorchSimFrontend/mlir/mlir_scheduling.py | 12 - Scheduler/scheduler.py | 3 +- Simulator/simulator.py | 388 ++++++++++++------ TOGSim/include/TileGraph.h | 6 + TOGSim/include/TileGraphParser.h | 2 +- TOGSim/src/Simulator.cc | 50 +-- TOGSim/src/TileGraphParser.cc | 4 +- TOGSim/src/main.cc | 165 ++++---- TOGSim/src/scheduler/Scheduler.cc | 25 +- scripts/stonne_experiment2/tog_gen.py | 4 +- tests/test_scheduler.py | 44 +- tests/test_stream.py | 16 +- 25 files changed, 585 insertions(+), 658 deletions(-) diff --git a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp index 31d0c6a8..e4f3e8d1 100644 --- a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp +++ b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp @@ -145,7 +145,6 @@ PyObject* _streamCreate(PyObject* self, PyObject* noargs) { torch::utils::device_lazy_init(at::kPrivateUse1); orStream_t stream = nullptr; orError_t err = orStreamCreate(&stream); - std::cerr << "[DEBUG] Stream created: " << stream << std::endl; if (err != orSuccess) { TORCH_CHECK(false, "Failed to create stream"); } @@ -185,171 +184,6 @@ PyObject* _streamDestroy(PyObject* self, 
PyObject* arg) { END_HANDLE_TH_ERRORS } -PyObject* _streamSynchronize(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "stream_synchronize expects an int"); - orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); - - orError_t err; - Py_BEGIN_ALLOW_THREADS - err = orStreamSynchronize(stream); - Py_END_ALLOW_THREADS - - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to synchronize stream"); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* _streamQuery(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "stream_query expects an int"); - orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); - orError_t err = orStreamQuery(stream); - if (err == orSuccess) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } - END_HANDLE_TH_ERRORS -} - -PyObject* _streamGetPriority(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "stream_get_priority expects an int"); - orStream_t stream = reinterpret_cast(THPUtils_unpackLong(arg)); - int priority = 0; - orError_t err = orStreamGetPriority(stream, &priority); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to get stream priority"); - } - return THPUtils_packInt32(priority); - END_HANDLE_TH_ERRORS -} - -PyObject* _streamWaitEvent(PyObject* self, PyObject* args) { - HANDLE_TH_ERRORS - TORCH_CHECK(PyTuple_Size(args) == 2, "stream_wait_event expects 2 arguments"); - PyObject* stream_obj = PyTuple_GetItem(args, 0); - PyObject* event_obj = PyTuple_GetItem(args, 1); - TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int"); - TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int"); - orStream_t stream = reinterpret_cast(THPUtils_unpackLong(stream_obj)); - orEvent_t event = reinterpret_cast(THPUtils_unpackLong(event_obj)); - orError_t err = orStreamWaitEvent(stream, event, 0); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to wait for event"); - } 
- Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -// Event functions -PyObject* _eventCreate(PyObject* self, PyObject* noargs) { - HANDLE_TH_ERRORS - torch::utils::device_lazy_init(at::kPrivateUse1); - orEvent_t event = nullptr; - orError_t err = orEventCreate(&event); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to create event"); - } - return THPUtils_packInt64(reinterpret_cast(event)); - END_HANDLE_TH_ERRORS -} - -PyObject* _eventCreateWithFlags(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "event_create_with_flags expects an int"); - unsigned int flags = static_cast(THPUtils_unpackLong(arg)); - - torch::utils::device_lazy_init(at::kPrivateUse1); - orEvent_t event = nullptr; - orError_t err = orEventCreateWithFlags(&event, flags); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to create event with flags"); - } - return THPUtils_packInt64(reinterpret_cast(event)); - END_HANDLE_TH_ERRORS -} - -PyObject* _eventDestroy(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "event_destroy expects an int"); - orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); - orError_t err = orEventDestroy(event); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to destroy event"); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* _eventRecord(PyObject* self, PyObject* args) { - HANDLE_TH_ERRORS - TORCH_CHECK(PyTuple_Size(args) == 2, "event_record expects 2 arguments"); - PyObject* event_obj = PyTuple_GetItem(args, 0); - PyObject* stream_obj = PyTuple_GetItem(args, 1); - TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int"); - TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int"); - orEvent_t event = reinterpret_cast(THPUtils_unpackLong(event_obj)); - orStream_t stream = reinterpret_cast(THPUtils_unpackLong(stream_obj)); - orError_t err = orEventRecord(event, stream); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to record 
event"); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* _eventSynchronize(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "event_synchronize expects an int"); - orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); - - orError_t err; - Py_BEGIN_ALLOW_THREADS - err = orEventSynchronize(event); - Py_END_ALLOW_THREADS - - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to synchronize event"); - } - Py_RETURN_NONE; - END_HANDLE_TH_ERRORS -} - -PyObject* _eventQuery(PyObject* self, PyObject* arg) { - HANDLE_TH_ERRORS - TORCH_CHECK(THPUtils_checkLong(arg), "event_query expects an int"); - orEvent_t event = reinterpret_cast(THPUtils_unpackLong(arg)); - orError_t err = orEventQuery(event); - if (err == orSuccess) { - Py_RETURN_TRUE; - } else { - Py_RETURN_FALSE; - } - END_HANDLE_TH_ERRORS -} - -PyObject* _eventElapsedTime(PyObject* self, PyObject* args) { - HANDLE_TH_ERRORS - TORCH_CHECK(PyTuple_Size(args) == 2, "event_elapsed_time expects 2 arguments"); - PyObject* start_obj = PyTuple_GetItem(args, 0); - PyObject* end_obj = PyTuple_GetItem(args, 1); - TORCH_CHECK(THPUtils_checkLong(start_obj), "start event must be an int"); - TORCH_CHECK(THPUtils_checkLong(end_obj), "end event must be an int"); - orEvent_t start = reinterpret_cast(THPUtils_unpackLong(start_obj)); - orEvent_t end = reinterpret_cast(THPUtils_unpackLong(end_obj)); - float ms = 0.0f; - orError_t err = orEventElapsedTime(&ms, start, end); - if (err != orSuccess) { - TORCH_CHECK(false, "Failed to get elapsed time"); - } - return PyFloat_FromDouble(static_cast(ms)); - END_HANDLE_TH_ERRORS -} - PyObject* _deviceSynchronize(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS torch::utils::device_lazy_init(at::kPrivateUse1); @@ -421,20 +255,8 @@ static PyMethodDef methods[] = { {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr}, // Stream functions {"_stream_create", _streamCreate, METH_NOARGS, nullptr}, - 
{"_stream_create_with_priority", _streamCreateWithPriority, METH_VARARGS, nullptr}, {"_stream_destroy", _streamDestroy, METH_O, nullptr}, - {"_stream_synchronize", _streamSynchronize, METH_O, nullptr}, - {"_stream_query", _streamQuery, METH_O, nullptr}, - {"_stream_get_priority", _streamGetPriority, METH_O, nullptr}, - {"_stream_wait_event", _streamWaitEvent, METH_VARARGS, nullptr}, - // Event functions - {"_event_create", _eventCreate, METH_NOARGS, nullptr}, - {"_event_create_with_flags", _eventCreateWithFlags, METH_O, nullptr}, - {"_event_destroy", _eventDestroy, METH_O, nullptr}, - {"_event_record", _eventRecord, METH_VARARGS, nullptr}, - {"_event_synchronize", _eventSynchronize, METH_O, nullptr}, - {"_event_query", _eventQuery, METH_O, nullptr}, - {"_event_elapsed_time", _eventElapsedTime, METH_VARARGS, nullptr}, + // Device functions {"_device_synchronize", _deviceSynchronize, METH_NOARGS, nullptr}, // Stream task functions diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index b7d28291..66ec022a 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -1,6 +1,8 @@ +import os +import threading + import torch from torch._dynamo.device_interface import register_interface_for_device - import torch_openreg._C # type: ignore[misc] from . import meta # noqa: F401 @@ -8,7 +10,9 @@ from .extension_device_interface import ExtensionDeviceInterface _initialized = False - +_default_streams = {} # Dictionary to store default streams per device +_tog_simulator = None # Singleton TOGSimulator instance +_launch_context = threading.local() # storage for launch_kernel context class device: r"""Context-manager that changes the selected device. 
@@ -57,43 +61,28 @@ def is_initialized(): def _lazy_init(): - global _initialized + global _initialized, _tog_simulator if is_initialized(): return torch_openreg._C._init() register_interface_for_device(custom_device(), ExtensionDeviceInterface) _initialized = True + # Create default streams for all devices + num_devices = device_count() + for device_idx in range(num_devices): + _default_streams[device_idx] = Stream() class Stream: """Wrapper for OpenReg stream.""" - def __init__(self, priority=None, flags=0): - if priority is not None: - self._stream = torch_openreg._C._stream_create_with_priority(flags, priority) - else: - self._stream = torch_openreg._C._stream_create() + def __init__(self, flags=0): + self._stream = torch_openreg._C._stream_create() def __del__(self): if hasattr(self, '_stream'): torch_openreg._C._stream_destroy(self._stream) - def synchronize(self): - """Wait for all operations in the stream to complete.""" - torch_openreg._C._stream_synchronize(self._stream) - - def query(self): - """Check if all operations in the stream have completed.""" - return torch_openreg._C._stream_query(self._stream) - - def wait_event(self, event): - """Make this stream wait for an event.""" - torch_openreg._C._stream_wait_event(self._stream, event._event) - - def get_priority(self): - """Get the priority of the stream.""" - return torch_openreg._C._stream_get_priority(self._stream) - def launch_kernel(self, task): """Add a Python callable kernel to this stream. 
@@ -107,75 +96,149 @@ def cdata(self): """Get the underlying stream pointer (for internal use).""" return self._stream +def stream(flags=0): + return Stream(flags=flags) + +def default_stream(device=None): + _lazy_init() + if device is None: + device_idx = current_device() + else: + device_idx = torch.accelerator._get_device_index(device, optional=True) + if device_idx < 0: + device_idx = current_device() -class Event: - """Wrapper for OpenReg event.""" + if device_idx not in _default_streams: + # Create default stream if it doesn't exist + _default_streams[device_idx] = Stream() - def __init__(self, enable_timing=False): - if enable_timing: - # orEventEnableTiming = 0x1 - self._event = torch_openreg._C._event_create_with_flags(0x1) - else: - self._event = torch_openreg._C._event_create() + return _default_streams[device_idx] - def __del__(self): - if hasattr(self, '_event'): - torch_openreg._C._event_destroy(self._event) - def record(self, stream=None): - """Record the event in a stream.""" - if stream is None: - # Use default stream (stream 0) - stream = Stream() - torch_openreg._C._event_record(self._event, stream._stream) +def launch_kernel(tog_path, attribute_path): + """Launch a kernel on TOGSimulator. - def synchronize(self): - """Wait for the event to complete.""" - torch_openreg._C._event_synchronize(self._event) + Args: + tog_path: Path to TOG file + attribute_path: Path to attribute file - def query(self): - """Check if the event has completed.""" - return torch_openreg._C._event_query(self._event) + Returns: + int: The kernel ID assigned to this launch - def elapsed_time(self, start_event): - """Get the elapsed time between two events in milliseconds.""" - return torch_openreg._C._event_elapsed_time(start_event._event, self._event) + """ + # Get TOGSimulator instance + sim = get_tog_simulator() + if sim is None: + raise RuntimeError("[torch.npu] TOGSimulator is not initialized. 
Call torch.npu.init() first.") - @property - def cdata(self): - """Get the underlying event pointer (for internal use).""" - return self._event + device_idx = current_device() + stream_index, timestamp = get_launch_context() + # Create a task function that calls TOGSimulator.launch_kernel + def launch_task(): + return sim.launch_kernel(device_idx, stream_index, tog_path, attribute_path, timestamp) + stream = default_stream() + stream.launch_kernel(launch_task) def synchronize(): - """Synchronize all streams on the current device.""" + """Synchronize all streams on the current device. + + This function: + 1. Registers TOGSimulator.device_synchronize as a task on the default stream + 2. Calls the underlying device_synchronize to wait for all tasks to complete + """ + # Get TOGSimulator instance + sim = get_tog_simulator() + if sim is not None: + # Get current device index + device_idx = current_device() + + # Create a task function that calls TOGSimulator.device_synchronize + def sync_task(): + return sim.device_synchronize(device_idx) + + # Register as task on default stream + stream = default_stream() + stream.launch_kernel(sync_task) + + # Call underlying device_synchronize to wait for all tasks to complete torch_openreg._C._device_synchronize() +def get_tog_simulator(): + return _tog_simulator -def stream(priority=None, flags=0): - """Create a new stream. +def set_tog_simulator(simulator): + """Set the global TOGSimulator instance. 
Args: - priority: Stream priority (optional) - flags: Stream flags (optional) + simulator: TOGSimulator instance or None + """ + global _tog_simulator + _tog_simulator = simulator - Returns: - Stream: A new stream object +def set_launch_context(stream_index=0, timestamp=0): + _launch_context.stream_index = stream_index + _launch_context.timestamp = timestamp + +def get_launch_context(): + stream_index = getattr(_launch_context, 'stream_index', 0) + timestamp = getattr(_launch_context, 'timestamp', 0) + return stream_index, timestamp + +class launch_context: + """Context manager for setting launch_kernel parameters. + + Args: + stream_index: Stream index (partition ID) to use for launch_kernel + timestamp: Timestamp in nanoseconds to use for launch_kernel + + Example: + with torch.npu.launch_context(stream_index=1, timestamp=1000): + model(input) """ - return Stream(priority=priority, flags=flags) + def __init__(self, stream_index=0, timestamp=0): + self.stream_index = stream_index + self.timestamp = timestamp + self.prev_stream_index = None + self.prev_timestamp = None + + def __enter__(self): + # Save previous context values + self.prev_stream_index = getattr(_launch_context, 'stream_index', 0) + self.prev_timestamp = getattr(_launch_context, 'timestamp', 0) + # Set new context values + set_launch_context(self.stream_index, self.timestamp) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Restore previous context values + _launch_context.stream_index = self.prev_stream_index + _launch_context.timestamp = self.prev_timestamp + return False -def event(enable_timing=False): - """Create a new event. +def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs): + """Launch a compiled model on TOGSimulator. Args: - enable_timing: Whether to enable timing for the event + model: Compiled model (torch.compile()) + *args: Model input arguments + stream_index: Stream index (partition ID). If None, uses context value. 
+ timestamp: Timestamp in nanoseconds. If None, uses context value. + **kwargs: Additional keyword arguments for model execution Returns: - Event: A new event object - """ - return Event(enable_timing=enable_timing) + Model output (same as calling model(*args, **kwargs)) + Note: + This function executes the compiled model and automatically launches + the generated kernels with the specified stream_index and timestamp. + If stream_index or timestamp are not provided, values from the current + context (set via launch_context() or set_launch_context()) are used. + """ + # Get stream_index and timestamp from parameters or context + with launch_context(stream_index=stream_index, timestamp=timestamp): + return model(*args, **kwargs) from .random import * # noqa: F403 from .amp import * @@ -200,9 +263,10 @@ def event(enable_timing=False): "get_autocast_dtype", "set_autocast_dtype", "get_amp_supported_dtype", - "Stream", - "Event", "stream", - "event", + "launch_kernel", + "launch_model", "synchronize", + "get_tog_simulator", + "set_tog_simulator", ] diff --git a/PyTorchSimDevice/torch_openreg/openreg/random.py b/PyTorchSimDevice/torch_openreg/openreg/random.py index 6817bd79..3f2e99fe 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/random.py +++ b/PyTorchSimDevice/torch_openreg/openreg/random.py @@ -11,6 +11,7 @@ "manual_seed", "manual_seed_all", "initial_seed", + "_is_in_bad_fork", ] @@ -59,3 +60,8 @@ def manual_seed_all(seed: int) -> None: for idx in range(device_count()): default_generator = torch_openreg._C._get_default_generator(idx) default_generator.manual_seed(seed) + +def _is_in_bad_fork(): + # For NPU simulator, we don't have the same fork issues as CUDA + # Return False to indicate we're not in a bad fork state + return False \ No newline at end of file diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 5066d214..d6b47123 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ 
b/PyTorchSimFrontend/extension_codecache.py @@ -2,6 +2,7 @@ import re import shlex import subprocess +import torch from torch._inductor.codecache import get_lock_dir, get_hash, write from torch._inductor.async_compile import AsyncCompile @@ -144,7 +145,9 @@ def load(cls, source_code, key, input_path = write(source_code, "mlir", specified_dir=write_path) new_input_path = os.path.splitext(input_path)[0] raw_tog_path = new_input_path + "_tog.py" + tog_path = os.path.join(write_path, "tile_graph.onnx") sample_mlir_path = new_input_path + "_sample" + validation_binary_path = os.path.join(write_path, validation_binary_name) gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size) from filelock import FileLock @@ -177,9 +180,9 @@ def load(cls, source_code, val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) - target = os.path.join(write_path, validation_binary_name) + stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb) - spad_size = val_llvm_caller.get_spad_size(target) + spad_size = val_llvm_caller.get_spad_size(validation_binary_path) spad_usage = stack_size + spad_size # Spad usage per lane if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage: logger.debug( @@ -188,6 +191,10 @@ def load(cls, source_code, ) raise SpadOverflowError() + # Skip if TOG file already exists + if os.path.isfile(tog_path): + return key + # Launch tile graph generator gem5_sample_cmd = shlex.split(gem5_cmds[0]) gem5_translate_cmd = shlex.split(gem5_cmds[1]) @@ -213,13 +220,10 @@ def load(cls, source_code, cycle_llvm_caller = MLIRKernelCallerCodeGen(False, arg_attributes, cycle_sim=True) cycle_llvm_caller.generate_wrapper_file(write_path, cycle_wrapper_name) cycle_llvm_caller.compile_wih_kernel(write_path, key + "_sample", cycle_wrapper_name, cycle_binary_name, 
link_option) - array_size = [] - for (arg_name, arg_attribute) in arg_attributes: - array_size.append(str(arg_attribute[2])) # Run cyclesim cyclesim = CycleSimulator() - cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size, silent_mode=silent_mode) + cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode) # Create TOG w_offset, x_offset = vectorlane_size, vectorlane_size @@ -231,7 +235,7 @@ def load(cls, source_code, tile_graph_generator = tog_generator(origins) tile_graph_generator.load_file(raw_tog_path) tile_graph_generator.generate_tile_graph( - os.path.join(write_path, "tile_graph.onnx"), + tog_path, cycle_list=cycle_list, x_offset=x_offset, # FIXME. w_offset=w_offset, # FIXME. @@ -247,25 +251,18 @@ def __init__(self): self.cycle_binary_name = "cycle_binary" def mlir(self, source_code, arg_attributes=[], vectorlane_size=16, tile_size=[], spad_info=None, origins=None, silent_mode=False, **kwargs): + autotune = kwargs.get('autotune', False) def task(): key = MLIRCodeCache.load(source_code, valdiation_wrapper_name=self.validation_binary_name, validation_binary_name=self.validation_binary_name, arg_attributes=arg_attributes, vectorlane_size=vectorlane_size, tile_size=tile_size, spad_info=spad_info, origins=origins, - silent_mode=silent_mode, **kwargs) + silent_mode=autotune, **kwargs) return key future = self.submit(task) - if "loop_size" in kwargs: - loop_size = kwargs["loop_size"] - else: - loop_size = [] - - # In the autotune mode, skip validation to speed up - autotune = kwargs.get('autotune', False) - validate = kwargs.get('validate', False) if not autotune else False - def dummy_simulator(*args, **kwargs): + def run_kernel_simulation(*args, **kwargs): # Wait for compilation key = future.result() from filelock import FileLock @@ -277,47 +274,27 @@ def dummy_simulator(*args, **kwargs): # Dump arguments and meta data 
dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - if not autotune and (extension_config.pytorchsim_functional_mode or validate): + if extension_config.pytorchsim_functional_mode and not autotune: funcsim = FunctionalSimulator(result_path, key) funcsim.run_spike(args, arg_attributes, runtime_path, self.validation_binary_name, vectorlane_size=vectorlane_size, spad_info=spad_info, - silent_mode=silent_mode) + silent_mode=autotune) + if not extension_config.pytorchsim_timing_mode: return [float("inf")] + # Prepare arguments for launch kernel onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - TOGSim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG) - TOGSim.vectorlane_size = vectorlane_size - attribute_path = TOGSim.create_attribute_file(attribute_path, args, loop_size=loop_size) - result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode, autotune_mode=autotune) - result = TOGSimulator.get_result_from_file(result_path) - return result - def dryrun_simulator(*args, **kwargs): - key = future.result() - from filelock import FileLock - lock_dir = get_lock_dir() - lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) - with lock: - # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) - # Dump arguments and meta data - dump_metadata(args, arg_attributes, result_path) - runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) - - # Todo. 
Support valude dependent mode for graph mode - if False: # extension_config.pytorchsim_functional_mode: - funcsim = FunctionalSimulator(result_path, key) - funcsim.run_spike(args, arg_attributes, - runtime_path, self.validation_binary_name, - vectorlane_size=vectorlane_size, spad_info=spad_info) - return result_path, runtime_path, None - - is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) and not autotune - target_simulator = dryrun_simulator if is_dryrun else dummy_simulator - target_simulator.arg_attributes = arg_attributes - target_simulator.future = future - return target_simulator + TOGSim = torch.npu.get_tog_simulator() + if not autotune and TOGSim is not None: + attribute_path = TOGSim.create_attribute_file(attribute_path, args) + torch.npu.launch_kernel(onnx_path, attribute_path) + result = None # No result for non-autotune mode + else: + result_path = TOGSimulator.run_standalone(onnx_path, attribute_path, autotune_mode=autotune) + result = TOGSimulator.get_result_from_file(result_path) + return result + return run_kernel_simulation diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index b0bcac7f..eff6f573 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -106,9 +106,6 @@ def __getattr__(name): if name == "CONFIG_TORCHSIM_LOG_PATH": return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) - if name == "CONFIG_TOGSIM_EAGER_MODE": - return int(os.environ.get("TOGSIM_EAGER_MODE", default=False)) - # SRAM Buffer allocation plan def load_plan_from_module(module_path): if module_path is None: diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index 18bf65c3..e6351101 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -46,9 +46,6 @@ class MLIRExternKernelChoice(ExternKernelChoice): def call_name(self): - is_dryrun = 
int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) - if is_dryrun: - return f"yield from sparse_mm_dummy_stonne_outer" return f"torch.ops.extension_op.{self.name}" custom_lib = torch.library.Library("extension_op", "DEF") @@ -275,10 +272,8 @@ def prepare_outer_product_matrix(a, b, out): def sparse_mm_stonne_outer(a, b, out): onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out) - togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.yml' - TOGSim = TOGSimulator(togsim_path, stonne_config_path) - result_path = TOGSim.simulation(onnx_path) + result_path = TOGSimulator.run_standalone(onnx_path, config_path=stonne_config_path) TOGSimulator.get_result_from_file(result_path) # Load result data diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 138bec50..4503584c 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -61,12 +61,24 @@ def make_run_fn( # Check already cached result. 
write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result/0") - if os.path.exists(result_path): - result = TOGSimulator.get_result_from_file(result_path) - def cached_run_fn(*args, **kwargs): - return result - return cached_run_fn + result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result") + + # Find the most recent .log file in the result directory + if os.path.exists(result_dir) and os.path.isdir(result_dir): + log_files = [f for f in os.listdir(result_dir) if f.endswith('.log')] + if log_files: + # Sort by modification time, get the most recent file + log_files_with_time = [ + (f, os.path.getmtime(os.path.join(result_dir, f))) + for f in log_files + ] + log_files_with_time.sort(key=lambda x: x[1], reverse=True) + latest_log_file = log_files_with_time[0][0] + result_path = os.path.join(result_dir, latest_log_file) + result = TOGSimulator.get_result_from_file(result_path) + def cached_run_fn(*args, **kwargs): + return result + return cached_run_fn # Run a candidate code run_method = custom_async_compile.mlir( @@ -74,7 +86,7 @@ def cached_run_fn(*args, **kwargs): loop_size=None, spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], origins="Unknown", silent_mode=True, - validate=self.extra_args['validate'], autotune=self.extra_args['autotune']) + autotune=self.extra_args['autotune']) args = [ tensor diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index c5da1f56..a60c706e 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -99,7 +99,7 @@ def write_header(self): from torch import device, empty, empty_strided from {extension_codecache.__name__} import 
CustomAsyncCompile - from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE, setup_logger + from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, setup_logger from Simulator.simulator import TOGSimulator from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer from torch._inductor.select_algorithm import extern_kernels @@ -1016,7 +1016,6 @@ def run_bench(self, nodes, kernel_name, src_code): "spad_info": self.spad_info, "vlen" : self.vlen, "arg_attributes" : arg_attributes, - "validate" : extension_config.pytorchsim_functional_mode, "autotune" : True, }, source_code=src_code, diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index a1a9d935..1aa99d14 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -82,7 +82,6 @@ def outer_func_render(self, kernel_name, input_args): Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) options = dict( kernel=self.kernel, KERNEL_NAME=kernel_name, @@ -94,7 +93,6 @@ def outer_func_render(self, kernel_name, input_args): PADDING_H=self.padding[0], PADDING_W=self.padding[1], VALIDATION_MODE=extension_config.pytorchsim_functional_mode, - TOGSIM_EAGER_MODE=eager_mode, input_reorder=self.input_reorder ) code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 0bf01421..051d7a0e 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -120,9 +120,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if TOGSIM_EAGER_MODE %} - yield ({{KERNEL_NAME}}, ) - {%- endif %} """ def __init__(self, input_nodes, layout, 
input_reorder=None, **kwargs): super().__init__(input_nodes, layout, input_reorder, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index 92b9a525..c742b3b2 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -121,9 +121,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if TOGSIM_EAGER_MODE %} - yield ({{KERNEL_NAME}}, ) - {%- endif %} """ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): super().__init__(input_nodes, layout, input_reorder, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index ab124852..07211bb4 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -121,9 +121,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if TOGSIM_EAGER_MODE %} - yield ({{KERNEL_NAME}}, ) - {%- endif %} """ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): super().__init__(input_nodes, layout, input_reorder, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 66aa0a27..46a7f9bf 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -125,9 +125,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: # Launch kernel {{ KERNEL_NAME }} - {%- if TOGSIM_EAGER_MODE %} - yield ({{KERNEL_NAME}}, ) - {%- endif %} """ def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): super().__init__(input_nodes, layout, input_reorder, **kwargs) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index faf5e69c..5305cbb7 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ 
b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -280,11 +280,6 @@ def codegen_node(self, _node): ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) - eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) - if (eager_mode): - V.graph.wrapper_code.writeline( - f"yield ({kernel_name}, ({args}))" - ) self._set_flush_status(True) def ready_to_flush(self): @@ -344,13 +339,6 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes): kernel.call_kernel(kernel_name) V.graph.removed_buffers |= kernel.removed_buffers _, args, _, _ = self.kernel_group.args.mlir_argdefs() - eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) - if (eager_mode): - target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}" - args = ", ".join(args) - V.graph.wrapper_code.writeline( - f"yield ({target_kernel_name}, ({args}))" - ) self._set_flush_status(True) def enter_context_fixed(self, node): diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index cdcdd2a7..77e218ea 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -343,8 +343,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, self.request_queue.append([]) self.finish_queue : List[Request] = [] - togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") - self.tog_simulator = TOGSimulator(togsim_path, togsim_config) + self.tog_simulator = TOGSimulator(togsim_config) if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0: # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0). logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. 
") diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 96a1fc86..2771d03c 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -146,7 +146,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' if not silent_mode: logger.debug(f"[Spike] cmd> {run}") - logger.info("[Spike] Running Spike simulator") + logger.info("[Spike] Running Spike simulator") run_cmd = shlex.split(run) try: stdout_setting = subprocess.DEVNULL if silent_mode else None @@ -194,14 +194,12 @@ class CycleSimulator(): def __init__(self) -> None: pass - def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silent_mode=False): + def compile_and_simulate(self, target_binary, vectorlane_size, silent_mode=False): dir_path = os.path.join(os.path.dirname(target_binary), "m5out") gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py") gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, gem5_script_path, "-c", target_binary, "--vlane", str(vectorlane_size)] - is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode - - if not is_dryrun: + if not silent_mode: logger.debug(f"[Gem5] cmd> {' '.join(gem5_cmd)}") logger.info("[Gem5] Gem5 simulation started") @@ -224,65 +222,55 @@ class TOGSimulator(): TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan - def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None: + def __init__(self, config_path=None, togsim_path=None) -> None: + if config_path is None: + config_path = extension_config.CONFIG_TOGSIM_CONFIG + if togsim_path is None: + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") 
+ self.base_dir = togsim_path self.config_path = config_path self.config_yaml = self.load_yaml(self.config_path) self.process = None - self.vectorlane_size = vectorlane_size - - def get_togsim_command(self): - bin = os.path.join(self.base_dir, "build/bin/Simulator") - config = os.path.join(self.base_dir, self.config_path) - cmd = f"{bin} --config {config}" - return cmd + self._next_kernel_id = 0 # Auto-incrementing kernel ID - def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False): - cmd = f"{self.get_togsim_command()} --models_list {model_path}" - if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: - cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" - if attribute_path: - cmd = f"{cmd} --attributes_list {attribute_path}" - if not silent_mode: - logger.debug(f"[TOGSim] cmd> {cmd}") - logger.info("[TOGSim] TOGSim simulation started") + # Create FIFOs for command and event communication + self.fifo_dir = os.path.join("/tmp", f"togsim_fifo_{os.getpid()}") + os.makedirs(self.fifo_dir, exist_ok=True) + self.trace_file_path = os.path.join(self.fifo_dir, "cmd_fifo") + self.trace_log = "" - try: - with ProgressBar("[TOGSim] Running simulation", silent_mode=silent_mode): - result = subprocess.check_output(shlex.split(cmd)) - except subprocess.CalledProcessError as e: - logger.error(f"[TOGSim] Command failed with exit code {e.returncode}") - logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") - assert 0 + # Create FIFOs if they don't exist + if os.path.exists(self.trace_file_path): + os.remove(self.trace_file_path) + os.mkfifo(self.trace_file_path) - # Separate Autotune logs - if autotune_mode: - base_dir = Path(model_path).parent / "togsim_result" - base_dir.mkdir(parents=True, exist_ok=True) - file_name = f"{len(list(base_dir.iterdir()))}.log" - else: - base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH) - unique_id = uuid.uuid4().hex[:8] - timestamp = 
datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - file_name = f"{unique_id}_{timestamp}.log" + # Start TOGSim process + self._start_process() - base_dir.mkdir(parents=True, exist_ok=True) - result_path = base_dir / file_name + # Open trace file FIFO once and keep it open (after process starts) + self._trace_file_lock = threading.Lock() + try: + self._trace_file_handle = open(self.trace_file_path, 'w') + except IOError as e: + logger.error(f"[TOGSim] Failed to open trace file: {e}") + raise RuntimeError(f"Failed to open trace file: {e}") - # Prevent race condition - with open(result_path, "w") as f: - f.write(result.decode()) - f.flush() - os.fsync(f.fileno()) + def __enter__(self): + """Context manager entry.""" + # Set this simulator instance as the global TOGSimulator + self.old_tog_simulator = torch.npu.get_tog_simulator() + torch.npu.set_tog_simulator(self) + return self - if not silent_mode: - import logging as _logging - model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " " - logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') - return result_path + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - automatically cleanup.""" + # Reset global TOGSimulator to None + self.until() + torch.npu.set_tog_simulator(self.old_tog_simulator) - def interactive_simulation(self): - cmd = f"{self.get_togsim_command()} --mode interactive" + def _start_process(self): + cmd = f"{self.get_togsim_command(self.config_path, self.base_dir)} --models_list {self.trace_file_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" @@ -290,69 +278,144 @@ def interactive_simulation(self): if self.process is None: self.process = subprocess.Popen( shlex.split(cmd), - stdin=subprocess.PIPE, - stderr=subprocess.PIPE, + #stdout=subprocess.PIPE, + #stderr=subprocess.PIPE, universal_newlines=True ) else: logger.warning("[TOGSim] Simulator is 
already running.") - def stop(self): - if self.process: - self.process.terminate() - self.process.wait() - self.process = None - logger.info("[TOGSim] Simulator stopped.") + def _cleanup_fifos(self): + """Clean up FIFO files""" + try: + if os.path.exists(self.trace_file_path): + os.remove(self.trace_file_path) + if os.path.exists(self.fifo_dir): + os.rmdir(self.fifo_dir) + except OSError as e: + logger.warning(f"[TOGSim] Failed to clean up FIFOs: {e}") + + def _send_command(self, command_type, device_index, stream_index, tog_path="", attribute_path="", timestamp=0): + """ + Internal method to send a command to TOGSim via FIFO. + + Args: + command_type: Type of command ("LAUNCH_KERNEL" or "DEVICE_SYNC") + device_index: Device index + stream_index: Stream index + tog_path: Path to TOG file (ONNX model) - empty for DEVICE_SYNC + attribute_path: Path to attribute file - empty for DEVICE_SYNC + timestamp: Timestamp in nanoseconds (default: 0) + + Returns: + int: The kernel ID assigned to this command + """ + if self.process is None: + raise RuntimeError("[TOGSim] Simulator process is not running") + + if self.process.poll() is not None: + raise RuntimeError("[TOGSim] Simulator process has terminated") + + # Get and increment kernel ID + kernel_id = self._next_kernel_id + self._next_kernel_id += 1 + + # Format command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp + command = f"{command_type},{kernel_id},{device_index},{stream_index},{tog_path},{attribute_path},{timestamp}" + + with self._trace_file_lock: + # Write command to TOGSim + try: + self._trace_file_handle.write(command + '\n') + self._trace_file_handle.flush() + self.trace_log += command + '\n' + logger.debug(f"[TOGSim] Sent command: {command}") + except IOError as e: + logger.error(f"[TOGSim] Failed to write to trace file: {e}") + raise RuntimeError(f"Failed to send command to TOGSim: {e}") + return kernel_id + + def until(self): + # Make sure that all kernels in the stream 
are finished + torch.npu.synchronize() + + # Close trace file handle if open + if self._trace_file_handle is not None: + try: + self._trace_file_handle.close() + except: + pass + self._trace_file_handle = None - def wait(self): if self.process: - logger.info("[TOGSim] Waiting for simulation to complete...") - self.quit() self.process.wait() + + # Read output streams + stdout_output = "" + stderr_output = "" + if self.process.stdout: + stdout_output = self.process.stdout.read() + if self.process.stderr: + stderr_output = self.process.stderr.read() + + # Print stderr immediately if there's any error output + if stderr_output: + sys.stderr.write(stderr_output) + sys.stderr.flush() + + # Save stdout to result file + if stdout_output: + result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH + os.makedirs(result_path, exist_ok=True) + file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".log" + result_path = os.path.join(result_path, file_name) + with open(result_path, "w") as f: + f.write(stdout_output) + logger.info(f'[TOGSim] Simulation log is stored to "{result_path}"') self.process = None - logger.info("[TOGSim] Simulation completed.") - def send_command(self, command): - if self.process: - try: - logger.debug(command) - self.process.stdin.write(command + '\n') - self.process.stdin.flush() - ret = self.process.stderr.readline().strip() - return ret - except BrokenPipeError: - err = self.process.stderr.readlines() - for line in err: - logger.error(line.strip()) - self.process = None - exit(1) - else: - logger.warning("Simulator is not running.") - return None - - def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0): - command = f"launch {self.config_path} {onnx_path} {attribute_path} {arrival_time} {partion_id}" - ret = self.send_command(command) - return 0 - - def cycle(self): - ret = self.send_command("cycle") - return int(ret.split(" ")[-1]) - - def until(self, until_cycle): - command = f"until {until_cycle}" - ret = 
self.send_command(command) - bitmap = int(ret.split(" ")[-1]) - indices = [] - for i in range(64): - if (bitmap >> i) & 1: - indices.append(i) - return indices - - def quit(self): - command = "quit" - ret = self.send_command(command) - return + # Save trace_log with same name but .trace extension + if self.trace_log: + result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH + os.makedirs(result_path, exist_ok=True) + file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".trace" + trace_path = os.path.join(result_path, file_name) + with open(trace_path, "w") as f: + f.write(self.trace_log) + logger.info(f'[TOGSim] Trace log is stored to "{trace_path}"') + + # Clean up FIFOs + self._cleanup_fifos() + + def launch_kernel(self, device_index, stream_index, tog_path, attribute_path, timestamp=0): + """ + Launch a kernel via FIFO communication. + + Args: + device_index: Device index + stream_index: Stream index + tog_path: Path to TOG file (ONNX model) + attribute_path: Path to attribute file + timestamp: Timestamp in nanoseconds (default: 0) + + Returns: + int: The kernel ID assigned to this launch + """ + return self._send_command("LAUNCH_KERNEL", device_index, stream_index, tog_path, attribute_path, timestamp) + + def device_synchronize(self, device_index): + """ + Synchronize all streams on a device via FIFO communication. 
+ + Args: + device_index: Device index to synchronize + timestamp: Timestamp in nanoseconds (default: 0) + + Returns: + int: The command ID assigned to this synchronization + """ + # For device_synchronize, stream_index is not meaningful, use 0 + return self._send_command("DEVICE_SYNC", device_index, 0, "", "", 0) @classmethod def sram_alloc(cls, buf_name, addr_range): @@ -404,22 +467,83 @@ def get_core_freq(self): else: raise KeyError("Key 'core_freq' not found in JSON.") - def find_zero_sub_tensors(self, tensor): - x, y = self.vectorlane_size, self.vectorlane_size - zero_positions = {} + @staticmethod + def get_togsim_command(config_path, togsim_path=None): + if togsim_path is None: + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + bin = os.path.join(togsim_path, "build/bin/Simulator") + config = os.path.join(togsim_path, config_path) + cmd = f"{bin} --config {config}" + return cmd + + @staticmethod + def run_standalone(model_path, attribute_path="", autotune_mode=False, config_path=None, togsim_path=None): + """ + Run a single kernel simulation in standalone mode. + This method starts a new TOGSim process, runs the kernel, and waits for completion. + For streaming multiple kernels, use launch_kernel() instead. 
+ + Args: + model_path: Path to TOG file (ONNX model) + attribute_path: Path to attribute file + autotune_mode: If True, run in autotune mode (silent) + config_path: Path to TOGSim config file (required) + togsim_path: Path to TOGSim directory (optional, defaults to CONFIG_TORCHSIM_DIR/TOGSim) + + Returns: + Path to the simulation result log file + """ + if config_path is None: + config_path = extension_config.CONFIG_TOGSIM_CONFIG + if togsim_path is None: + togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") + + # Create result path with appropriate filename + if autotune_mode: + base_dir = Path(model_path).parent / "togsim_result" + else: + base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH) + + base_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + file_name = f"{timestamp}_{uuid.uuid4().hex[:8]}" + result_path = base_dir / f"{file_name}.log" + trace_file_path = base_dir / f"{file_name}.trace" + + # Create trace file in result directory + kernel_id, device_index, stream_index, timestamp = 0, 0, 0, 0 + command = f"LAUNCH_KERNEL,{kernel_id},{device_index},{stream_index},{model_path},{attribute_path},{timestamp}\n" + with open(trace_file_path, 'w') as trace_file: + trace_file.write(command) + trace_file.flush() + os.fsync(trace_file.fileno()) + + try: + cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}" + if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: + cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" + + if not autotune_mode: + logger.debug(f"[TOGSim] cmd> {cmd}") + logger.info("[TOGSim] TOGSim simulation started") + with ProgressBar("[TOGSim] Running simulation", silent_mode=autotune_mode): + result = subprocess.check_output(shlex.split(cmd)) + except subprocess.CalledProcessError as e: + logger.error(f"[TOGSim] Command failed with exit code {e.returncode}") + logger.error(f"[TOGSim] Error output: 
{e.output.decode() if isinstance(e.output, bytes) else e.output}") + assert 0 - # Need to set vectorlane size - if self.vectorlane_size == -1: - return zero_positions + # Prevent race condition + with open(result_path, "w") as f: + f.write(result.decode()) + f.flush() + os.fsync(f.fileno()) - for i in range(0, tensor.shape[0], y): - for j in range(0, tensor.shape[1], x): - sub_tensor = tensor[i:i + y, j:j + x] - if np.all(sub_tensor == 0): - if i not in zero_positions: - zero_positions[i] = {} - zero_positions[i][j] = 0 # i pos : j pos : 0 - return zero_positions + if not autotune_mode: + import logging as _logging + model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " " + logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"') + return result_path @staticmethod def get_result_from_file(result_path): @@ -482,6 +606,24 @@ def get_result_from_file(result_path): return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle if __name__ == "__main__": - sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml") - sim.interactive_simulation() - sim.until(4000) \ No newline at end of file + # Example paths (adjust these to your actual test files) + test_tog_path = "/workspace/PyTorchSim/outputs/6vxl6mwzhfl/tile_graph.onnx" + test_attribute_path = "/workspace/PyTorchSim/outputs/6vxl6mwzhfl/runtime_0001/attribute/0" + + # Test: Launch multiple kernels + sim = TOGSimulator(config_path="/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml") + with sim: + try: + id1 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + id2 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + id3 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + except Exception as e: + print(f"Error during kernel launch: {e}") + 
+ try: + id2 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + id1 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + id3 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path) + except Exception as e: + print(f"Error during kernel launch: {e}") + print(sim.trace_log) \ No newline at end of file diff --git a/TOGSim/include/TileGraph.h b/TOGSim/include/TileGraph.h index 990c107d..4cad9355 100644 --- a/TOGSim/include/TileGraph.h +++ b/TOGSim/include/TileGraph.h @@ -67,6 +67,10 @@ class TileGraph { std::string get_name() { return _name; } void set_arrival_time(cycle_type arrival_time) { _arrival_time = arrival_time; } cycle_type get_arrival_time() { return _arrival_time; } + void set_kernel_id(unsigned int kernel_id) { _kernel_id = kernel_id; } + unsigned int get_kernel_id() { return _kernel_id; } + void set_start_time(cycle_type start_time) { _start_time = start_time; } + cycle_type get_start_time() { return _start_time; } void init_cache_plan(IntervalTree::interval_vector it) { _cache_plan = std::make_shared>(std::move(it)); } @@ -130,6 +134,7 @@ class TileGraph { int _vec_index=0; std::string _path; std::string _name = "?"; + unsigned int _kernel_id = 0; std::vector _loop_index_list; std::vector> _ranges; std::vector> _subgraph_vec; @@ -137,5 +142,6 @@ class TileGraph { std::map>> _cpu_graph_map; std::shared_ptr> _cache_plan; cycle_type _arrival_time; + cycle_type _start_time = 0; // First tile issue time, 0 means not started yet static std::shared_ptr null_tile; }; \ No newline at end of file diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h index 9c176966..f067fb2d 100644 --- a/TOGSim/include/TileGraphParser.h +++ b/TOGSim/include/TileGraphParser.h @@ -65,7 +65,7 @@ class TileNode { class TileGraphParser { public: - TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path); + 
TileGraphParser(std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml); std::shared_ptr get_top_loop(); std::unique_ptr& get_tile_graph() { return _tile_graph; } addr_type lookup(std::string key); diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index 857923c5..b5b9c778 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -170,55 +170,8 @@ void Simulator::icnt_cycle() { _icnt->cycle(); } -int Simulator::until(cycle_type until_cycle) { - std::vector partition_scheudler_status; - for (auto &scheduler : _partition_scheduler) - partition_scheudler_status.push_back(scheduler->empty()); - - while (until_cycle == -1 || _core_cycles < until_cycle) { - set_cycle_mask(); - // Core Cycle - if (IS_CORE_CYCLE(_cycle_mask)) - core_cycle(); - - // DRAM cycle - if (IS_DRAM_CYCLE(_cycle_mask)) - dram_cycle(); - - // Interconnect cycle - if (IS_ICNT_CYCLE(_cycle_mask)) - icnt_cycle(); - - // Check if core status has changed - if (_core_cycles % 10 == 0) { - int bitmap = 0; - for (int i=0; i<_partition_scheduler.size(); i++) { - /* Skip this */ - if (partition_scheudler_status.at(i)) - continue; - - if (_partition_scheduler.at(i)->empty()) { - bitmap |= (1 << i); - } - } - if (bitmap) - return bitmap; - } - } - int bitmap = 0; - for (int i=0; i<_partition_scheduler.size(); i++) { - /* Skip this */ - if (partition_scheudler_status.at(i)) - continue; - - if (_partition_scheduler.at(i)->empty()) - bitmap |= (1ULL << i); - } - return bitmap; -} - void Simulator::cycle() { - while (running()) { + while (running() || _core_cycles < 1) { set_cycle_mask(); // Core Cycle if (IS_CORE_CYCLE(_cycle_mask)) @@ -232,7 +185,6 @@ void Simulator::cycle() { if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); } - spdlog::info("Simulation finished"); for (auto &core: _cores) { core->check_tag(); } diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index 515f6247..fd629f8a 100644 --- a/TOGSim/src/TileGraphParser.cc +++ 
b/TOGSim/src/TileGraphParser.cc @@ -685,9 +685,9 @@ void TileLoopNode::print_node() { spdlog::debug("{} stride: {} ", spaces, _stride); } -TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) { +TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml) { loadConfig(attribute_path, _attribute_config); - loadConfig(config_path, _config_yaml); + _config_yaml = config_yaml; // Use the pre-loaded config _attribute_path = attribute_path; if (!std::filesystem::exists(onnx_path)) { diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index bee1b45f..44fb5612 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -1,6 +1,9 @@ #include #include #include +#include +#include +#include #include "Simulator.h" #include "TileGraphParser.h" @@ -9,82 +12,78 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -const char* env_value = std::getenv("TOGSIM_EAGER_MODE"); -bool isDryRun = (env_value != nullptr && std::string(env_value) == "1"); -void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) { - auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_path); +void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partiton_id=0, int device_id=0) { + auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml); std::unique_ptr& tile_graph = graph_praser.get_tile_graph(); tile_graph->set_arrival_time(request_time ? 
request_time : simulator->get_core_cycle()); - spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle()); - + tile_graph->set_kernel_id(kernel_id); + spdlog::info("[Scheduler {}] Enqueued kernel id: {} tog: {} operation: {} request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); simulator->schedule_graph(partiton_id, std::move(tile_graph)); } -Simulator* create_simulator(std::string config_path) { - YAML::Node config_yaml; - if (!loadConfig(config_path, config_yaml)) - exit(1); - SimulationConfig config = initialize_config(config_yaml); - - auto simulator = new Simulator(config); - return simulator; -} +void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) { + // Open trace file (can be FIFO or regular file) + std::ifstream trace_file; + trace_file.open(trace_file_path); + if (!trace_file.is_open()) { + spdlog::error("[TOGSim] Failed to open trace file: {}", trace_file_path); + return; + } + spdlog::info("[TOGSim] Reading from trace file: {}", trace_file_path); -int until(Simulator *simulator, cycle_type until_cycle) { - return simulator->until(until_cycle); -} + // Read all available commands and process them + std::string line; + while (std::getline(trace_file, line)) { + if (line.empty()) { + continue; + } -void interactive_mode(Simulator* simulator) { - std::string command; + // Parse command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp + std::istringstream iss(line); + std::string token; + std::vector tokens; - std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; - while (std::getline(std::cin, command)) { + while (std::getline(iss, token, ',')) { + tokens.push_back(token); + } - std::istringstream iss(command); - std::string token; - // Parse the first part of the command (e.g., "launch", "until", "quit") - iss >> token; - if 
(token == "launch") { - std::string onnx_path, attribute_path, config_path; - cycle_type request_time = 0; - int partition_id = 0; - iss >> config_path >> onnx_path >> attribute_path >> request_time >> partition_id; - - // Check if both paths were provided - if (onnx_path.empty() || attribute_path.empty()) { - spdlog::error("Error: Please provide both ONNX path and Attribute path in the format: launch onnx/path attribute/path"); - } else { - launchKernel(simulator, onnx_path, attribute_path, config_path, request_time, partition_id); - std::cerr << "launch done" << std::endl; - } - } else if (token == "until") { - cycle_type until_cycle; - iss >> until_cycle; - int reason; + if (tokens.size() != 7) { + spdlog::error("[TOGSim] Invalid command format. Expected: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp. Got: {} ({} tokens)", line, tokens.size()); + continue; + } - if (iss.fail()) { - spdlog::error("Error: Please provide a valid cycle number after 'until'"); + std::string command_type = tokens[0]; + unsigned int kernel_id = std::stoul(tokens[1]); + int device_index = std::stoi(tokens[2]); + int stream_index = std::stoi(tokens[3]); + std::string tog_path = tokens[4]; + std::string attribute_path = tokens[5]; + int timestamp = std::stoi(tokens[6]); + // timestamp (tokens[6]) is available but not used in current implementation + + try { + if (command_type == "LAUNCH_KERNEL") { + launchKernel(simulator, kernel_id, tog_path, attribute_path, config_yaml, timestamp, stream_index, device_index); + } else if (command_type == "DEVICE_SYNC") { + simulator->cycle(); + spdlog::info("[Device {}] Device synchronization completed", device_index); } else { - reason = simulator->until(until_cycle); - std::cerr << " Until finished: " << reason << std::endl; + spdlog::error("[TOGSim] Unknown command type: {}", command_type); } - } else if (token == "cycle") { - cycle_type current_cycle = simulator->get_core_cycle(); - std::cerr << "Current cycle: " 
<< current_cycle << std::endl; - }else if (token == "quit") { - std::cerr << "Quit" << std::endl; - break; - } else { - spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token); + } catch (const std::exception& e) { + spdlog::error("[TOGSim] Error processing command {} (type: {}): {}", kernel_id, command_type, e.what()); } - if (isDryRun) - std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> "; } + trace_file.close(); simulator->cycle(); - if (simulator->get_core_cycle()==0) - simulator->until(0); - simulator->print_core_stat(); +} + +Simulator* create_simulator(const YAML::Node& config_yaml) { + SimulationConfig config = initialize_config(config_yaml); + + auto simulator = new Simulator(config); + return simulator; } int main(int argc, char** argv) { @@ -94,13 +93,9 @@ int main(int argc, char** argv) { cmd_parser.add_command_line_option( "config", "Path for hardware configuration file"); cmd_parser.add_command_line_option( - "models_list", "Path for the models list file"); - cmd_parser.add_command_line_option( - "attributes_list", "Path for the models list file"); + "models_list", "Path for the models list file (can be FIFO or regular file)"); cmd_parser.add_command_line_option( "log_level", "Set for log level [trace, debug, info], default = info"); - cmd_parser.add_command_line_option( - "mode", "choose \"trace\" moode and \"iteractive\" mode"); try { cmd_parser.parse(argc, argv); } catch (const CommandLineParser::ParsingError& e) { @@ -120,29 +115,31 @@ int main(int argc, char** argv) { spdlog::set_level(spdlog::level::info); std::string config_path; - std::string onnx_path; - std::string attribute_path; - std::string execution_mode = "trace"; + std::string trace_file_path; /* Create simulator */ cmd_parser.set_if_defined("config", &config_path); - cmd_parser.set_if_defined("mode", &execution_mode); - auto simulator = create_simulator(config_path); - - if (execution_mode.compare("trace") == 0) { - /* Get needed 
info for launch kernel */ - cmd_parser.set_if_defined("models_list", &onnx_path); - cmd_parser.set_if_defined("attributes_list", &attribute_path); - - /* launch kernels */ - launchKernel(simulator, onnx_path, attribute_path, config_path); - simulator->run_simulator(); - if (simulator->get_core_cycle()==0) - simulator->until(1); + + // Load config once for reuse + YAML::Node config_yaml; + if (!loadConfig(config_path, config_yaml)) { + spdlog::error("[TOGSim] Failed to load config file: {}", config_path); + exit(1); + } + + auto simulator = create_simulator(config_yaml); + + // Get trace file path + cmd_parser.set_if_defined("models_list", &trace_file_path); + + if (!trace_file_path.empty()) { + // Process trace file (unified mode: supports both FIFO and regular file) + process_trace_file(simulator, trace_file_path, config_yaml); + spdlog::info("Simulation finished"); simulator->print_core_stat(); - } else if (execution_mode.compare("interactive") == 0) { - /* Get onnx_path, attribute from user input, request_time */ - interactive_mode(simulator); + } else { + spdlog::error("No trace file provided. 
Use --models_list to specify trace file path."); + exit(1); } delete simulator; diff --git a/TOGSim/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc index bb5d29cf..b801fc05 100644 --- a/TOGSim/src/scheduler/Scheduler.cc +++ b/TOGSim/src/scheduler/Scheduler.cc @@ -5,8 +5,6 @@ Scheduler::Scheduler(SimulationConfig config, const cycle_type* core_cycle, cons } void Scheduler::schedule_graph(std::unique_ptr tile_graph) { - spdlog::info("[Scheduler {}] Tile Graph {} Scheduled", _id, "FIFO"); // TODO: tile graph id - // _tile_graph = TileGraphScheduler->get_tile_graph(); _tile_graph.push_back(std::move(tile_graph)); refresh_status(); } @@ -25,6 +23,10 @@ std::shared_ptr Scheduler::get_tile(int core_id, int slot_id) { return tile; } else { tile = std::move(_tile_graph.at(0)->get_tile(core_id, slot_id)); + // Record start_time when first non-EMPTY tile is issued + if (tile->get_status() != Tile::Status::EMPTY && _tile_graph.at(0)->get_start_time() == 0) { + _tile_graph.at(0)->set_start_time(*_core_cycle); + } } refresh_status(); return tile; @@ -48,11 +50,22 @@ void Scheduler::refresh_status() { /* Remove finished request */ if (_tile_graph.at(0)->is_finished()) { - spdlog::info("[Scheduler {}] Graph path: {} operation: {} finish at {}", - _id, _tile_graph.at(0)->get_graph_path(), + unsigned int kernel_id = _tile_graph.at(0)->get_kernel_id(); + cycle_type start_time = _tile_graph.at(0)->get_start_time(); + cycle_type compute_time = 0; + if (start_time > 0) { + compute_time = *_core_cycle - start_time; + } else { + // Fallback to arrival_time if start_time was not recorded + start_time = _tile_graph.at(0)->get_arrival_time(); + compute_time = *_core_cycle - start_time; + } + + spdlog::info("[Scheduler {}] Kernel {} has completed - TOG path: {} operation: {} finished at cycle {}", + _id, kernel_id, _tile_graph.at(0)->get_graph_path(), _tile_graph.at(0)->get_name(), *_core_cycle); - spdlog::info("Total compute time {}", - *_core_cycle - 
_tile_graph.at(0)->get_arrival_time()); + spdlog::info("[Scheduler {}] Kernel {} execution summary - Started at: {} cycles, Total compute time: {} cycles", + _id, kernel_id, start_time, compute_time); _tile_graph.pop_front(); } } \ No newline at end of file diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py index e8013da7..0e4b5812 100644 --- a/scripts/stonne_experiment2/tog_gen.py +++ b/scripts/stonne_experiment2/tog_gen.py @@ -71,10 +71,8 @@ def extract_simulation_stats(result_path): if "outerPro" in path: continue tog_path = os.path.join(path, "tile_graph.onnx") - togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim") stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.yml' - backsim = TOGSimulator(togsim_path, stonne_config_path) - result_path = backsim.simulation(tog_path) + result_path = TOGSimulator.run_standalone(tog_path, config_path=stonne_config_path) nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path) sim_time, total_cycle = float(sim_time), int(total_cycle) print(f"[TLS] Cycle={total_cycle} Sim time={sim_time} nr_multiplications={nr_multiplications}") diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 9c7ca255..724c10d0 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -3,41 +3,25 @@ import torch from torchvision.models import resnet18 as model1 from test_transformer import EncoderBlock as model2 +from Simulator.simulator import TOGSimulator base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -sys.path.append(base_path) -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml' +os.environ['TOGSIM_CONFIG'] = config target_model1 = model1().eval() target_model2 = model2(768, 12).eval() -# Init scheduler -scheduler = Scheduler(num_request_queue=2, 
engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) -# Register compiled model -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("resnet18", opt_model1) -SchedulerDNNModel.register_model("bert", opt_model2) - -# Init input data -model_input1 = torch.randn(1, 3, 224, 224) -model_input2 = torch.randn(128, 768) - -# Init request -new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) -new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0) -new_request4 = Request("bert", [model_input2], [], request_queue_idx=1) - -# Add request to scheduler -scheduler.add_request(new_request1, request_time=0) -scheduler.add_request(new_request2, request_time=0) -scheduler.add_request(new_request3, request_time=0) -scheduler.add_request(new_request4, request_time=0) - -# Run scheduler -while not scheduler.is_finished(): - scheduler.schedule() - +device = torch.device("npu:0") +opt_model1 = torch.compile(target_model1.to(device=device, memory_format=torch.channels_last)) +opt_model2 = torch.compile(target_model2.to(device=device)) +model_input1 = torch.randn(1, 3, 224, 224).to(device=device) +model_input2 = torch.randn(128, 768).to(device=device) + +with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0) + torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0) + torch.npu.synchronize() + torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0) + torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0) print("Done") \ No newline at end of file diff --git a/tests/test_stream.py b/tests/test_stream.py index 
70077abe..70b2c34b 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -1,22 +1,12 @@ import torch import time -start_event = torch.npu.event(enable_timing=True) -end_event = torch.npu.event(enable_timing=True) -stream = torch.npu.stream() - def my_kernel(): print("Task is running...") result = sum(range(1000)) time.sleep(2.5) print(f"Task completed with result: {result}") -start_event.record(stream) -stream.launch_kernel(my_kernel) -end_event.record(stream) - - -stream.synchronize() - -elapsed_time = end_event.elapsed_time(start_event) -print("Event has completed! ", elapsed_time) \ No newline at end of file +torch.npu.launch_kernel(my_kernel) +torch.npu.synchronize() +print("Task completed!") \ No newline at end of file From 09753bc60be1bae818fffe429822bac619eb1722 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Feb 2026 05:04:47 +0000 Subject: [PATCH 098/194] [TOGSim] Rename scheduler_graph to enqueue_graph --- TOGSim/include/Simulator.h | 11 +++++++++-- TOGSim/include/scheduler/Scheduler.h | 2 +- TOGSim/src/TileGraphParser.cc | 6 +++--- TOGSim/src/main.cc | 4 ++-- TOGSim/src/scheduler/Scheduler.cc | 2 +- tests/test_stream.py | 12 ------------ 6 files changed, 16 insertions(+), 21 deletions(-) delete mode 100644 tests/test_stream.py diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h index 39fa310e..a0b8b9c5 100644 --- a/TOGSim/include/Simulator.h +++ b/TOGSim/include/Simulator.h @@ -24,8 +24,15 @@ namespace fs = std::filesystem; class Simulator { public: Simulator(SimulationConfig config); - void schedule_graph(int partion_id, std::unique_ptr tile_graph) { - _partition_scheduler.at(partion_id)->schedule_graph(std::move(tile_graph)); + void enqueue_graph(int partion_id, std::unique_ptr tile_graph) { + if (partion_id < 0 || static_cast(partion_id) >= _config.num_partition) { + spdlog::error("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). 
" + "Total partitions: {}", partion_id, _config.num_partition - 1, _config.num_partition); + throw std::runtime_error( + fmt::format("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). " + "Total partitions: {}", partion_id, _config.num_partition - 1, _config.num_partition)); + } + _partition_scheduler.at(partion_id)->enqueue_graph(std::move(tile_graph)); } void run_simulator(); cycle_type get_core_cycle() { return _core_cycles; } diff --git a/TOGSim/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h index 39ab7576..c178a4c5 100644 --- a/TOGSim/include/scheduler/Scheduler.h +++ b/TOGSim/include/scheduler/Scheduler.h @@ -8,7 +8,7 @@ class Scheduler { public: Scheduler(SimulationConfig config, const cycle_type* core_cycle, const uint64_t* core_time, int id); - void schedule_graph(std::unique_ptr tile_graph); + void enqueue_graph(std::unique_ptr tile_graph); void finish_tile(std::shared_ptr tile) { tile->get_owner()->finish_tile(tile); } /* For other schedulers */ diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index fd629f8a..882aba6b 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -706,7 +706,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa uint64_t value = it->second.as(); _arg_to_address[key] = value; - spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value); + spdlog::trace("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value); } } @@ -719,7 +719,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa for (const auto& val : value_list) { _arg_numa_stride[key].push_back(val.as()); } - spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", ")); + spdlog::trace("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", ")); } } @@ 
-754,7 +754,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa /* Get meta data from graph */ for (const auto& meta : model_proto.metadata_props()) { - spdlog::info("[TOGParser] Register Metadata \"{}\": \"{}\"", meta.key(), meta.value()); + spdlog::trace("[TOGParser] Register Metadata \"{}\": \"{}\"", meta.key(), meta.value()); _tog_meta[meta.key()] = meta.value(); } diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 44fb5612..cc73f6db 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -18,8 +18,8 @@ void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx std::unique_ptr& tile_graph = graph_praser.get_tile_graph(); tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle()); tile_graph->set_kernel_id(kernel_id); - spdlog::info("[Scheduler {}] Enqueued kernel id: {} tog: {} operation: {} request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); - simulator->schedule_graph(partiton_id, std::move(tile_graph)); + spdlog::info("[Scheduler {}] Enqueued kernel id: {}, tog_path: {}, operation: {}, request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); + simulator->enqueue_graph(partiton_id, std::move(tile_graph)); } void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) { diff --git a/TOGSim/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc index b801fc05..0be42f27 100644 --- a/TOGSim/src/scheduler/Scheduler.cc +++ b/TOGSim/src/scheduler/Scheduler.cc @@ -4,7 +4,7 @@ Scheduler::Scheduler(SimulationConfig config, const cycle_type* core_cycle, cons : _id(id), _config(config), _core_cycle(core_cycle), _core_time(core_time) { } -void Scheduler::schedule_graph(std::unique_ptr tile_graph) { +void Scheduler::enqueue_graph(std::unique_ptr tile_graph) { _tile_graph.push_back(std::move(tile_graph)); refresh_status(); } diff --git 
a/tests/test_stream.py b/tests/test_stream.py deleted file mode 100644 index 70b2c34b..00000000 --- a/tests/test_stream.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch -import time - -def my_kernel(): - print("Task is running...") - result = sum(range(1000)) - time.sleep(2.5) - print(f"Task completed with result: {result}") - -torch.npu.launch_kernel(my_kernel) -torch.npu.synchronize() -print("Task completed!") \ No newline at end of file From 235bb5c8f1e8d4429112b58fcc5613a71e61a974 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Feb 2026 05:13:43 +0000 Subject: [PATCH 099/194] [TOGSim] Add comments feature in trace files --- Simulator/simulator.py | 2 +- TOGSim/src/main.cc | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 2771d03c..13f2b4f0 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -238,7 +238,7 @@ def __init__(self, config_path=None, togsim_path=None) -> None: self.fifo_dir = os.path.join("/tmp", f"togsim_fifo_{os.getpid()}") os.makedirs(self.fifo_dir, exist_ok=True) self.trace_file_path = os.path.join(self.fifo_dir, "cmd_fifo") - self.trace_log = "" + self.trace_log = "# command_type, kernel_id, device_index, stream_index, tog_path, attribute_path, timestamp\n" # Create FIFOs if they don't exist if os.path.exists(self.trace_file_path): diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index cc73f6db..7c596af5 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -39,6 +39,11 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const continue; } + // Skip comment lines starting with # + if (line[0] == '#') { + continue; + } + // Parse command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp std::istringstream iss(line); std::string token; @@ -119,14 +124,14 @@ int main(int argc, char** argv) { /* Create simulator */ cmd_parser.set_if_defined("config", &config_path); - + // Load 
config once for reuse YAML::Node config_yaml; if (!loadConfig(config_path, config_yaml)) { spdlog::error("[TOGSim] Failed to load config file: {}", config_path); exit(1); } - + auto simulator = create_simulator(config_yaml); // Get trace file path From 9dbe03711484b48efd83b612d730b22929989b26 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Feb 2026 13:58:19 +0000 Subject: [PATCH 100/194] [Eager] Add eager mode POC --- PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp | 8 -------- tests/test_eager.py | 11 ++++++++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp index 39f019c5..21ab3fef 100644 --- a/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp +++ b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp @@ -158,12 +158,4 @@ TORCH_LIBRARY_IMPL(_, PrivateUse1, m) { } // LITERALINCLUDE END: FALLBACK GLOBAL -// LITERALINCLUDE START: FALLBACK SINGLE -TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { - m.impl( - "sub.Tensor", - torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>()); -} -// LITERALINCLUDE END: FALLBACK SINGLE - } // namespace at::openreg diff --git a/tests/test_eager.py b/tests/test_eager.py index 7a2df6e2..9255b681 100644 --- a/tests/test_eager.py +++ b/tests/test_eager.py @@ -1,8 +1,13 @@ import torch +@torch.library.impl("aten::mul.Tensor", "npu") +def my_fallback(x, y): + raise NotImplementedError("Fallback called") + if __name__ == "__main__": + #torch.npu.register_fallback_op("aten::add.out", my_fallback) device = torch.device("npu:0") - x = torch.zeros(10, 10).to(device) - y = torch.zeros(10, 10).to(device) - z = x + y + x = torch.ones(10, 10).to(device) + y = torch.ones(10, 10).to(device) + z = x * y print(z.cpu()) \ No newline at end of file From f9a9f5fa8fa83cdbfb1c7c589ba9f8fd8854e78a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 6 Feb 2026 05:20:10 +0000 Subject: [PATCH 101/194] [Eager] Add eager to graph fallback API --- 
.../torch_openreg/openreg/__init__.py | 39 +++++++++++++++++++ tests/test_eager.py | 5 +-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index 66ec022a..8d62cee3 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -243,6 +243,43 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs): from .random import * # noqa: F403 from .amp import * +def eager_to_compile(op_name): + """ + Register an eager mode operation as a graph-based implementation using torch.compile(). + + Args: + op_name: Operator name (e.g., "aten::mul.Tensor") + + Example: + torch.npu.eager_to_compile("aten::mul.Tensor") + """ + def wrapper(*args, **kwargs): + @torch.compile(dynamic=False) + def dummy_graph(*args, **kwargs): + # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor + namespace, op_path = op_name.split("::", 1) + op_path_parts = op_path.split(".") + op = torch.ops + for part in [namespace] + op_path_parts: + op = getattr(op, part) + return op(*args, **kwargs) + return dummy_graph(*args, **kwargs) + + torch.library.impl(op_name, "npu", wrapper) + +def register_eager_to_compile(ops): + """ + Register multiple operators at once using eager_to_compile. 
+ + Args: + ops: List of operator names (e.g., ["aten::mul.Tensor", "aten::add.Tensor"]) + + Example: + torch.npu.register_eager_to_compile(["aten::mul.Tensor", "aten::add.Tensor"]) + """ + for op_name in ops: + eager_to_compile(op_name) + __all__ = [ "device", "device_count", @@ -269,4 +306,6 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs): "synchronize", "get_tog_simulator", "set_tog_simulator", + "eager_to_compile", + "register_eager_to_compile", ] diff --git a/tests/test_eager.py b/tests/test_eager.py index 9255b681..b84cc6f6 100644 --- a/tests/test_eager.py +++ b/tests/test_eager.py @@ -1,8 +1,6 @@ import torch -@torch.library.impl("aten::mul.Tensor", "npu") -def my_fallback(x, y): - raise NotImplementedError("Fallback called") +torch.npu.register_eager_to_compile(["aten::mul.Tensor", "aten::add.Tensor"]) if __name__ == "__main__": #torch.npu.register_fallback_op("aten::add.out", my_fallback) @@ -10,4 +8,5 @@ def my_fallback(x, y): x = torch.ones(10, 10).to(device) y = torch.ones(10, 10).to(device) z = x * y + z = x + z print(z.cpu()) \ No newline at end of file From a13f37b173050848cea423db71fc758c14e7cf4d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 6 Feb 2026 05:45:57 +0000 Subject: [PATCH 102/194] [Template] Conv warpper minor fix --- PyTorchSimFrontend/mlir/mlir_conv_mt_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_sb_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_template.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index 051d7a0e..da2bc829 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -104,7 +104,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: padded_shape = list(X.shape) padded_shape[2] += 2 * {{ PADDING_H }} padded_shape[3] += 2 * {{ PADDING_W }} - 
X_padding = torch.zeros(padded_shape, device=X.device) + X_padding = torch.zeros(padded_shape).to(device=X.device) X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X # Tanspose inputs diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index c742b3b2..cc284522 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -105,7 +105,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: padded_shape = list(X.shape) padded_shape[2] += 2 * {{ PADDING_H }} padded_shape[3] += 2 * {{ PADDING_W }} - X_padding = torch.zeros(padded_shape, device=X.device) + X_padding = torch.zeros(padded_shape).to(device=X.device) X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X # Tanspose inputs diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 07211bb4..6d768bf2 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -105,7 +105,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: padded_shape = list(X.shape) padded_shape[2] += 2 * {{ PADDING_H }} padded_shape[3] += 2 * {{ PADDING_W }} - X_padding = torch.zeros(padded_shape, device=X.device) + X_padding = torch.zeros(padded_shape).to(device=X.device) X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X # Tanspose inputs diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 46a7f9bf..e2cd61fd 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -109,7 +109,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}: padded_shape = list(X.shape) padded_shape[2] += 2 * {{ PADDING_H }} padded_shape[3] += 2 * {{ 
PADDING_W }} - X_padding = torch.zeros(padded_shape, device=X.device) + X_padding = torch.zeros(padded_shape).to(device=X.device) X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X # Tanspose inputs From e840786efc58ee5771b0b270302ab76ff290eec8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 11 Feb 2026 08:27:45 +0000 Subject: [PATCH 103/194] [Fix] Index_expr ops codegen issue --- .../mlir/mlir_caller_codegen.py | 2 +- .../mlir/mlir_codegen_backend.py | 9 +- PyTorchSimFrontend/mlir/mlir_conv_common.py | 4 +- PyTorchSimFrontend/mlir/mlir_scheduling.py | 4 +- PyTorchSimFrontend/mlir/mlir_template.py | 2 +- tests/Yolov5/test_yolov5.py | 249 +++++++++++++++++- 6 files changed, 245 insertions(+), 25 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index a539bdb9..06d41ea2 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -101,7 +101,7 @@ def generate_args_define(self): bits = 8 else: bits = torch.iinfo(arg_type).bits - buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes + buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) * 2 # Round up to 64 bytes + Add some padding for safety self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}') name_set.add(arg_name) self.writeline(self.newline) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index a60c706e..b52b36d0 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -470,7 +470,6 @@ def load(self, name: str, index: sympy.Expr): tile_numel_per_lane = local_tile_desc.get_numel_per_lane() tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype) tile_stride = local_tile_desc.get_tile_stride() - # Compute vector unit size 
vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype) compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size() @@ -697,7 +696,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): self.reset("recompile") raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})") - tile_size = tile_desc.get_tile_size_per_lane() + tile_size_per_lane = tile_desc.get_tile_size_per_lane() compute_vec_size = tile_desc.get_compute_vec_size() strides = tile_desc.get_tile_stride_per_lane() @@ -707,13 +706,13 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index): # Create tile_dim index dim_list = [] - for idx in range(len(tile_size)): + for idx in range(len(tile_size_per_lane)): # Prepare initial values offset = tile_desc.vmap.vlane_stride #* strides[idx] - outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride + outer_sz = tile_desc.get_tile_size()[idx] // tile_desc.vmap.vlane_stride with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse): div_coeff = self.get_const_cse(strides[idx], "index") - mod_coeff = self.get_const_cse(tile_size[idx], "index") + mod_coeff = self.get_const_cse(tile_size_per_lane[idx], "index") vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index") vlane_outer_coeff = self.get_const_cse(outer_sz, "index") nr_vector_lane = self.get_const_cse(self.vector_lane, "index") diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 1aa99d14..f8566b6d 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -85,7 +85,7 @@ def outer_func_render(self, kernel_name, input_args): options = dict( kernel=self.kernel, KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name + f"_{len(input_args)}", + FUNC_NAME="wrapper_" + kernel_name, INPUT=X, WEIGHT=W, BIAS=Bias, @@ -96,7 +96,7 @@ def 
outer_func_render(self, kernel_name, input_args): input_reorder=self.input_reorder ) code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options) - return code, self.function_name + f"_{len(input_args)}" + return code, "wrapper_" + kernel_name def get_arg_attributes(self): arg_attributes = [] diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 5305cbb7..af960533 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -154,8 +154,8 @@ def can_fuse_horizontal(self, node1, node2): } # Buffers still required by the activation node (unmet) or read by it epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies } - has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes) - if not has_depedency: + has_dependency = bool(template_writes) and epilogue_unmet.issubset(template_writes) and not bool(reads1 & writes2) + if not has_dependency: return False # Revert act_node.group : simplify_and_reorder() modified _body, _size, group diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b864e5f2..556f7e04 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -403,7 +403,7 @@ def call_kernel(self, kernel_name): _, call_args, _, _ = self.kernel_group.args.mlir_argdefs() # generate the code to call this wrapper.generate_kernel_call( - kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args) + kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args) def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): with self as kernel: diff --git a/tests/Yolov5/test_yolov5.py b/tests/Yolov5/test_yolov5.py index d9e6b261..1262dfb9 100644 --- a/tests/Yolov5/test_yolov5.py +++ b/tests/Yolov5/test_yolov5.py @@ -13,39 +13,230 @@ 
import os import shutil - +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) def run_yolo(batch, config): + import copy + device = torch.device("npu:0") torch._dynamo.config.recompile_limit = 64 torch._dynamo.config.cache_size_limit = 128 - + + # Load model and prepare input model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval() url = "https://ultralytics.com/images/zidane.jpg" - + response = requests.get(url) img = Image.open(BytesIO(response.content)).convert("RGB") - + imgsz = 64 transform = transforms.Compose([ transforms.Resize((imgsz, imgsz)), transforms.ToTensor(), ]) - + x = transform(img).unsqueeze(0) # [1, 3, H, W] - x = x.to(device) - - - model.to(device) - x = x.to(device) - - # Compile and run the model with PyTorchSim - compiled_model = torch.compile(dynamic=False)(model) - y = compiled_model(x) + + # CPU version + model_cpu = copy.deepcopy(model).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + # NPU version + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + # Compare results + # YOLOv5 output is typically a list or tensor, handle both cases + if isinstance(y_cpu, (list, tuple)): + for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)): + test_result(f"YOLOv5 Output {i}", out_npu, out_cpu) + else: + test_result("YOLOv5 Output", y_npu, y_cpu) + print("Yolo Simulation Done") +def test_c3_module(device, batch=1, c1=64, c2=128, n=1, h=64, w=64): + import copy + import sys + + # Import C3 module from YOLOv5 + try: + 
# Load model first to ensure hub cache is populated + _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False) + + # Try to import from torch hub cache + hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master") + if os.path.exists(hub_path): + sys.path.insert(0, hub_path) + # Import C3 module + from models.common import C3 # noqa: F401 + except Exception as e: + print(f"Warning: Could not import C3 module: {e}") + print("Skipping C3 module test") + return + + torch.manual_seed(0) + + # Create input tensor + x = torch.randn(batch, c1, h, w) + + # CPU version + model_cpu = C3(c1, c2, n=n, shortcut=True, g=1, e=0.5).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + # NPU version + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + # Compare results + if isinstance(y_cpu, (list, tuple)): + for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)): + test_result(f"C3 Output {i}", out_npu, out_cpu) + else: + test_result("C3 Output", y_npu, y_cpu) + print("C3 Module Test Done") + + +def test_bottleneck_module(device, batch=1, c1=64, c2=64, shortcut=True, g=1, e=0.5, h=16, w=16): + import copy + import sys + + # Import Bottleneck module from YOLOv5 + try: + # Load model first to ensure hub cache is populated + _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False) + + # Try to import from torch hub cache + hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master") + if os.path.exists(hub_path): + sys.path.insert(0, hub_path) + # Import Bottleneck module + from models.common import Bottleneck # noqa: F401 + except Exception as e: + print(f"Warning: Could not import Bottleneck module: {e}") + print("Skipping Bottleneck module test") + return + + torch.manual_seed(0) + + # Create input tensor + x = torch.randn(batch, c1, h, w) + + # CPU version + 
model_cpu = Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + # NPU version + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + # Compare results + test_result("Bottleneck Module", y_npu, y_cpu) + print("Bottleneck Module Test Done") + + +def test_conv_module(device, batch=1, c1=32, c2=64, k=3, s=1, p=None, g=1, d=1, act=True, h=16, w=16): + import copy + import sys + + # Import Conv module from YOLOv5 + try: + # Load model first to ensure hub cache is populated + _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False) + + # Try to import from torch hub cache + hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master") + if os.path.exists(hub_path): + sys.path.insert(0, hub_path) + # Import Conv module + from models.common import Conv # noqa: F401 + except Exception as e: + print(f"Warning: Could not import Conv module: {e}") + print("Skipping Conv module test") + return + + torch.manual_seed(0) + + # Create input tensor + x = torch.randn(batch, c1, h, w) + + # CPU version + model_cpu = Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + # NPU version + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + # Compare results + test_result("Conv Module", y_npu, y_cpu) + print("Conv Module Test Done") + + +def test_concat_4d(device): + """ + Test concatenating 3 tensors along dimension 4 + Shapes: (1, 3, 4, 4, 2), (1, 3, 4, 4, 2), (1, 3, 4, 4, 81) + Result: (1, 3, 4, 4, 85) + """ + import copy + + torch.manual_seed(0) + + # Create 3 input tensors + x1 = torch.ones(1, 3, 4, 4, 2) + x2 = torch.ones(1, 3, 4, 4, 2) * 2 + x3 = torch.ones(1, 3, 
4, 4, 81) * 3 + + # CPU version + x1_cpu = copy.deepcopy(x1).cpu() + x2_cpu = copy.deepcopy(x2).cpu() + x3_cpu = copy.deepcopy(x3).cpu() + y_cpu = torch.cat([x1_cpu, x2_cpu, x3_cpu], dim=4) + + # NPU version + x1_npu = copy.deepcopy(x1).to(device) + x2_npu = copy.deepcopy(x2).to(device) + x3_npu = copy.deepcopy(x3).to(device) + + def concat_fn(x1, x2, x3): + return torch.cat([x1, x2, x3], dim=4) + + compiled_concat = torch.compile(dynamic=False)(concat_fn) + y_npu = compiled_concat(x1_npu, x2_npu, x3_npu) + + # Compare results + test_result("Concat 4D", y_npu, y_cpu) + print(f"Output shape: {y_npu.shape}") + print("Concat 4D Test Done") + if __name__ == "__main__": base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") @@ -59,4 +250,34 @@ def run_yolo(batch, config): args = args.parse_args() batch = args.batch + device = torch.device("npu:0") + + # Test Concat 4D + # print("=" * 80) + # print("Testing Concat 4D") + # print("=" * 80) + # test_concat_4d(device) + + # Test Conv module + # print("\n" + "=" * 80) + # print("Testing Conv Module") + # print("=" * 80) + # test_conv_module(device, batch=batch, c1=32, c2=32, k=1, s=1, p=None, g=1, d=1, act=False, h=16, w=16) + + # Test Bottleneck module + # print("\n" + "=" * 80) + # print("Testing Bottleneck Module") + # print("=" * 80) + # test_bottleneck_module(device, batch=batch, c1=32, c2=32, shortcut=True, g=1, e=0.5, h=16, w=16) + + # Test C3 module + # print("\n" + "=" * 80) + # print("Testing C3 Module") + # print("=" * 80) + # test_c3_module(device, batch=batch, c1=64, c2=64, n=1, h=16, w=16) + + # Test full YOLOv5 model + print("\n" + "=" * 80) + print("Testing Full YOLOv5 Model") + print("=" * 80) run_yolo(batch, config) From f60cbe5b766723c8f4eb1306d9464c7ecc7be85e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Feb 2026 06:25:11 +0000 Subject: [PATCH 104/194] [Codegen] Use ops instead of raw assembly --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b52b36d0..2cff7815 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1432,10 +1432,10 @@ def convert_indirect_indexing(self, index :sympy.Expr): self.spad_buffer_dict[target_dim] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape] # Store the indirect index variable - opeartion = "affine.vector_store" + target_var = self.cse.varname_map[target_dim] compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"]) - line = f"{opeartion} %{target_dim}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}" - self.stores.writeline(line) + with self.override_buffer_cse(buffer=self.stores): + ops._store(target_var, sram_var, compute_index_var, tile_shape) mlir_dtype = vshape.split("x")[1][:-1] with self.override_buffer_cse(buffer=target_dma_buffers): out = ops._load(tile_numel_per_lane, mlir_dtype, sram_var, sram_index_var, tile_shape) From 014cb116c126c87dd7594c8d065c5d41380b7e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= Date: Thu, 19 Feb 2026 15:58:07 +0900 Subject: [PATCH 105/194] [Test] Add DeepSeek v3 base test file and etc. 
(WIP) --- Dockerfile.base | 3 + .../torch_openreg/openreg/__init__.py | 17 +- .../mlir/mlir_codegen_backend.py | 32 +-- PyTorchSimFrontend/mlir/mlir_common.py | 11 +- PyTorchSimFrontend/mlir/mlir_template.py | 4 + tests/DeepSeek/test_deepseek_v3_base.py | 220 ++++++++++++++++++ 6 files changed, 271 insertions(+), 16 deletions(-) create mode 100644 tests/DeepSeek/test_deepseek_v3_base.py diff --git a/Dockerfile.base b/Dockerfile.base index 0fd950d2..e8504bcf 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -45,6 +45,9 @@ RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2 # Install torchsim dependency RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 && pip install "transformers<4.44" && pip install diffusers==0.34.0 +# FlashAttention +RUN python -m pip install --no-build-isolation flash-attn + # Extra Python deps for YOLO/vision tests RUN python -m pip install -U pip setuptools wheel && \ python -m pip install --no-cache-dir --no-deps ultralytics && \ diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index 8d62cee3..f5aabc18 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -80,8 +80,21 @@ def __init__(self, flags=0): self._stream = torch_openreg._C._stream_create() def __del__(self): - if hasattr(self, '_stream'): - torch_openreg._C._stream_destroy(self._stream) + # Interpreter shutdown can clear module globals before __del__ runs. + # Only destroy when both runtime handle and stream are still valid. 
+ stream = getattr(self, "_stream", None) + backend = globals().get("torch_openreg", None) + c_api = getattr(backend, "_C", None) if backend is not None else None + if stream is None or c_api is None: + return + destroy = getattr(c_api, "_stream_destroy", None) + if destroy is None: + return + try: + destroy(stream) + except (AttributeError, TypeError): + # Ignore cleanup-time teardown ordering issues. + pass def launch_kernel(self, task): """Add a Python callable kernel to this stream. diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 2cff7815..62acd877 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -110,6 +110,7 @@ def write_header(self): aten = torch.ops.aten inductor_ops = torch.ops.inductor assert_size_stride = torch._C._dynamo.guards.assert_size_stride + assert_alignment = torch._C._dynamo.guards.assert_alignment alloc_from_pool = torch.ops.inductor._alloc_from_pool reinterpret_tensor = torch.ops.inductor._reinterpret_tensor custom_async_compile = CustomAsyncCompile() @@ -375,6 +376,10 @@ def _convert_sympy_to_mlir_expr(self, expr, sorted_args): indices.append(str(new_arg)) expr_str = str(expr) + if "ModularIndexing" in expr_str: + def _replace_mod(m): + return f"({m.group(1)} floordiv {m.group(2)}) mod {m.group(3)}" + expr_str = re.sub(r"ModularIndexing\(([^,]+), ([^,]+), ([^)]+)\)", _replace_mod, expr_str) if "//" in expr_str: expr_str = expr_str.replace("//", " floordiv ") return expr_str, indices @@ -1158,30 +1163,28 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe for constraint in sorted_constraints[1:]: index = index.replace(constraint.original_expr, 0) - # Calculate dram stride + # Calculate dram stride in local tile-dim order. + # This keeps dram/sram stride rank aligned with tile rank. 
+ local_dim_to_axis = {dim: axis for axis, dim in enumerate(local_dims)} dram_stride = [0] * local_tile_desc.get_nr_dim() if index.is_Symbol: dim_idx = int(str(index)[5:]) - dram_stride[dim_idx] = 1 + if dim_idx in local_dim_to_axis: + dram_stride[local_dim_to_axis[dim_idx]] = 1 elif index.is_Number: pass else: - dram_dict = defaultdict(list) + dram_dict = defaultdict(lambda: 0) # Assume that div will have high priority than mod for arg in index.as_ordered_terms(): coeff, dim = arg.as_coeff_mul() if len(dim) == 0: continue real_dim = list(dim[0].free_symbols)[0] - dram_dict[str(real_dim)].append(coeff) - # Add missing dims if not added - max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1 - for i in range(max_dim): - target_dim = f"index{i}" - if sympy.Symbol(target_dim) not in index.free_symbols: - dram_dict[target_dim] = [0] - sorted_keys = sorted(dram_dict.keys()) - dram_stride = sum((dram_dict[key] for key in sorted_keys), []) + real_dim_name = str(real_dim) + if real_dim_name.startswith("index"): + dram_dict[int(real_dim_name[5:])] += int(coeff) + dram_stride = [dram_dict[dim] for dim in local_dims] # Support floordiv pattern # FIXME. How to integrate implicit dims and floordiv? 
@@ -1193,6 +1196,9 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe if not str(sub.args[0]).startswith("index"): continue dim_idx = int((str(sub.args[0])[5:])) + if dim_idx not in local_dim_to_axis: + continue + local_dim_idx = local_dim_to_axis[dim_idx] if int(self.kernel_group.tile_desc.get_tile_size()[dim_idx] % sub.args[1]) != 0: # In this case, need to recompile original_tile = self.kernel_group.tile_desc.get_tile_size() @@ -1211,7 +1217,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe # Send recompile signal self.reset("recompile") raise mlir_common.RecompileSignal(f"Tile size {self.kernel_group.tile_desc.get_tile_size()[dim_idx]} is not divisible by {sub.args[1]}") - dim_divisor[dim_idx] = sub.args[1] + dim_divisor[local_dim_idx] = sub.args[1] # Update dram_stride, just insert 0 next to target dim offset = 0 diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index f101b7cb..7eb8f7f1 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -504,7 +504,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N vlane_stride=vlane_stride ) - self.implicit_dim_size = None + self.implicit_dim_size = {} self.nr_rdim = 0 self.offset = sympy.Integer(0) # Dram offset @@ -654,6 +654,11 @@ def reduction(self, dtype, src_dtype, reduction_type, value): def indirect_indexing(self, index_var, size, check, wrap_neg): raise NotImplementedError() + def check_bounds(self, expr, size, lower, upper): + # MLIR backend currently relies on masked paths for out-of-bounds handling. + # Keep this hook as a no-op to satisfy Inductor's check_bounds callback. 
+ return + def codegen_global_init(self): raise NotImplementedError() @@ -964,6 +969,10 @@ def store_reduction(name, index, value): def reduction(dtype, src_dtype, reduction_type, value): return self.reduction(dtype, src_dtype, reduction_type, value) + @staticmethod + def check_bounds(index, size, lower, upper): + return self.check_bounds(index, size, lower, upper) + @staticmethod def _index_expr(tile_size, buffer, renamed_expression, index): return self._index_expr(tile_size, buffer, renamed_expression, index) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 556f7e04..b1c756ba 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -861,6 +861,8 @@ def load_epilogue(self, name: str, index: sympy.Expr): vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() + tile_rank = self.kernel_group.tile_desc.get_nr_dim() + dram_stride = dram_stride[:tile_rank] + [0] * max(tile_rank - len(dram_stride), 0) # Compute vector unit size vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype) @@ -913,6 +915,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype) tile_stride = self.kernel_group.tile_desc.get_tile_stride() + tile_rank = self.kernel_group.tile_desc.get_nr_dim() + dram_stride = dram_stride[:tile_rank] + [0] * max(tile_rank - len(dram_stride), 0) if name not in self.buffer_names: sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index) diff --git a/tests/DeepSeek/test_deepseek_v3_base.py b/tests/DeepSeek/test_deepseek_v3_base.py new file mode 100644 index 00000000..b8402c8b --- /dev/null +++ 
b/tests/DeepSeek/test_deepseek_v3_base.py @@ -0,0 +1,220 @@ +import os +import sys +import argparse +import torch + + +def _dtype_from_str(name: str) -> torch.dtype: + return { + "float32": torch.float32, + "float16": torch.float16, + "bfloat16": torch.bfloat16, + }.get(name, torch.float32) + + +def _build_random_inputs(batch, seq_len, vocab_size, device): + g = torch.Generator().manual_seed(0) + input_ids = torch.randint(0, vocab_size, (batch, seq_len), generator=g, dtype=torch.int64) + return input_ids.to(device) + + +def _safe_scaled_int(value, scale, min_value=1): + return max(min_value, int(round(float(value) * float(scale)))) + + +def _round_to_multiple(value, multiple, min_value=1): + if multiple is None or multiple <= 0: + return max(min_value, int(value)) + v = max(min_value, int(value)) + return max(min_value, ((v + multiple - 1) // multiple) * multiple) + + +def _maybe_scale_config(config, scale=1.0, max_layers=None): + if scale == 1.0 and max_layers is None: + return config + + if hasattr(config, "hidden_size"): + config.hidden_size = _safe_scaled_int(config.hidden_size, scale) + if hasattr(config, "intermediate_size"): + config.intermediate_size = _safe_scaled_int(config.intermediate_size, scale) + if hasattr(config, "num_hidden_layers"): + config.num_hidden_layers = _safe_scaled_int(config.num_hidden_layers, scale) + if hasattr(config, "num_attention_heads"): + config.num_attention_heads = _safe_scaled_int(config.num_attention_heads, scale) + if hasattr(config, "num_key_value_heads"): + config.num_key_value_heads = min( + _safe_scaled_int(config.num_key_value_heads, scale), + config.num_attention_heads, + ) + + for name in [ + "n_routed_experts", + "n_shared_experts", + "num_local_experts", + "num_experts", + "num_experts_per_tok", + "moe_intermediate_size", + "shared_expert_intermediate_size", + ]: + if hasattr(config, name): + setattr(config, name, _safe_scaled_int(getattr(config, name), scale)) + + # DeepSeek MoE gate expects n_routed_experts to be 
divisible by n_group. + if hasattr(config, "n_routed_experts") and hasattr(config, "n_group"): + config.n_routed_experts = _round_to_multiple( + config.n_routed_experts, + config.n_group, + min_value=max(1, int(config.n_group)), + ) + + if max_layers is not None and hasattr(config, "num_hidden_layers"): + config.num_hidden_layers = max(1, min(int(max_layers), int(config.num_hidden_layers))) + + if hasattr(config, "hidden_size") and hasattr(config, "num_attention_heads"): + config.hidden_size = max( + config.num_attention_heads, + (config.hidden_size // config.num_attention_heads) * config.num_attention_heads, + ) + + return config + + +def _apply_preset(scale, max_layers, batch, seq_len, preset): + if preset == "tiny": + return 0.03, 4, 1, min(seq_len, 16) + if preset == "small": + return 0.07, 8, 1, min(seq_len, 32) + if preset == "medium": + return 0.10, 12, 1, min(seq_len, 48) + return scale, max_layers, batch, seq_len + + +@torch.no_grad() +def run_deep_seek_v3_base_test( + model_id, + device, + init_mode="config-random", + scale=1.0, + max_layers=None, + dtype="float16", + batch=1, + seq_len=32, + use_tokenizer=False, + prompt="Hello, DeepSeek V3", + trust_remote_code=False, + revision=None, + compile_model=False, +): + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + + torch_dtype = _dtype_from_str(dtype) + + # Load model config + config = AutoConfig.from_pretrained( + model_id, + trust_remote_code=trust_remote_code, + revision=revision, + ) + + # Some remote model codes expect quantization_config to stay object-like + # (call .to_dict()), so only disable it for pretrained loading path. 
+ if init_mode == "pretrained" and getattr(config, "quantization_config", None) is not None: + config.quantization_config = None + + config = _maybe_scale_config(config, scale=scale, max_layers=max_layers) + + if init_mode == "config-random": + model = AutoModelForCausalLM.from_config( + config=config, + trust_remote_code=trust_remote_code, + ).eval() + model = model.to(dtype=torch_dtype) + elif init_mode == "pretrained": + # Load model(weights) + model = AutoModelForCausalLM.from_pretrained( + model_id, + config=config, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + revision=revision, + ).eval() + else: + raise ValueError(f"Unsupported init mode: {init_mode}") + + model = model.to(device) + model_params = sum(p.numel() for p in model.parameters()) + print("init mode:", init_mode) + print("scaled hidden_size:", getattr(config, "hidden_size", "n/a")) + print("scaled num_hidden_layers:", getattr(config, "num_hidden_layers", "n/a")) + print("scaled num_attention_heads:", getattr(config, "num_attention_heads", "n/a")) + print("model params:", model_params) + + # Load tokenizer + if use_tokenizer: + tokenizer = AutoTokenizer.from_pretrained( + model_id, + trust_remote_code=trust_remote_code, + revision=revision, + ) + encoded = tokenizer(prompt, return_tensors="pt") + input_ids = encoded["input_ids"].to(device) + else: + vocab_size = getattr(config, "vocab_size", None) + if vocab_size is None: + raise ValueError("Config has no vocab_size; use --use-tokenizer or pass a model with vocab_size.") + input_ids = _build_random_inputs(batch, seq_len, vocab_size, device) + + if compile_model: + model = torch.compile(model, dynamic=False) + + out = model(input_ids) + logits = out.logits + + print("logits shape:", tuple(logits.shape)) + print("logits dtype:", logits.dtype) + print("logits max:", logits.max().item()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="DeepSeek V3 download-based test") + 
parser.add_argument("--model-id", type=str, default=os.environ.get("DEEPSEEK_V3_MODEL_ID", "deepseek-ai/DeepSeek-V3-Base")) + parser.add_argument("--revision", type=str, default=None) + parser.add_argument("--trust-remote-code", action="store_true", default=True) + parser.add_argument("--init-mode", type=str, default="config-random", choices=["config-random", "pretrained"]) + parser.add_argument("--preset", type=str, default="tiny", choices=["none", "tiny", "small", "medium"]) + parser.add_argument("--scale", type=float, default=1.0) + parser.add_argument("--max-layers", type=int, default=None) + parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"]) + parser.add_argument("--batch", type=int, default=1) + parser.add_argument("--seq-len", type=int, default=32) + parser.add_argument("--use-tokenizer", action="store_true") + parser.add_argument("--prompt", type=str, default="Hello, DeepSeek V3") + parser.add_argument("--compile", action="store_true", default=True) + + args = parser.parse_args() + + if not args.model_id: + print("Error: --model-id is required (or set DEEPSEEK_V3_MODEL_ID).", file=sys.stderr) + sys.exit(2) + + args.scale, args.max_layers, args.batch, args.seq_len = _apply_preset( + args.scale, args.max_layers, args.batch, args.seq_len, args.preset + ) + + device = torch.device("npu:0") + + run_deep_seek_v3_base_test( + model_id=args.model_id, + device=device, + init_mode=args.init_mode, + scale=args.scale, + max_layers=args.max_layers, + dtype=args.dtype, + batch=args.batch, + seq_len=args.seq_len, + use_tokenizer=args.use_tokenizer, + prompt=args.prompt, + trust_remote_code=args.trust_remote_code, + revision=args.revision, + compile_model=args.compile, + ) From 9a27549ad72880de8046424f8f6102719a549513 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 25 Feb 2026 14:06:30 +0000 Subject: [PATCH 106/194] [Fix] Polish the error handling of dram_stride calculation --- 
.../_C.cpython-311-x86_64-linux-gnu.so | Bin 15312 -> 0 bytes .../mlir/mlir_codegen_backend.py | 76 +++++++++++++++--- PyTorchSimFrontend/mlir/mlir_common.py | 4 + 3 files changed, 68 insertions(+), 12 deletions(-) delete mode 100755 PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so diff --git a/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so deleted file mode 100755 index 04b3b4e1cb7232dbb845c2f33fe24d94c640b705..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15312 zcmeHOU1%It6uz6Z8tqTk(rRqQj?!YQ?If)!+DdHFH0eZ}Hl`^e{*1G`lkCd=q`R}4 z^`Taaf0RN&d{bzB5Px0-1@%F#RHRBDT0sy6DPkd2(W)rbhmPmWoNqIo?urxr|frkcfRy^ z_nl9q2fuy)#V>nU74yk{IsPE;5**X5``+^jnzlib7!9PHpGJIk-H2}!e*wMXjpH)n zOrm$vYk``pj4MLnWzezhi9)GpS3IZe*|xHW#)j>TTXrXM70)e?4fp3uMR&|e<=s%$ zSYoHA9D6)hbn>}JT{Ti0D(1*rzseDApLC0(?!<5@Qza+)T*@nz(^)%}D`s-ViHcb% zsm{`**O@LAGfpSTH!RyeI<#eI8}|_=K5tm(NqHZJe4fBRE_2b=8(M-7`sl`x&vV7O zLOUMe%SR^=eG%bft+3!^gfCxFp{w2yE+xQP4>|g(GoUk|GoUk|GoUk|GoUk|Gw}b< zz_0Ds{%P&~q0QPi`$VTw){}?57XP@l_oKEW!JG5feM)S9`7ye-FYQ&VpJmDEZ+zb$ zKftuVd^btQ+m~)uf!tsIa-FvJ_q>2CBA2(Vbo^7=fA6qBRc?-$GB5}~>pA5%^J;$@BXB~-E^`@QH-kxvx z#@%}MlsDJf*K;NDr-vx;=?q;yo;{D~#QkJjAD`_{KSDx@C!lX2n!Ip7$W=W%#MDh^ zKxaT_KxaT_KxaT_KxaT_KxaT_KxaT_;D3;T#FEzA_`gg3ugf^&`xap@;UwV*;WNzt z+4m0;a^wG4xg{PQRf(x&V#(Y~+YZnlam$Ez4ZV*4zU2=^fQ4vN_z5FAF~7geT0W&eAe;kHAS)1|MMhTH=O~U0i6Mz0i6Mz0i6Mz z0i6Mz0i6Mzfqy3hsLw<_CTcBriTifJrv&F>Sh=a2C-f?*^SoMU)PXJ$8uguvg+@In z-%E0X{I_#{iRm*^+=-ga!&21A^`P83guXl^)gi$-f*pdW1)Y{UP}GbT9?3_ir#(>`YaQUe#E$ z#_Tq`RX63`+6bp z)2`k!;9&+E`FSnSU!dai@@BwaRDWOa@`3O%Pv9S;P7ANTSl&Jh6ez0(G($W^(4yuk zd@dpVo;nHo6$YON@cGdY14IOq#BWrOzPO2gcN6|d;3wi(x~V4mDqZ#}UUhujOsknP zciSu2X)biHDBI4I?1_9S<>c)QRjOBPr#hw5rNU(1_1uiv)mVdz-*dK8E;}{bEqdje z8ZSEq*UnT6g&LVeD4UDo&r_v;X%Rt<9q<2u9gkwr=0lx7FT0IJjeA%pU98v~_^oHY?Q+D*qc- 
z_y5f^0T&kd#~vt9W~El}oD`u~4l#>fvE;etM6qg4mP?av*{dnD&Pr8t`ONBEMgrdA|pA$nzHmfgI=OzBPLJ!J+tF{x`#l!)X`#Z4>IbEk;pSyHB(mHEhK$P1{@ zHk_DH6MY)ODdflrxnz$Nf#W&KOjRn%q@_`y8NYA|UKEG-HQpPrpEP^!2hOo?faYqY zsC~2Nf1oeh4@lKVz29lC!T8uOf&4oyH`v9;zmsIVSHd3qDG>V`Z-ev^t?zr-=XMZ} z_c+*Ne+C{RS+uc_XuFpPjt%zMUx64u9zVu+#eS1G#eNNh9^3OCkM|EgA2K*5oG0qv ztMdJda|@{i4RL_xeI{yeh+Q3_D2T6ZU^QmnD*_<&;082scg%i71VHFDvwu2f|DYHE zKY^iO+~59~J@y$O&V_Ij&);)mkNH=IXbP?p5)ijPL;}t~?7NIm6ZlL>)~}dYxtevQ8uQ;3(A>Yq5`u^Z^KVpyCzp*x?c3~h9 V#z@7tOO5QW>kbW0iya_t{}-!tJ*fZy diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 62acd877..d6ddb025 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1,5 +1,6 @@ import contextlib import sympy +import sys import re import os from functools import reduce @@ -375,13 +376,51 @@ def _convert_sympy_to_mlir_expr(self, expr, sorted_args): expr = expr.replace(target_arg, new_arg) indices.append(str(new_arg)) - expr_str = str(expr) - if "ModularIndexing" in expr_str: - def _replace_mod(m): - return f"({m.group(1)} floordiv {m.group(2)}) mod {m.group(3)}" - expr_str = re.sub(r"ModularIndexing\(([^,]+), ([^,]+), ([^)]+)\)", _replace_mod, expr_str) - if "//" in expr_str: - expr_str = expr_str.replace("//", " floordiv ") + # Convert ModularIndexing and FloorDiv to sympy expressions + # ModularIndexing(x, y, z) means (x // y) % z -> Mod(FloorDiv(x, y), z) + # FloorDiv(x, y) means x // y -> will be converted to floordiv in string representation + # Use preorder_traversal to find all instances + replacements = {} + for sub in sympy.preorder_traversal(expr): + if isinstance(sub, ModularIndexing): + # Convert ModularIndexing to Mod(FloorDiv(...), ...) 
+ if sub.args[1] != 1: + floor_div = FloorDiv(sub.args[0], sub.args[1]) + else: + floor_div = sub.args[0] + mod_expr = sympy.Mod(floor_div, sub.args[2]) + replacements[sub] = mod_expr + elif isinstance(sub, FloorDiv): + # Keep FloorDiv as is, will be handled in custom string conversion + # We need to mark it for special handling + pass + + # Apply replacements + for old_expr, new_expr in replacements.items(): + expr = expr.subs(old_expr, new_expr) + + # Custom string conversion for MLIR affine expressions + def mlir_str(expr): + """Convert sympy expression to MLIR affine expression string""" + if isinstance(expr, FloorDiv): + return f"({mlir_str(expr.args[0])} floordiv {mlir_str(expr.args[1])})" + elif isinstance(expr, sympy.Mod): + return f"({mlir_str(expr.args[0])} mod {mlir_str(expr.args[1])})" + elif isinstance(expr, sympy.Add): + terms = [mlir_str(term) for term in expr.args] + return " + ".join(terms) + elif isinstance(expr, sympy.Mul): + factors = [mlir_str(factor) for factor in expr.args] + return " * ".join(factors) + elif isinstance(expr, sympy.Symbol): + return str(expr) + elif expr.is_number: + return str(expr) + else: + # Fallback to string representation + return str(expr) + + expr_str = mlir_str(expr) return expr_str, indices def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> common.CSEVariable: @@ -1174,17 +1213,30 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe elif index.is_Number: pass else: - dram_dict = defaultdict(lambda: 0) + + dram_dict = defaultdict(list) + implicit_dim_divisors = defaultdict(lambda: sys.maxsize) # Assume that div will have high priority than mod for arg in index.as_ordered_terms(): coeff, dim = arg.as_coeff_mul() if len(dim) == 0: continue real_dim = list(dim[0].free_symbols)[0] - real_dim_name = str(real_dim) - if real_dim_name.startswith("index"): - dram_dict[int(real_dim_name[5:])] += int(coeff) - dram_stride = [dram_dict[dim] for dim in local_dims] + if 
dim[0].has(ModularIndexing): + if dim[0].args[1] < implicit_dim_divisors[str(real_dim)]: + implicit_dim_divisors[str(real_dim)] = dim[0].args[1] + dram_dict[str(real_dim)] = [coeff] + else: + dram_dict[str(real_dim)].append(coeff) + + # Add missing dims if not added + max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1 + for i in range(max_dim): + target_dim = f"index{i}" + if sympy.Symbol(target_dim) not in index.free_symbols: + dram_dict[target_dim] = [0] + sorted_keys = sorted(dram_dict.keys()) + dram_stride = sum((dram_dict[key] for key in sorted_keys), []) # Support floordiv pattern # FIXME. How to integrate implicit dims and floordiv? diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 7eb8f7f1..34b185b8 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -923,6 +923,10 @@ def indirect_indexing(index_var, size, check=True, wrap_neg=True): # Skip CSE since this doesn't return an expression return self.indirect_indexing(index_var, size, check, wrap_neg) + @staticmethod + def check_bounds(index, size, lower, upper): + return self.check_bounds(index, size, lower, upper) + @staticmethod def load(name: str, index: sympy.Expr): index = self.rename_indexing(index) From 9b92f11f5aea7517093f748903c811564125b81b Mon Sep 17 00:00:00 2001 From: jung-min Date: Mon, 2 Mar 2026 07:58:59 +0000 Subject: [PATCH 107/194] [Frontend/template] add SDPA modules --- .../torch_openreg/openreg/__init__.py | 7 +- PyTorchSimFrontend/mlir/mlir_lowering.py | 25 +- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 664 ++++++++++++++++++ PyTorchSimFrontend/mlir/mlir_template.py | 101 ++- tests/test_sdpa.py | 84 +++ 5 files changed, 878 insertions(+), 3 deletions(-) create mode 100644 PyTorchSimFrontend/mlir/mlir_sdpa_template.py create mode 100644 tests/test_sdpa.py diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py 
def tuned_flash_sdpa(
        query : TensorBox,
        key : TensorBox,
        value : TensorBox,
        dropout_p : float = 0.0,
        is_causal : bool = False,
        return_debug_mask : bool = False,
        *,
        scale : float = None) -> tuple:
    """
    Inductor lowering for aten._scaled_dot_product_flash_attention.

    The parameter order follows the ATen schema
    (query, key, value, dropout_p, is_causal, return_debug_mask, *, scale),
    so positional arguments coming from the graph bind to the right names.

    Args:
        query, key, value: Input TensorBoxes; shape checks are delegated to flash_sdpa_args().
        dropout_p: Must be 0.0; the template has no dropout support.
        is_causal: Must be False; the template does not apply a causal mask.
        return_debug_mask: Must be False; no debug mask is produced.
        scale: Softmax scaling factor. When None, 1/sqrt(E) is used, where E is
            the head dimension returned by flash_sdpa_args().

    Returns:
        A 9-tuple whose first element is the output node of the generated
        template; the remaining eight entries are None.

    Raises:
        NotImplementedError: If dropout, causal masking or the debug mask is requested.
    """

    # Only `scale` is forwarded to the template below, so any other option would be
    # dropped silently and produce wrong results. Fail loudly instead.
    if dropout_p != 0.0:
        raise NotImplementedError("Flash SDPA template does not support dropout yet.")
    if is_causal:
        raise NotImplementedError("Flash SDPA template does not support causal masking yet.")
    if return_debug_mask:
        raise NotImplementedError("Flash SDPA template does not support return_debug_mask yet.")

    _, _, _, _, _, E, _, layout, query, key, value = flash_sdpa_args(query, key, value)

    # The scale is rendered into the kernel as a numeric constant, so None must be resolved here.
    if scale is None:
        scale = 1.0 / float(E) ** 0.5

    mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale)

    # _scaled_dot_product_flash_attention has to return a tuple which has 9 values
    # since its backward(_scaled_dot_product_flash_attention_backward) needs that values.
    # (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
    return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None)
def flash_sdpa_args(
        query : TensorBox,
        key : TensorBox,
        value : TensorBox) -> list:
    """
    Arg processing for flash SDPA.
    Its logic is based on mm_args() in torch._inductor.kernel.mm_common.

    Expected shapes:
        query : (n, hq, l, e)
        key   : (n, h, s, e)
        value : (n, h, s, ev)
        out   : (n, hq, l, ev)
    where n is the batch size, hq / h are the head counts of query and key/value,
    l / s are the target and source sequence lengths, and e / ev are the
    embedding dimensions of query/key and of value.

    Returns:
        [n, hq, h, l, s, e, ev, layout, query, key, value], where layout is the
        FixedLayout of the output and query/key/value are the realized inputs.

    Raises:
        NotImplementedError: If e != ev, or if hq != h (GQA).
        ValueError: If e or ev exceeds the number of vector lanes.
    """

    # Materialize input buffers for the codegen backend.
    query, key, value = realize_inputs(query, key, value)

    nq, hq, l, eq = query.get_size()
    nk, hk, sk, ek = key.get_size()
    nv, hv, sv, ev = value.get_size()

    # The batch size must agree across all three inputs: query vs key, then vs value.
    sizevars = V.graph.sizevars
    n = sizevars.guard_equals(nq, nk)
    n = sizevars.guard_equals(n, nv)

    h = sizevars.guard_equals(hk, hv)
    s = sizevars.guard_equals(sk, sv)
    e = sizevars.guard_equals(eq, ek)

    # While there are no theoretical requirements for e == ev,
    # this implementation enforces e == ev for simplicity.
    # Distinct notations are still maintained to ensure future compatibility and clarity.
    if e != ev:
        raise NotImplementedError("Flash SDPA does not support mismatched head dimensions between query and value.")

    # Flash attention does not split tiles along the head dimension (e or ev).
    # Therefore, the head dimension size must be less than or equal to the number of vlanes.
    vector_lane = extension_config.vpu_num_lanes
    if e > vector_lane or ev > vector_lane:
        raise ValueError(f"The head dimension size must be less than or equal to the number of vlanes (e: {e}, ev: {ev}, vlanes: {vector_lane}).")

    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
    # Instead, the Flash SDPA implementation infers GQA usage by checking if hq != hk.
    # The Flash SDPA for GQA will be implemented after implementing its native version.
    if hq != h :
        raise NotImplementedError("Flash SDPA for GQA is not supported yet.")

    layout = FixedLayout(
        query.get_device(),
        query.get_dtype(),
        [n, hq, l, ev]
    )

    return [n, hq, h, l, s, e, ev, layout, query, key, value]
def validate_sdpa_input(
        query : torch.Tensor,
        key : torch.Tensor,
        value : torch.Tensor,
        attn_mask : torch.Tensor = None,
        dropout_p : float = 0.0,
        is_casual : bool = False,
        scale : float = None,
        enable_gqa : bool = False) -> None:
    """
    Validates input tensors and parameters for Scaled Dot Product Attention (SDPA).
    This function's logic can be found in:
    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp
    https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html

    Raises:
        TypeError: On non-Tensor inputs, mismatched dtypes, or an attn_mask with
            an unsupported dtype.
        ValueError: On mismatched devices, inputs below 2D, nested tensors combined
            with an explicit mask, out-of-range dropout_p, attn_mask combined with
            is_casual, non-positive scale, or inconsistent head counts.
    """

    # Tensor class, dtype, and device consistency
    # Ensure all primary inputs are torch.Tensors
    if not all(isinstance(t, torch.Tensor) for t in [query, key, value]):
        raise TypeError(
            f"Expected query, key and value to be Tensors, but got "
            f"{type(query).__name__}, {type(key).__name__}, and {type(value).__name__}."
        )

    # Check for dtype mismatch
    if query.dtype != key.dtype or query.dtype != value.dtype:
        raise TypeError(
            f"Expected query, key, and value to have the same dtype, "
            f"but got {query.dtype}, {key.dtype}, and {value.dtype}."
        )

    # Check for device mismatch (e.g., mixing CPU and NPU)
    if query.device != key.device or query.device != value.device:
        raise ValueError(
            f"Expected query, key, and value to be on the same device, "
            f"but got {query.device}, {key.device}, and {value.device}."
        )

    # Shape and dimension validation
    # SDPA typically expects 4D (B, H, S, D), but we check for at least 2D here
    if any(t.dim() < 2 for t in [query, key, value]):
        raise ValueError(
            f"Expected query, key, and value to be at least 2D, "
            f"but got Q:{query.dim()}D, K:{key.dim()}D, V:{value.dim()}D."
        )

    # Attention mask validation
    if attn_mask is not None:
        if not isinstance(attn_mask, torch.Tensor):
            raise TypeError(f"Expected attn_mask to be a Tensor, but got {type(attn_mask).__name__}.")

        # Dtype check: floating point masks must match query dtype; bool masks are also allowed
        if attn_mask.dtype.is_floating_point:
            if attn_mask.dtype != query.dtype:
                raise TypeError(f"Floating point attn_mask must match query dtype ({query.dtype}), but got {attn_mask.dtype}.")
        elif attn_mask.dtype != torch.bool:
            raise TypeError(f"attn_mask must be floating point or bool, but got {attn_mask.dtype}.")

        # Nested tensor limitation with explicit masking
        if query.is_nested or key.is_nested:
            raise ValueError("Nested tensors are not supported when an explicit attn_mask is set.")

    # Dropout and causal flag validation (added)
    # Dropout probability must be in the range [0, 1)
    if not (0.0 <= dropout_p < 1.0):
        raise ValueError(f"Expected dropout_p to be in [0, 1), but got {dropout_p}.")

    # Mutual exclusivity: cannot use both explicit mask and causal flag (added)
    if is_casual and attn_mask is not None:
        raise ValueError("Both attn_mask and is_casual cannot be set at the same time.")

    # Scaling factor validation (added)
    if scale is not None and scale <= 0.0:
        raise ValueError(f"Expected scale to be a positive number, but got {scale}.")

    # GQA (Grouped Query Attention) constraints (added)
    n_head_q = query.size(1)
    n_head_k = key.size(1)
    n_head_v = value.size(1)

    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
    # Instead, the Flash SDPA implementation infers GQA usage by checking if n_head_q != n_head_k.
    if not enable_gqa and n_head_q != n_head_k:
        raise ValueError(f"Query and Key must have the same number of heads when enable_gqa is false (Q:{n_head_q} vs K:{n_head_k}).")

    if n_head_k != n_head_v:
        raise ValueError(f"Key and Value must have the same number of heads (K:{n_head_k} vs V:{n_head_v}).")

    # enable_gqa with equal head counts is a group size of one, i.e. plain
    # multi-head attention, so it is accepted rather than rejected.
    # Query heads must be an integer multiple of key heads for grouping.
    if enable_gqa and n_head_q % n_head_k != 0:
        raise ValueError(
            f"Number of query heads ({n_head_q}) must be divisible by "
            f"number of key heads ({n_head_k}) for GQA."
        )

def convert_boolean_attn_mask(attn_mask: Optional[torch.Tensor], target_dtype: torch.dtype) -> Optional[torch.Tensor]:
    """
    Equivalent to the C++ 'convert_boolean_attn_mask' function.
    Converts a boolean mask to a floating-point mask for SDPA.

    True entries become 0 and False entries become the most negative finite value
    of target_dtype. A None mask or a non-boolean mask is returned unchanged.
    """

    if attn_mask is not None and attn_mask.dtype == torch.bool:

        new_mask = torch.zeros_like(attn_mask, dtype=target_dtype)
        minus_inf = torch.finfo(target_dtype).min
        new_mask.masked_fill_(attn_mask.logical_not(), minus_inf)

        return new_mask

    return attn_mask

def calculate_scale(query: torch.Tensor, scale: Optional[float]) -> float:
    """
    Calculate the scaling factor based on the head dimension if scale is None
    Otherwise, use the provided scale.
    """
    if scale is None:
        return 1.0 / math.sqrt(query.size(-1))
    else:
        return scale

def patched_scaled_dot_product_attention(
        query_ : torch.Tensor,
        key : torch.Tensor,
        value : torch.Tensor,
        dropout_p : float = 0.0,
        is_casual : bool = False,
        attn_mask_ : torch.Tensor = None,
        scale_ : float = None,
        enable_gqa : bool = None,
        orig_fn = torch._C._nn.scaled_dot_product_attention,
        *,
        attn_mask : torch.Tensor = None,
        is_causal : bool = None,
        scale : float = None) -> torch.Tensor :
    """
    Custom patch for Scaled Dot Product Attention (SDPA) to intercept high-level calls.
    For NPU devices, it redirects execution to specific ATen kernels based on global flags.
    For all other devices, it forwards every argument to the original implementation.
    Dispatcher logic reference:
    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp

    This function acts as a custom override that replaces the default PyTorch SDPA implementation,
    invoked via 'PyTorchSim/PyTorchSimDevice/torch_openreg/openreg/__init__.py'.

    Calling conventions accepted:
        * The historical order of this patch:
          (query, key, value, dropout_p, is_casual, attn_mask_, scale_, enable_gqa).
        * The native order (query, key, value, attn_mask, dropout_p, is_causal),
          detected when the 4th positional argument is a Tensor or None.
        * The native keyword names attn_mask=, is_causal= and scale=, which take
          precedence over their historical counterparts when given.

    Raises:
        NotImplementedError: On NPU when flash SDPA is disabled and memory-efficient
            SDPA is enabled, since that kernel is not implemented.
    """

    # A Tensor (or None) can never be a dropout probability, so such a value in the
    # 4th slot means the caller used the native positional order. Re-map the slots.
    if dropout_p is None or isinstance(dropout_p, torch.Tensor):
        native_mask = dropout_p
        dropout_p = float(is_casual)
        is_casual = bool(attn_mask_) if attn_mask_ is not None else False
        attn_mask_ = native_mask

    # Native keyword names override the historical ones.
    if attn_mask is not None:
        attn_mask_ = attn_mask
    if is_causal is not None:
        is_casual = is_causal
    if scale is not None:
        scale_ = scale

    # Fallback: Delegate to the original C++ Dispatcher for other devices,
    # forwarding every option instead of only query/key/value.
    if "npu" not in str(query_.device):
        fallback_kwargs = {
            "attn_mask" : attn_mask_,
            "dropout_p" : dropout_p,
            "is_causal" : is_casual,
            "scale"     : scale_
        }
        # Only forwarded when requested, so an original function without this keyword still works.
        if enable_gqa:
            fallback_kwargs["enable_gqa"] = True
        return orig_fn(query_, key, value, **fallback_kwargs)

    # Device-specific Dispatching: redirect to specialized kernels on NPU
    validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_casual, scale_, enable_gqa)
    float_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype)

    # Kernel selection logic: emulate C++ dispatcher priority
    # Selection priority(can be changed): flash attention > memory efficient > math (cuDNN is not supported)
    aten = torch.ops.aten
    resolved_scale = calculate_scale(query_, scale_)
    flash_enabled = flash_sdp_enabled()

    # The flash kernel takes no mask argument, so it is only eligible without one;
    # a masked call goes to the math kernel instead of silently dropping the mask.
    if flash_enabled and float_mask is None:
        # Skip padding query, key and value for alignment.
        dispatch_kwargs = {
            "dropout_p"         : dropout_p,
            "is_causal"         : is_casual,
            "return_debug_mask" : False,
            "scale"             : resolved_scale
        }

        out_lse_softmax = aten._scaled_dot_product_flash_attention(
            query_, key, value, **dispatch_kwargs
        )

        return out_lse_softmax[0]

    if not flash_enabled and mem_efficient_sdp_enabled():
        # out_and_lse = aten._scaled_dot_product_efficient_attention(...)
        # return out_and_lse[0]
        raise NotImplementedError("Memory efficient SDPA is not implemented yet.")

    dispatch_kwargs = {
        "attn_mask"    : float_mask,
        "dropout_p"    : dropout_p,
        "is_causal"    : is_casual,
        "dropout_mask" : None,
        "scale"        : resolved_scale,
        # The ATen argument is a bool; the default of this function is None.
        "enable_gqa"   : bool(enable_gqa)
    }

    out_lse_softmax = aten._scaled_dot_product_attention_math(
        query_,
        key,
        value,
        **dispatch_kwargs)

    return out_lse_softmax[0]
: vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}> + %v0_2x = arith.constant dense<0.0> : vector<2x{{ data_stype }}> + + %v_neg_inf_c = arith.constant dense<-1.0e+30> : vector<{{ chunk_size }}x{{ data_stype }}> + %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ data_stype }}> + + %v_scale = vector.broadcast %c_scale : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}> + + {{ kernel.def_local_vars(indent_size=2) }} + + affine.for %index0 = 0 to {{ b }} { + affine.for %index3 = 0 to 1 step 1 { + affine.for %index1 = 0 to {{ l }} step {{ tile_l }} { + {{ kernel.def_dma_op("MVIN", "query", q_idx, q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8) }} + + affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}> + affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + + %qt_buffer2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ q_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1> + %ot_buffer2D = memref.reinterpret_cast %out_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ out_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1> + + affine.for %index2 = 0 to {{ s }} step {{ tile_s }} { + {{ kernel.def_dma_op("MVIN", "key", k_idx, k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }} + {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }} + + affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ 
mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}> + + %k_buffer2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1> + %vt_buffer2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1> + + + // key @ query.t and scaling. + linalg.matmul + ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) + outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }}) + + %raw_mul_vec = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> + %scaled_mul_vec = arith.mulf %raw_mul_vec, %v_scale : vector<{{ tile_s }}x{{ data_stype }}> + affine.vector_store %scaled_mul_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> + + + // Find new max. 
+ %old_max = affine.vector_load %max_buffer[0,0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + + %chunk_max_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_max=%v_neg_inf_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) { + %chunk_val = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}> + %local_max = arith.maximumf %chunk_val, %iter_max : vector<{{ chunk_size }}x{{ data_stype }}> + affine.yield %local_max : vector<{{ chunk_size }}x{{ data_stype }}> + } + + %max_cast = vector.shape_cast %chunk_max_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}> + %max_reduced_1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}> + %max_shuffled = vector.shuffle %max_reduced_1, %max_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}> + %max_reduced_2 = arith.maximumf %max_reduced_1, %max_shuffled : vector<2x{{ data_stype }}> + + %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}> + affine.vector_store %new_max, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + + + // Compute rescale factors: exp(old_max - new_max) + %max_diff = arith.subf %old_max, %new_max : vector<2x{{ data_stype }}> + %max_diff_scalar = vector.extract %max_diff[0] : {{ data_stype }} from vector<2x{{ data_stype }}> + + %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> + %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> + + %rescale_bcast_2 = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<2x{{ data_stype }}> + %exp_rescale_2 = math.exp %rescale_bcast_2 : vector<2x{{ data_stype }}> + + + // Rescale previous out and sum accumulators + %old_out = 
affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> + %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ data_stype }}> + affine.vector_store %rescaled_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> + + %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ data_stype }}> + + + // Shift scores and apply exp: exp(x - new_max) + %scaled_scores_reload = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> + %new_max_scalar = vector.extract %new_max[0] : {{ data_stype }} from vector<2x{{ data_stype }}> + %new_max_bcast = vector.broadcast %new_max_scalar : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}> + + %shifted_scores = arith.subf %scaled_scores_reload, %new_max_bcast : vector<{{ tile_s }}x{{ data_stype }}> + %exp_scores = math.exp %shifted_scores : vector<{{ tile_s }}x{{ data_stype }}> + affine.vector_store %exp_scores, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> + + + // accumulate current sum + %chunk_sum_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_sum=%v0_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) { + %chunk_exp = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}> + %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}> + affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}> + } + + %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}> + %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ 
data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}> + %sum_reduced_1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}> + %sum_shuffled = vector.shuffle %sum_reduced_1, %sum_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}> + %sum_reduced_2 = arith.addf %sum_reduced_1, %sum_shuffled : vector<2x{{ data_stype }}> + + %new_sum = arith.addf %sum_reduced_2, %rescaled_sum : vector<2x{{ data_stype }}> + affine.vector_store %new_sum, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + + + // value.t @ mul + linalg.matmul + { idx_map = array } + ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }}) + outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) + } + + // out @ row_sum^(-1) + %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + %one_2x = vector.broadcast %c1 : {{ data_stype }} to vector<2x{{ data_stype }}> + + %reciprocal_row_sum_2x = arith.divf %one_2x, %final_row_sum : vector<2x{{ data_stype }}> + %reciprocal_scalar = vector.extract %reciprocal_row_sum_2x[0] : {{ data_stype }} from vector<2x{{ data_stype }}> + %reciprocal_bcast_e = vector.broadcast %reciprocal_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> + + %accumulated_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> + %stable_final_out = arith.mulf %accumulated_out, %reciprocal_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> + affine.vector_store %stable_final_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> + + {{ kernel.store_output(indent_size=8) }} + } { accumulation_loop=true } + } { outer_loop=true } + } { 
outer_loop=true } + return +} +""" + +class MLIRFlashSDPATemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, scale, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.scale = scale + + def render(self, + kernel: MLIRTemplateKernel, + template_buffer_node = None, + epilogue_nodes: Optional[List[IRNode]] = None, + prologue_nodes: Optional[List[IRNode]] = None, + tile_info = None, + **kwargs): + + # Except for kernel, other arguments are usually None. + query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + + if tile_info is None: + tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = self.select_tile(kernel, l, s, e, n_extra_node, 0, n_prologue_node)[0] + else: + tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = tile_info + + TOG_latency = l if tile_l > l else tile_l + kernel.loop_size = [TOG_latency, tile_s, tile_e] + + # Select template code + # Other templates will be added according to situations. + nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else [] + if nr_reduction_nodes: + raise NotImplementedError("FLASH_SDPA_REDUCTION_TEMPLATE is not implemented yet.") + elif prologue_nodes: + raise NotImplementedError("FLASH_SDPA_PROLOGUE_TEMPLATE is not implemented yet.") + else: + template = FLASH_SDPA_TEMPLATE + epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2", "index3": "index3"} + nr_rdim = 0 + + # Prepare tile descriptors for input and output tensors. + # Intermediate buffers (transient data) do not require DRAM settings(dram stride and dram indices) + # as they are not synchronized with external DRAM. + # DRAM and SRAM tile shapes must match. 
+ vlane_stride = 1 + + # (n, l, s, e, ev) + loop_dim = [sympy.Symbol("index0"), sympy.Symbol("index1"), sympy.Symbol("index2"), sympy.Symbol("index3")] + + + # Hardware constraint: The tile split axis is restricted. + # To accommodate this, we compute (key @ query.t) instead of (query @ key.t). + # SRAM settings + vlane_split_axis = 1 + q_tile_size = [1, tile_l, tile_e] + q_tile_stride = [0, tile_e, 1] + q_tile_desc = mlir_common.MLIRMultiDimTile(q_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + q_tile_desc.set_tile_size_stride(q_tile_size, q_tile_stride) + q_tile_desc.set_name("q_buffer") + q_tile_desc.offset = query.get_layout().offset + # DRAM settings + q_stride = q_tensor.stride() + q_idx = [loop_dim[0]*q_stride[0], loop_dim[1]*q_stride[1], loop_dim[3]*q_stride[2]] # To keep index arguemnt order, we used index_list + + # Since we use a weight-stationary approach in the Systolic Array (SA), + # the split axis of the first operand differs from a standard linear algebra matmul. + # The first operand (key) must be split along the column axis. + # This logic aligns with the relationship between the dot product's summation direction and the hardware's accumulation direction in the SA. 
+ # SRAM settings + vlane_split_axis = 2 + k_tile_size = [1, tile_s, tile_e] + k_tile_stride = [0, 1, tile_s] + k_tile_desc = mlir_common.MLIRMultiDimTile(k_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + k_tile_desc.set_tile_size_stride(k_tile_size, k_tile_stride) + k_tile_desc.set_name("k_buffer") + k_tile_desc.offset = key.get_layout().offset + # DRAM settings + k_stride = k_tensor.stride() + k_idx = [loop_dim[0]*k_stride[0], loop_dim[2]*k_stride[1], loop_dim[3]*k_stride[2]] + + # Since we compute mul = key @ query.t, we perform out.t = (value.t @ Softmax(mul).t).t, + # which simplifies to (value.t @ Softmax(mul)) + # SRAM settings + vlane_split_axis = 1 + v_tile_size = [1, tile_s, tile_e] + v_tile_stride = [0, tile_e, 1] + v_tile_desc = mlir_common.MLIRMultiDimTile(v_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + v_tile_desc.set_tile_size_stride(v_tile_size, v_tile_stride) + v_tile_desc.set_name("v_buffer") + v_tile_desc.offset = value.get_layout().offset + # DRAM settings + v_stride = v_tensor.stride() + v_idx = [loop_dim[0]*v_stride[0], loop_dim[2]*v_stride[1], loop_dim[3]*v_stride[2]] # To keep index arguemnt order, we used index_list + + # Output is also stored in transposed format to match the value.t @ Softmax(mul) operation. 
+ # SRAM settings + vlane_split_axis = 1 + out_tile_size = [1, tile_l, tile_e] + out_tile_stride=[0, tile_e, 1] + out_tile_desc = mlir_common.MLIRMultiDimTile(out_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + out_tile_desc.set_tile_size_stride(out_tile_size, out_tile_stride) + out_tile_desc.set_name("out_buffer") + # DRAM settings + out_stride = out.get_layout().stride[1:] + out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]] + + # Intermediate buffers + + # For mul = key @ query.t + vlane_split_axis = 1 + mul_tile_size = [tile_s, tile_l] + mul_tile_stride = [tile_l, 1] + mul_tile_desc = mlir_common.MLIRMultiDimTile(mul_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + mul_tile_desc.set_tile_size_stride(mul_tile_size, mul_tile_stride) + mul_tile_desc.set_name("mul_buffer") + #FIXME. What is the offset? -> It doesn't matter at this time. + + # For storing maximum values per row + vlane_split_axis = 0 + max_size = [tile_l, 2] + max_stride = [2, 1] + max_desc = mlir_common.MLIRMultiDimTile(max_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + max_desc.set_tile_size_stride(max_size, max_stride) + max_desc.set_name("max_buffer") + + # For storing summation per row + vlane_split_axis = 0 + sum_size = [tile_l, 2] + sum_stride = [2, 1] + sum_desc = mlir_common.MLIRMultiDimTile(sum_size, kernel.vector_lane, vlane_split_axis, vlane_stride) + sum_desc.set_tile_size_stride(sum_size, sum_stride) + sum_desc.set_name("sum_buffer") + + # For reduction + chunk_size = 16 + + kernel.render_options = dict( + KERNEL_NAME = self.name, + kernel = kernel, + b = b, + l = l, + s = s, + e = e, # Input sizes (dram) + tile_l = tile_l, + tile_s = tile_s, + tile_e = tile_e, # Tile sizes (sram) + subtile_l = subtile_l, + subtile_s = subtile_s, + subtile_e = subtile_e, # Subtile sizes (sram) + data_stype="f32", + query = query, + key = key, + value = value, + out = out, # Inputs and output (dram) + q_idx = q_idx, + 
k_idx = k_idx, + v_idx = v_idx, + out_idx = out_idx, # Strides (dram) + q_tile_desc = q_tile_desc, + k_tile_desc = k_tile_desc, + v_tile_desc = v_tile_desc, + mul_tile_desc = mul_tile_desc, + out_tile_desc = out_tile_desc, # Tile descriptions (sram) + max_desc = max_desc, + sum_desc = sum_desc, # Intermediate buffer descriptions (sram) + scale = self.scale, + chunk_size = chunk_size, + input_reorder = self.input_reorder # ETC + ) + + kernel.epilogue_info = dict( + output_node = self.output_node.name, + sram_var = "out_buffer", + dram_var = "out", + dram_idx = out_idx, + dram_tile_desc = out_tile_desc, + nr_rdim = nr_rdim, + r_dim_size = 0, + dim_aliasing = epilogue_dim_aliasing + ) + + code = self._template_from_string(template).render(**kernel.render_options) + kernel.add_loop_info([kernel.render_options["l"], kernel.render_options["s"], kernel.render_options["e"]], [kernel.render_options["tile_l"], kernel.render_options["tile_s"], kernel.render_options["tile_e"]]) + return code + + def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): + if template_buffer_node is not None: + self.output_node = template_buffer_node + + query = self.input_nodes[0] + key = self.input_nodes[1] + value = self.input_nodes[2] + out = self.output_node + + q_tensor = empty_strided(query.layout.size, query.layout.stride) + k_tensor = empty_strided(key.layout.size, key.layout.stride) + v_tensor = empty_strided(value.layout.size, value.layout.stride) + out_tensor = empty_strided(out.layout.size, out.layout.stride) + + # Flatten batch and head dimensions (n, h) into a single dimension (b = n*h) + q_tensor = q_tensor.view([-1, q_tensor.shape[-2], q_tensor.shape[-1]]) + k_tensor = k_tensor.view([-1, k_tensor.shape[-2], k_tensor.shape[-1]]) + v_tensor = v_tensor.view([-1, v_tensor.shape[-2], v_tensor.shape[-1]]) + out_tensor = out_tensor.view([-1, out_tensor.shape[-2], out_tensor.shape[-1]]) + + b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), 
k_tensor.size(2), v_tensor.size(2) + + n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 + n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 + + return query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node + + # Reuse the existing function in MLIRBMMTemplate. + def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_node): + + # FIXME: Update the method for getting tile candidates once TestDmaFineGrained pass works correctly with Flash Attention. + # tile_candidates = kernel.flash_sdpa_mapping(l, s, e, n_extra_node=n_extra_node) + tile_candidates = [[kernel.vector_lane, kernel.vector_lane, e]] + + for idx, (tile_l, tile_s, tile_e) in enumerate(tile_candidates): + subtile_l = tile_l if (tile_l < kernel.vector_lane) or n_prologue_node else kernel.vector_lane + subtile_s = tile_s # if (tile_s < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + subtile_e = tile_e # if (tile_e < kernel.vector_lane) or prologue_nodes else kernel.vector_lane + + tile_candidates[idx] = tile_l,tile_s,tile_e,subtile_l,subtile_s,subtile_e + + return tile_candidates diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b864e5f2..23f5e3dc 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -387,6 +387,100 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) tile_candidates = [v for _, v in tile_candidates] return tile_candidates + + # Flash Attention requires more SRAM compared to standard GEMM. 
+ # Total buffers needed: query, key, value, out, mul, max, sum + # Tensor Shapes: + # query (tile_l, tile_e), key (tile_s, tile_e), value (tile_s, tile_e), mul (tile_s, tile_l), out(tile_l, tile_e) + # max, sum : (tile_l, 2) + def flash_sdpa_mapping(self, l, s, e, n_extra_node=0, n_prologue_node=0, pad_e=True, min_tile=False, is_conv=False): + tile_candidates = [] + + spad_size_per_lane = self.spad_info["spad_size"] + spad_size = spad_size_per_lane * self.vector_lane + + # Double buffering + max_spad_per_lane = spad_size_per_lane // 2 + max_spad_size = spad_size // 2 + + # Padding for utilization + minimum_tile_size = 8 + minimum_n_tile = self.num_cores if min_tile else 1 + l_pad_factor = self.vector_lane if l > self.vector_lane else minimum_tile_size + s_pad_factor = self.vector_lane if s > self.vector_lane else minimum_tile_size + + pad = lambda x, factor: ((x + factor - 1) // factor) * factor + l_padded = pad(l, l_pad_factor) + s_padded = pad(s, s_pad_factor) + + # Calculate the total number of vector-sized blocks + l_idx = l_padded // self.vector_lane + s_idx = s_padded // self.vector_lane + + # Generate candidates for the number of blocks per tile + l_tile_range = sympy.divisors(l_idx) if l > self.vector_lane else [1] + s_tile_range = sympy.divisors(s_idx) if s > self.vector_lane else [1] + + # Convert block count to actual tile size + maximize_i_j = 1 + max_used_spad_size = 0 + + # Flash Attention does not tile along the head dimension (e or ev). 
+ tile_e = e + + for i in l_tile_range: + tile_l = i * self.vector_lane if l > self.vector_lane else l_padded + for j in s_tile_range: + tile_s = j * self.vector_lane if s > self.vector_lane else s_padded + + # Calculate used spad size + used_spad_size = ( + tile_l * tile_e * (1 + n_prologue_node) # query + + tile_s * tile_e # key + + tile_s * tile_e # value + + tile_s * tile_l # mul + + tile_l * tile_e * (1 + n_extra_node) # out + + (tile_l * 2) * 2 # max, sum + ) * self.precision + + # Calculate used spad size per lane. + query_per_lane = tile_e * (1+n_prologue_node) + key_per_lane = tile_s + value_per_lane = tile_e + mul_per_lane = tile_s + out_per_lane = tile_e * (1 + n_extra_node) + vec_per_lane = 2 * 2 + + used_spad_per_lane = ( + query_per_lane + + key_per_lane + + value_per_lane + + mul_per_lane + + out_per_lane + + vec_per_lane + ) * self.precision + + # Add the validated candidate to the list if it passes all hardware constraints. + n_tile = math.ceil(l / max(tile_l, 128)) * math.ceil(s / max(tile_s, 128)) + check_spad_size = (used_spad_size < max_spad_size and used_spad_per_lane < max_spad_per_lane) + + if (check_spad_size + and max_used_spad_size < used_spad_size # SRAM utilization + and maximize_i_j <= tile_l * tile_s # Larger tile + and n_tile >= minimum_n_tile # Parallelism + and max(tile_s, 128) // max(tile_l, 128) < 10): # Balanced Shape + max_used_spad_size = used_spad_size + maximize_i_j = tile_l * tile_s + + if check_spad_size: + tile_candidates.append((used_spad_size, (tile_l, tile_s, tile_e))) + + # Sort by used_spad_size. + # tile_candidates[0] is the best solution we have. 
+ tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True) + tile_candidates = [v for _, v in tile_candidates] + + return tile_candidates def meta_kernel(self): kernel_arg_attributes = self.kernel_arg_attributes @@ -827,7 +921,12 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0): # Prepare code block with self: - dtype = self.named_nodes[dram_name].get_layout().dtype + try: + dtype = self.named_nodes[dram_name].get_layout().dtype + except (KeyError, AttributeError, TypeError): + import torch + dtype = torch.float32 + tile_shape = tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[dtype]) buffer_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, id, forced_name=dram_name) code = f"%{tile_desc.name} = memref.get_global @{buffer_name} : {tile_shape}" diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py new file mode 100644 index 00000000..9c921eb4 --- /dev/null +++ b/tests/test_sdpa.py @@ -0,0 +1,84 @@ +import sys +import math +import torch +import inspect +from typing import List +import torch.nn.functional as F +from torch.nn.attention import SDPBackend, sdpa_kernel +from torch.fx.passes.graph_drawer import FxGraphDrawer +from torch._inductor.decomposition import decompositions + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + message = f"|{name} Test Passed|" + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_scaled_dot_product_attention(device, backends="flash"): + torch.manual_seed(0) + n_batch_list = [1, 4, 8, 16] + n_head_list = [1, 4, 8, 12] + n_token_list = [128, 256, 512, 1024] + head_dim_list = [32, 64, 128] + + for n_batch in n_batch_list: + for n_head in n_head_list: + for n_token in n_token_list: + for head_dim in head_dim_list: + # 
Inputs + query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) + key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) + value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) + + query = query.to(device=device) + key = key.to(device=device) + value = value.to(device=device) + + # With NPU + if backends == "flash": + backends = [SDPBackend.FLASH_ATTENTION] + elif backends == "math": + backends = [SDPBackend.MATH] + elif backends == "memory_efficient": + backends = [SDPBackend.EFFICIENT_ATTENTION] + else: + backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION] + + with sdpa_kernel(backends=backends): + opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) + out = opt_fn(query, key, value) + + out = out.to(device) + + # With CPU + device = torch.device('cpu') + query = query.to(device=device) + key = key.to(device=device) + value = value.to(device=device) + cpu_out = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False) + + name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})" + test_result(name, out, cpu_out) + + print("All tests passed!") + +def clear_caches(): + import os + from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache + from torch._inductor.codecache import FxGraphCache + AOTAutogradCache.clear() + torch._dynamo.reset() + os.environ["TORCHINDUCTOR_CACHE"] = "0" + FxGraphCache.clear() + +if __name__ == "__main__": + clear_caches() + + device = torch.device('npu:0') + test_scaled_dot_product_attention(device, backends="flash") + \ No newline at end of file From 88e79e06cf329e756862a616a70d37752d74fc21 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 Mar 2026 12:24:53 +0900 Subject: [PATCH 108/194] [CI] Update for torch 2.8 based image --- .github/workflows/docker-base-image.yml | 72 ------------------------- 
.github/workflows/docker-image-2-8.yml | 4 +- .github/workflows/docker-image.yml | 70 ------------------------ 3 files changed, 2 insertions(+), 144 deletions(-) delete mode 100644 .github/workflows/docker-base-image.yml delete mode 100644 .github/workflows/docker-image.yml diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml deleted file mode 100644 index 2c29a11b..00000000 --- a/.github/workflows/docker-base-image.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: Docker Base Image CI - -on: - push: - branches: [ "base" ] - repository_dispatch: - types: [ build_base ] - -jobs: - build: - runs-on: ubuntu-latest - - permissions: - contents: read - packages: write - - steps: - # Step 1: Checkout the repository - - name: Checkout Code - uses: actions/checkout@v4 - - # Step 2: Log in to GitHub Container Registry - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Step 2: Set environemnt - - name: Set environment - env: - GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [ -n "${{ github.event.pull_request.head.sha }}" ]; then - echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" - else - echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{ github.sha }}" - fi - - gem5_response_file=/tmp/releases-gem5-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} - GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file}) - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} - LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file}) - echo 
"LLVM_ASSET_ID=$LLVM_ASSET_ID" - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - spike_response_file=/tmp/releases-spike-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file} - SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file}) - echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" - echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV - - # Step 3: Build and Push Docker Image - - name: Build and Push Docker Image - uses: docker/build-push-action@v4 - with: - context: . - file: ./Dockerfile.base - push: true - build-args: | - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} - tags: ghcr.io/psal-postech/torchsim_base:latest diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml index 4d511a1a..f1e915d6 100644 --- a/.github/workflows/docker-image-2-8.yml +++ b/.github/workflows/docker-image-2-8.yml @@ -1,8 +1,8 @@ name: Docker image CI (PyTorch 2.8) on: - push: - branches: [ "torch_v2.8" ] + pull_request: + branches: [ "master", "develop" ] workflow_dispatch: jobs: diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml deleted file mode 100644 index eba48da2..00000000 --- a/.github/workflows/docker-image.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: Docker image CI - -on: - pull_request: - branches: [ "master", "develop" ] - -jobs: - build-and-test: - runs-on: self-hosted - - permissions: - contents: read - packages: write - - steps: - # Step 1: Checkout the repository - - name: Checkout Code - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - - # Step 2: Log in to GitHub Container Registry - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Step 3: Build and Push Docker Image - - name: Build and Push 
Docker Image - uses: docker/build-push-action@v6 - with: - context: . - file: ./Dockerfile - push: true - no-cache: true - tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} - - # Step 4: Wait for GHCR propagation - - name: Wait for GHCR propagation - run: | - for i in {1..30}; do - echo "Checking if image exists in GHCR (attempt $i)..." - if docker manifest inspect ghcr.io/psal-postech/torchsim-test:${GITHUB_SHA} > /dev/null 2>&1; then - echo "Image is now available in GHCR." - exit 0 - fi - echo "Image not yet available, retrying in 30 seconds..." - sleep 20 - done - echo "Image did not become available in GHCR within expected time." - exit 1 - - test-pytorchsim-wrapper: - needs: build-and-test - uses: ./.github/workflows/pytorchsim_test.yml - with: - image_name: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} - vector_lane: 128 - spad_size: 128 - -# call-test2: -# needs: build-and-test -# uses: ./.github/workflows/pytorchsim_test.yml -# with: -# image_name: ghcr.io/psal-postech/${GITHUB_SHA} -# vector_lane: 8 -# spad_size: 32 \ No newline at end of file From fc247be17221f2b6aa8c52228a2e86b7315ef78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= Date: Mon, 2 Mar 2026 00:28:31 +0900 Subject: [PATCH 109/194] [Template] Add cat & sort template + Multi-output (WIP) --- .../torch_openreg/openreg/__init__.py | 49 +++ PyTorchSimFrontend/mlir/mlir_cat_template.py | 167 +++++++++++ PyTorchSimFrontend/mlir/mlir_common.py | 6 +- PyTorchSimFrontend/mlir/mlir_lowering.py | 281 +++++++++++++++++- PyTorchSimFrontend/mlir/mlir_sort_template.py | 253 ++++++++++++++++ PyTorchSimFrontend/mlir/mlir_template.py | 30 +- tests/DeepSeek/test_deepseek_v3_base.py | 170 +++++++++-- tests/test_cat.py | 89 ++++++ tests/test_sort.py | 112 +++++++ 9 files changed, 1121 insertions(+), 36 deletions(-) create mode 100644 PyTorchSimFrontend/mlir/mlir_cat_template.py create mode 100644 PyTorchSimFrontend/mlir/mlir_sort_template.py create mode 100644 
tests/test_cat.py create mode 100644 tests/test_sort.py diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index f5aabc18..5603a4f7 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -256,6 +256,52 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs): from .random import * # noqa: F403 from .amp import * +def _precheck_cat_out_args(args, kwargs): + tensors = args[0] if len(args) > 0 else kwargs.get("tensors") + dim = args[1] if len(args) > 1 else kwargs.get("dim", 0) + out = kwargs.get("out", args[2] if len(args) > 2 else None) + + if out is None: + return + if not isinstance(tensors, (list, tuple)) or len(tensors) == 0: + raise RuntimeError("aten::cat.out requires non-empty tensor list") + if not all(isinstance(t, torch.Tensor) for t in tensors): + raise RuntimeError("aten::cat.out tensors must be Tensor values") + if not isinstance(out, torch.Tensor): + raise RuntimeError("aten::cat.out out must be a Tensor") + + rank = tensors[0].dim() + if rank == 0: + raise RuntimeError("aten::cat.out does not support scalar inputs") + if dim < 0: + dim += rank + if dim < 0 or dim >= rank: + raise RuntimeError(f"aten::cat.out dim out of range: dim={dim}, rank={rank}") + if any(t.dim() != rank for t in tensors): + raise RuntimeError("aten::cat.out inputs must have the same rank") + if any(t.dtype != tensors[0].dtype for t in tensors): + raise RuntimeError("aten::cat.out inputs must have the same dtype") + if out.dim() != rank: + raise RuntimeError("aten::cat.out out rank mismatch") + + for d in range(rank): + if d == dim: + continue + base = tensors[0].shape[d] + if any(t.shape[d] != base for t in tensors[1:]): + raise RuntimeError( + f"aten::cat.out non-concatenated dimension mismatch at dim={d}" + ) + if out.shape[d] != base: + raise RuntimeError(f"aten::cat.out out shape mismatch at dim={d}") + + expected = 
sum(t.shape[dim] for t in tensors) + if out.shape[dim] != expected: + raise RuntimeError( + f"aten::cat.out out concatenated dimension mismatch at dim={dim}: " + f"expected {expected}, got {out.shape[dim]}" + ) + def eager_to_compile(op_name): """ Register an eager mode operation as a graph-based implementation using torch.compile(). @@ -267,6 +313,9 @@ def eager_to_compile(op_name): torch.npu.eager_to_compile("aten::mul.Tensor") """ def wrapper(*args, **kwargs): + if op_name == "aten::cat.out": + _precheck_cat_out_args(args, kwargs) + @torch.compile(dynamic=False) def dummy_graph(*args, **kwargs): # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py new file mode 100644 index 00000000..996af1de --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -0,0 +1,167 @@ +from typing import List, Optional, cast + +import sympy +from torch._inductor.ir import Buffer, IRNode +from torch._inductor.virtualized import V + +from PyTorchSimFrontend.mlir import mlir_common +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel + + +TEMPLATE = r""" +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X0, X1], outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} { + {{ kernel.def_sram_buffer("X0", X0_TILE_DESC, id=0, indent_size=2) }} + {{ kernel.def_sram_buffer("X1", X1_TILE_DESC, id=1, indent_size=2) }} + {{ kernel.def_sram_buffer(OUT_DVAR, Y_TILE_DESC, id=2, indent_size=2) }} + {{ kernel.def_local_vars(indent_size=2) }} + + affine.for %cat_block = 0 to 1 step 1 { +{% if DIM == 0 %} + affine.for %index0 = 0 to {{ X0_ROWS }} step 1 { + affine.for %index1 = 0 to {{ COLS }} step 1 { + {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }} + } + } + + affine.for %index2 = 0 to {{ 
X1_ROWS }} step 1 { + affine.for %index3 = 0 to {{ COLS }} step 1 { + {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }} + } + } +{% else %} + affine.for %index0 = 0 to {{ ROWS }} step 1 { + affine.for %index1 = 0 to {{ X0_COLS }} step 1 { + {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }} + } + affine.for %index3 = 0 to {{ X1_COLS }} step 1 { + {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }} + } + } +{% endif %} + } { outer_loop=true } + return +} +""" + + +class MLIRCatTemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, dim, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.dim = dim + + def render( + self, + kernel: MLIRTemplateKernel, + template_buffer_node=None, + epilogue_nodes: Optional[List[IRNode]] = None, + tile_info=None, + **kwargs, + ): + is_out_variant = template_buffer_node is not None + if is_out_variant: + self.output_node = template_buffer_node + # cat template currently emits a single output buffer and does not + # support epilogue output remapping. + + def _unwrap_node(n): + return n.node if hasattr(n, "node") else n + + x0 = _unwrap_node(self.input_nodes[0]) + x1 = _unwrap_node(self.input_nodes[1]) + y = _unwrap_node(self.output_node) + + def _as_int(v): + try: + return int(v) + except Exception: + return int(V.graph.sizevars.size_hint(v)) + + x0_rows = _as_int(x0.get_size()[0]) + x1_rows = _as_int(x1.get_size()[0]) + x0_cols = _as_int(x0.get_size()[1]) + x1_cols = _as_int(x1.get_size()[1]) + y_cols = _as_int(y.get_size()[1]) + kernel.loop_size = None + + # 2D cat template with contiguous layout. 
+ x0_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + x0_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + x0_tile_desc.set_name("x0_cat_tile") + x1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + x1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + x1_tile_desc.set_name("x1_cat_tile") + y_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + y_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + y_tile_desc.set_name("y_cat_tile") + + if self.dim == 0: + # Flattened offsets for dim=0 cat. + x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")] + x1_idx = [sympy.Symbol("index2") * x1_cols, sympy.Symbol("index3")] + y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")] + y1_idx = [(sympy.Symbol("index2") + x0_rows) * y_cols, sympy.Symbol("index3")] + else: + # Flattened offsets for dim=1 cat. + x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")] + x1_idx = [sympy.Symbol("index0") * x1_cols, sympy.Symbol("index3")] + y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")] + y1_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index3") + x0_cols] + + kernel.render_options = dict( + KERNEL_NAME=self.name, + kernel=kernel, + X0=x0, + X1=x1, + Y=y, + OUT_DVAR="out_ptr1" if is_out_variant else "Y", + NAMES_STR="X0, X1, out_ptr1" if is_out_variant else "X0, X1, Y", + DIM=self.dim, + X0_ROWS=x0_rows, + X1_ROWS=x1_rows, + ROWS=x0_rows, + X0_COLS=x0_cols, + X1_COLS=x1_cols, + COLS=x0_cols, + X0_TILE_DESC=x0_tile_desc, + X1_TILE_DESC=x1_tile_desc, + Y_TILE_DESC=y_tile_desc, + X0_IDX=x0_idx, + X1_IDX=x1_idx, + Y0_IDX=y0_idx, + Y1_IDX=y1_idx, + input_reorder=self.input_reorder, + ) + # Needed when epilogue fusion requests set_ranges(). 
+ kernel.dim_aliasing = {"index0": "index0", "index1": "index1"} + + if hasattr(self.output_node, "node") and hasattr(self.output_node.node, "get_name"): + output_node_name = self.output_node.node.get_name() + elif hasattr(self.output_node, "get_name"): + output_node_name = self.output_node.get_name() + else: + output_node_name = self.output_node.name + + if hasattr(y, "get_numel"): + y_numel = y.get_numel() + elif hasattr(y, "node") and hasattr(y.node, "get_numel"): + y_numel = y.node.get_numel() + else: + y_numel = None + + kernel.epilogue_info = dict( + output_node=output_node_name, + sram_var="y_cat_tile", + dram_var=kernel.render_options["OUT_DVAR"], + dram_tile_desc=y_tile_desc, + ) + if y_numel is not None: + kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": y_numel} + + code = self._template_from_string(TEMPLATE).render(**kernel.render_options) + return code diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 34b185b8..256d7101 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -173,7 +173,11 @@ def get_mlir_shape(info): def mlir_argdefs(self, extra_node=dict()): buffer_types = {} for x in V.graph.buffers: - if not isinstance(x.layout, MultiOutputLayout): # FIXME: MultiOutputLayout should be handled + if isinstance(x.layout, MultiOutputLayout): + # MultiOutput kernel containers own concrete output nodes in `outputs`. 
+ for out in getattr(x, "outputs", []): + buffer_types[out.get_name()] = [out.get_dtype(), out.get_numel(), out.get_size(), out.get_stride()] + else: buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel(), x.get_size(), x.get_stride()] for name, val in V.graph.graph_inputs.items(): if isinstance(val, sympy.Expr): diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index ebf0c80e..0f28f03b 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,10 +15,15 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate +from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate +from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") +_orig_cat_default_lowering = lowerings.get(aten.cat.default) +_orig_cat_out_lowering = lowerings.get(aten.cat.out) +_orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable) def tuned_mm(mat1, mat2, * ,layout=None): m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) @@ -181,11 +186,285 @@ def custom_unsafe_index(x, indices): x.realize() return index_impl(x, indices, check=False) + +def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout: + with V.graph.fake_mode: + output = torch.ops.aten.cat( + [ir.ir_node_to_tensor(t, guard_shape=True) for t in tensors], + dim, + ) + sizes = ir.convert_shape_to_inductor(output.size()) + stride = ir.convert_shape_to_inductor(output.stride()) + return ir.FixedLayout( + tensors[0].get_device(), + tensors[0].get_dtype(), + sizes, + stride, + ) + + +def _can_use_cat_template(tensors: 
Sequence[TensorBox], dim: int) -> bool: + # Current template specialization: 2 inputs, rank-2, dim in {0, 1}. + if len(tensors) != 2: + return False + if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors): + return False + if tensors[0].get_dtype() != tensors[1].get_dtype(): + return False + rank0 = len(tensors[0].get_size()) + rank1 = len(tensors[1].get_size()) + if rank0 != 2 or rank1 != 2: + return False + if dim < 0: + dim += rank0 + if dim not in (0, 1): + return False + + if dim == 0: + cols0 = tensors[0].get_size()[1] + cols1 = tensors[1].get_size()[1] + return V.graph.sizevars.statically_known_equals(cols0, cols1) + + rows0 = tensors[0].get_size()[0] + rows1 = tensors[1].get_size()[0] + return V.graph.sizevars.statically_known_equals(rows0, rows1) + + +def _cat_fallback(reason: str, tensors: Sequence[TensorBox], dim: int): + # Non-template cases delegate to the original lowering path. + return _orig_cat_default_lowering(tensors, dim) + + +def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0): + if _orig_cat_default_lowering is None: + raise RuntimeError("Original aten.cat.default lowering is missing") + if len(tensors) > 0: + rank = len(tensors[0].get_size()) + if dim < 0: + dim += rank + if not _can_use_cat_template(tensors, dim): + return _cat_fallback("default-path", tensors, dim) + + for t in tensors: + t.realize() + layout = _cat_layout(tensors, dim) + mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim) + return mlir_template.generate().output_node() + + +def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): + return _custom_cat_impl(tensors, dim) + + +def custom_cat_out(tensors: Sequence[TensorBox], dim: int = 0, out: Optional[TensorBox] = None): + if _orig_cat_out_lowering is None: + raise RuntimeError("Original aten.cat.out lowering is missing") + if out is None: + return _orig_cat_out_lowering(tensors, dim, out) + + copy_default_lowering = 
lowerings.get(aten.copy_.default) + slice_tensor_lowering = lowerings.get(aten.slice.Tensor) + if copy_default_lowering is None or slice_tensor_lowering is None: + raise RuntimeError("cat.out lowering requires aten.copy_.default and aten.slice.Tensor lowerings") + + # Lower cat.out as a sequence of slice+copy ops so each piece still runs + # through the existing compiled/simulated kernel path. + if len(tensors) == 0: + raise RuntimeError("cat.out requires at least one input tensor") + if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors): + raise RuntimeError("cat.out inputs must be tensor-like values") + rank = len(tensors[0].get_size()) + if rank == 0: + raise RuntimeError("cat.out does not support scalar inputs") + if dim < 0: + dim = dim + rank + if dim < 0 or dim >= rank: + raise RuntimeError(f"cat.out dim out of range: dim={dim}, rank={rank}") + if any(len(t.get_size()) != rank for t in tensors): + raise RuntimeError("cat.out inputs must have the same rank") + if any(t.get_dtype() != tensors[0].get_dtype() for t in tensors): + raise RuntimeError("cat.out inputs must have the same dtype") + # cat semantics: all non-cat dimensions must be equal. + for i in range(rank): + if i == dim: + continue + base = tensors[0].get_size()[i] + if any(not V.graph.sizevars.statically_known_equals(base, t.get_size()[i]) for t in tensors[1:]): + raise RuntimeError(f"cat.out non-concatenated dimension mismatch at dim={i}") + + # Output shape must match concatenated shape. 
+ if not hasattr(out, "get_size"): + raise RuntimeError("cat.out output must be tensor-like") + out_sizes = list(out.get_size()) + if len(out_sizes) != rank: + raise RuntimeError("cat.out output rank mismatch") + for i in range(rank): + if i == dim: + continue + if not V.graph.sizevars.statically_known_equals(out_sizes[i], tensors[0].get_size()[i]): + raise RuntimeError(f"cat.out output shape mismatch at dim={i}") + expected_cat = sum(t.get_size()[dim] for t in tensors) + if not V.graph.sizevars.statically_known_equals(out_sizes[dim], expected_cat): + raise RuntimeError(f"cat.out output concatenated dimension mismatch at dim={dim}") + + if isinstance(out, TensorBox): + out.realize() + + offset = 0 + for src in tensors: + src.realize() + end = offset + src.get_size()[dim] + dst_view = slice_tensor_lowering(out, dim, offset, end, 1) + copy_default_lowering(dst_view, src) + offset = end + return out + + +def _custom_sort_values_impl( + self: TensorBox, + dim: int = -1, + descending: bool = False, + values: Optional[TensorBox] = None, + indices: Optional[TensorBox] = None, + stable: Optional[bool] = None, +): + if values is None or indices is None: + raise RuntimeError("sort.values* lowering requires both out tensors: values, indices") + + def _normalize_dim(rank: int, d: int) -> int: + return d + rank if d < 0 else d + + if not hasattr(self, "get_size"): + raise RuntimeError("sort.values* lowering requires TensorBox input") + + rank = len(self.get_size()) + norm_dim = _normalize_dim(rank, dim) + if norm_dim < 0 or norm_dim >= rank: + raise RuntimeError(f"sort.values* dim out of range: dim={dim}, rank={rank}") + if rank != 2: + raise RuntimeError(f"sort.values* lowering currently supports rank-2 only, got rank={rank}") + if norm_dim not in (0, 1): + raise RuntimeError(f"sort.values* lowering currently supports dim in {{0,1}} only, got dim={norm_dim}") + + self.realize() + if isinstance(values, TensorBox): + values.realize() + if isinstance(indices, TensorBox): + 
indices.realize() + + value_layout, _ = _sort_layouts(self, norm_dim, descending) + mlir_template = MLIRSortTemplate( + [self], + value_layout, + dim=norm_dim, + descending=descending, + stable=True if stable is None else stable, + indices_node=indices, + ) + sorted_values = mlir_template.generate(template_buffer_node=values, epilogue_nodes=[indices]).output_node() + return sorted_values, indices + + +def _sort_layouts(x: TensorBox, dim: int, descending: bool): + with V.graph.fake_mode: + v, i = torch.ops.aten.sort( + ir.ir_node_to_tensor(x, guard_shape=True), + dim, + descending, + ) + v_sizes = ir.convert_shape_to_inductor(v.size()) + v_stride = ir.convert_shape_to_inductor(v.stride()) + i_sizes = ir.convert_shape_to_inductor(i.size()) + i_stride = ir.convert_shape_to_inductor(i.stride()) + + value_layout = ir.FixedLayout(x.get_device(), x.get_dtype(), v_sizes, v_stride) + index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride) + return value_layout, index_layout + + +def custom_sort_stable( + self: TensorBox, + *, + stable: Optional[bool] = None, + dim: int = -1, + descending: bool = False, +): + empty_strided_lowering = lowerings.get(aten.empty_strided.default) + if empty_strided_lowering is None: + if _orig_sort_values_stable_lowering is None: + raise RuntimeError("sort.stable lowering requires aten.empty_strided.default") + return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True) + + rank = len(self.get_size()) if hasattr(self, "get_size") else 0 + norm_dim = dim + rank if dim < 0 else dim + if rank > 0 and (norm_dim < 0 or norm_dim >= rank): + raise RuntimeError(f"sort.stable dim out of range: dim={dim}, rank={rank}") + + # Template specialization supports rank-2 and dim in {0,1}. 
+ if rank == 2 and norm_dim not in (0, 1): + if _orig_sort_values_stable_lowering is None: + raise RuntimeError("Original aten.sort.values_stable lowering is missing") + return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True) + + try: + value_layout, index_layout = _sort_layouts(self, norm_dim, descending) + values = empty_strided_lowering( + list(value_layout.size), + list(value_layout.stride), + dtype=value_layout.dtype, + device=self.get_device(), + ) + indices = empty_strided_lowering( + list(index_layout.size), + list(index_layout.stride), + dtype=index_layout.dtype, + device=self.get_device(), + ) + return _custom_sort_values_impl( + self=self, + dim=dim, + descending=descending, + values=values, + indices=indices, + stable=True if stable is None else stable, + ) + except Exception: + if _orig_sort_values_stable_lowering is None: + raise + return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=stable) + + +def custom_sort_values_stable( + self: TensorBox, + *, + stable: Optional[bool] = None, + dim: int = -1, + descending: bool = False, + values: Optional[TensorBox] = None, + indices: Optional[TensorBox] = None, +): + return _custom_sort_values_impl( + self=self, + dim=dim, + descending=descending, + values=values, + indices=indices, + stable=stable, + ) + + lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()}) lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()}) lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()}) lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in 
aten._unsafe_index.overloads()}) + +lowerings.update({aten.cat.default: custom_cat_default}) +lowerings.update({aten.cat.out: custom_cat_out}) + +lowerings.update({aten.sort.stable: custom_sort_stable}) +lowerings.update({aten.sort.values_stable: custom_sort_values_stable}) + if extension_config.CONFIG_USE_TIMING_POOLING: - lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template \ No newline at end of file + lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template diff --git a/PyTorchSimFrontend/mlir/mlir_sort_template.py b/PyTorchSimFrontend/mlir/mlir_sort_template.py new file mode 100644 index 00000000..d12c7570 --- /dev/null +++ b/PyTorchSimFrontend/mlir/mlir_sort_template.py @@ -0,0 +1,253 @@ +from typing import List, Optional + +import sympy +from torch._inductor.ir import IRNode +from torch._inductor.virtualized import V + +from PyTorchSimFrontend.mlir import mlir_common +from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel + + +TEMPLATE = r""" +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, YI], outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} { + {{ kernel.def_sram_buffer("YI", YI_TILE_DESC, id=1, indent_size=2) }} + {{ kernel.def_sram_buffer(OUT_DVAR, YV_TILE_DESC, id=2, indent_size=2) }} + {{ kernel.def_local_vars(indent_size=2) }} + + %c0 = arith.constant 0 : index + %c_cols = arith.constant {{ COLS }} : index + + affine.for %sort_block = 0 to 1 step 1 { + // Initialize output value/index buffers. 
+ affine.for %row = 0 to {{ ROWS }} step 1 { + affine.for %col = 0 to {{ COLS }} step 1 { + {{ kernel.def_dma_op("MVIN", "X", INIT_X_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, INIT_YV_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} +{% if DIM == 1 %} + %idx_i64 = arith.index_cast %col : index to {{ YI_ELEM_TYPE }} +{% else %} + %idx_i64 = arith.index_cast %row : index to {{ YI_ELEM_TYPE }} +{% endif %} + memref.store %idx_i64, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", "YI", INIT_YI_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} + } + } + +{% if DIM == 1 %} + // Stable bubble sort on each row (dim=1). + affine.for %row = 0 to {{ ROWS }} step 1 { + affine.for %pass = 0 to {{ COLS }} step 1 { + affine.for %j = 0 to {{ COLS_MINUS1 }} step 1 { + {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} + %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + + {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} + %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + +{% if DESCENDING %} + %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }} +{% else %} + %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }} +{% endif %} + scf.if %need_swap { + memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + {{ kernel.def_dma_op("MVIN", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, 
indent_size=12) }} + %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + + {{ kernel.def_dma_op("MVIN", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + + memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + } + } + } + } +{% else %} + // Stable bubble sort on each column (dim=0). + affine.for %col = 0 to {{ COLS }} step 1 { + affine.for %pass = 0 to {{ ROWS }} step 1 { + affine.for %i = 0 to {{ ROWS_MINUS1 }} step 1 { + {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} + %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + + {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} + %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + +{% if DESCENDING %} + %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }} +{% else %} + %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }} +{% endif %} + scf.if %need_swap { + memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + {{ kernel.def_dma_op("MVIN", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + %li = 
memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + + {{ kernel.def_dma_op("MVIN", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + + memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + + memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} + {{ kernel.def_dma_op("MVOUT", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} + } + } + } + } +{% endif %} + } { outer_loop=true } + return +} +""" + + +class MLIRSortTemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, dim, descending=False, stable=False, indices_node=None, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.dim = dim + self.descending = descending + self.stable = stable + self.indices_node = indices_node + + def render( + self, + kernel: MLIRTemplateKernel, + template_buffer_node=None, + epilogue_nodes: Optional[List[IRNode]] = None, + tile_info=None, + **kwargs, + ): + if template_buffer_node is not None: + self.output_node = template_buffer_node + if self.indices_node is None: + raise RuntimeError("MLIRSortTemplate requires indices output node") + + x = self.input_nodes[0] + yv = self.output_node + yi = self.indices_node + + def _as_int(v): + try: + return int(v) + except Exception: + return int(V.graph.sizevars.size_hint(v)) + + x_size = x.get_size() + if len(x_size) != 2: + raise RuntimeError("MLIRSortTemplate currently supports rank-2 input only") + if self.dim not in (0, 1): + raise RuntimeError(f"MLIRSortTemplate currently supports dim in {{0,1}} only, got dim={self.dim}") + + rows = _as_int(x_size[0]) + cols = _as_int(x_size[1]) + cols_minus1 = max(0, cols - 1) + rows_minus1 = max(0, rows - 1) + + x_dtype = x.get_dtype() + yv_dtype 
= yv.get_dtype() + yi_dtype = yi.get_dtype() + if x_dtype != yv_dtype: + raise RuntimeError("sort template requires input/value dtype match") + + yi_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + yi_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + yi_tile_desc.set_name("yi_sort_tile") + yv_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + yv_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + yv_tile_desc.set_name("yv_sort_tile") + # Neighbor element descriptors use DRAM offset to preserve affine stride metadata. + yv_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + yv_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + yv_s1_tile_desc.set_name("yv_sort_tile") + yi_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) + yi_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) + yi_s1_tile_desc.set_name("yi_sort_tile") + if int(self.dim) == 1: + yv_s1_tile_desc.offset = sympy.Integer(1) + yi_s1_tile_desc.offset = sympy.Integer(1) + else: + yv_s1_tile_desc.offset = sympy.Integer(cols) + yi_s1_tile_desc.offset = sympy.Integer(cols) + + row = sympy.Symbol("row") + col = sympy.Symbol("col") + i = sympy.Symbol("i") + j = sympy.Symbol("j") + + init_x_idx = [row * cols, col] + init_yv_idx = [row * cols, col] + init_yi_idx = [row * cols, col] + + d1_s0_idx = [row * cols, j] + d1_s1_idx = [row * cols, j] + + d0_s0_idx = [i * cols, col] + d0_s1_idx = [i * cols, col] + + kernel.loop_size = None + numel = rows * cols + kernel.render_options = dict( + KERNEL_NAME=self.name, + kernel=kernel, + X=x, + YV=yv, + YI=yi, + OUT_DVAR="YV", + NAMES_STR="X, YI, YV", + ROWS=rows, + COLS=cols, + COLS_MINUS1=cols_minus1, + ROWS_MINUS1=rows_minus1, + DIM=int(self.dim), + DESCENDING=bool(self.descending), + YI_TILE_DESC=yi_tile_desc, + YV_TILE_DESC=yv_tile_desc, + 
YI_S1_TILE_DESC=yi_s1_tile_desc, + YV_S1_TILE_DESC=yv_s1_tile_desc, + INIT_X_IDX=init_x_idx, + INIT_YV_IDX=init_yv_idx, + INIT_YI_IDX=init_yi_idx, + D1_S0_IDX=d1_s0_idx, + D1_S1_IDX=d1_s1_idx, + D0_S0_IDX=d0_s0_idx, + D0_S1_IDX=d0_s1_idx, + YV_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yv_dtype], + YI_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yi_dtype], + X_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[x_dtype]}>", + YV_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yv_dtype]}>", + YI_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yi_dtype]}>", + YV_TILE_MEMREF_TYPE=yv_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yv_dtype]), + YI_TILE_MEMREF_TYPE=yi_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yi_dtype]), + X_TILE_DESC=yv_tile_desc, + input_reorder=self.input_reorder, + ) + + output_node_name = yv.get_name() if hasattr(yv, "get_name") else yv.name + kernel.epilogue_info = dict( + output_node=output_node_name, + sram_var="yv_sort_tile", + dram_var=kernel.render_options["OUT_DVAR"], + dram_tile_desc=yv_tile_desc, + ) + kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": yv.get_numel()} + kernel.exception_nodes["YI"] = {"numel": yi.get_numel()} + + code = self._template_from_string(TEMPLATE).render(**kernel.render_options) + return code diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b1c756ba..76b0ef71 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -403,7 +403,7 @@ def call_kernel(self, kernel_name): _, call_args, _, _ = self.kernel_group.args.mlir_argdefs() # generate the code to call this wrapper.generate_kernel_call( - kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args) + kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args) def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): with 
self as kernel: @@ -628,8 +628,26 @@ def def_kernel( self.buffer_names[node.get_name()] = self.epilogue_info['sram_var'] def hook(): - arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node) - return f"({', '.join(arg_defs)})" + arg_defs, call_args, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node) + output_names = names[len(inputs) : len(inputs) + len(outputs)] + out_ptr_idx = 0 + renamed_arg_defs = [] + for outer, arg_def in zip(call_args, arg_defs): + raw_symbol = arg_def.split(":", 1)[0].strip().lstrip("%") + if outer in self.kernel_group.args.input_buffers: + symbol = self.kernel_group.args.input_buffers[outer] + elif outer in self.kernel_group.args.output_buffers: + symbol = self.kernel_group.args.output_buffers[outer] + elif raw_symbol.startswith("out_ptr") and out_ptr_idx < len(output_names): + symbol = output_names[out_ptr_idx] + out_ptr_idx += 1 + elif outer in self.kernel_group.args.sizevars: + symbol = self.kernel_group.args.sizevars[outer] + else: + symbol = raw_symbol + _, arg_type = arg_def.split(":", 1) + renamed_arg_defs.append(f"%{symbol}:{arg_type}") + return f"({', '.join(renamed_arg_defs)})" assert "" not in self.render_hooks self.render_hooks[""] = hook @@ -1151,6 +1169,8 @@ def __init__(self, name, input_nodes, layout, input_reorder = None): super().__init__(name) self.input_nodes = [node for node in input_nodes if node is not None] self.output_node: Buffer = Buffer(name="buf_out", layout=layout) + # Multi-output templates can override this with explicit output buffers. 
+ self.output_nodes = [self.output_node] self.input_reorder = input_reorder self.layout = layout @@ -1166,10 +1186,12 @@ def generate(self, **kwargs) -> ChoiceCaller: kernel_hash_name = f"mlir_{self.name}_{next(self.index_counter)}" extra_args = [] # create the BenchmarkRequest + output_nodes = getattr(self, "output_nodes", None) or [self.output_node] + bmreq = MLIRBenchmarkRequest( kernel_name=kernel_name, input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(self.output_node), + output_tensor_meta=TensorMeta.from_irnodes(output_nodes), extra_args=extra_args, source_code=code, ) diff --git a/tests/DeepSeek/test_deepseek_v3_base.py b/tests/DeepSeek/test_deepseek_v3_base.py index b8402c8b..ade787c5 100644 --- a/tests/DeepSeek/test_deepseek_v3_base.py +++ b/tests/DeepSeek/test_deepseek_v3_base.py @@ -1,8 +1,55 @@ import os import sys import argparse +import copy +from pathlib import Path import torch +# recursive compile for some ops that are caused by graph break +torch.npu.register_eager_to_compile([ + "aten::zero_", + "aten::sum.IntList_out", + "aten::mul.out", + "aten::floor_divide", + "aten::floor_divide.Tensor", + "aten::floor_divide.Scalar", + "aten::cat.out", + "aten::sort.values_stable", +]) + + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + out_cpu = out.cpu() + max_diff = (out_cpu - cpu_out).abs().max().item() + mean_diff = (out_cpu - cpu_out).abs().mean().item() + if torch.allclose(out_cpu, cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print(f"Max absolute difference: {max_diff:.6f}") + print(f"Mean absolute difference: {mean_diff:.6f}") + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("NPU out: ", out_cpu) + print("CPU out: ", cpu_out) + print(f"Max absolute difference: {max_diff:.6f}") + print(f"Mean absolute 
difference: {mean_diff:.6f}") + exit(1) + + +def _extract_logits(output): + if isinstance(output, torch.Tensor): + return output + if hasattr(output, "logits"): + return output.logits + if isinstance(output, (list, tuple)) and len(output) > 0 and isinstance(output[0], torch.Tensor): + return output[0] + raise TypeError(f"Unsupported output type for comparison: {type(output)}") + def _dtype_from_str(name: str) -> torch.dtype: return { @@ -81,7 +128,7 @@ def _maybe_scale_config(config, scale=1.0, max_layers=None): def _apply_preset(scale, max_layers, batch, seq_len, preset): if preset == "tiny": - return 0.03, 4, 1, min(seq_len, 16) + return 0.03, 1, 1, min(seq_len, 16) if preset == "small": return 0.07, 8, 1, min(seq_len, 32) if preset == "medium": @@ -89,8 +136,58 @@ def _apply_preset(scale, max_layers, batch, seq_len, preset): return scale, max_layers, batch, seq_len +def _togsim_log_count() -> int: + log_dir = Path("togsim_results") + if not log_dir.exists(): + return 0 + return len(list(log_dir.glob("*.log"))) + + +def _assert_simulation_happened(before_count: int, case_name: str): + after_count = _togsim_log_count() + if after_count <= before_count: + raise RuntimeError( + f"{case_name}: TOGSim log count did not increase " + f"(before={before_count}, after={after_count})" + ) + print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})") + + +def test_cat_default(device): + def cat_default_fn(a, b): + return torch.cat([a, b], dim=0) + + x = torch.randn(8, 16, device=device) + y = torch.randn(6, 16, device=device) + opt_fn = torch.compile(dynamic=False)(cat_default_fn) + + before = _togsim_log_count() + out = opt_fn(x, y) + _assert_simulation_happened(before, "cat.default") + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) + test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_out(device): + def cat_out_fn(a, b, out): + return torch.ops.aten.cat.out([a, b], 0, out=out) + + x = torch.randn(8, 16, device=device) + y = 
torch.randn(6, 16, device=device) + out_buf = torch.empty(14, 16, device=device) + opt_fn = torch.compile(dynamic=False)(cat_out_fn) + + before = _togsim_log_count() + out = opt_fn(x, y, out_buf) + _assert_simulation_happened(before, "cat.out") + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) + test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4) + + @torch.no_grad() -def run_deep_seek_v3_base_test( +def run_deepseek_v3_base( model_id, device, init_mode="config-random", @@ -120,7 +217,6 @@ def run_deep_seek_v3_base_test( # (call .to_dict()), so only disable it for pretrained loading path. if init_mode == "pretrained" and getattr(config, "quantization_config", None) is not None: config.quantization_config = None - config = _maybe_scale_config(config, scale=scale, max_layers=max_layers) if init_mode == "config-random": @@ -141,7 +237,6 @@ def run_deep_seek_v3_base_test( else: raise ValueError(f"Unsupported init mode: {init_mode}") - model = model.to(device) model_params = sum(p.numel() for p in model.parameters()) print("init mode:", init_mode) print("scaled hidden_size:", getattr(config, "hidden_size", "n/a")) @@ -157,23 +252,33 @@ def run_deep_seek_v3_base_test( revision=revision, ) encoded = tokenizer(prompt, return_tensors="pt") - input_ids = encoded["input_ids"].to(device) + cpu_input_ids = encoded["input_ids"].cpu() else: vocab_size = getattr(config, "vocab_size", None) if vocab_size is None: raise ValueError("Config has no vocab_size; use --use-tokenizer or pass a model with vocab_size.") - input_ids = _build_random_inputs(batch, seq_len, vocab_size, device) + cpu_input_ids = _build_random_inputs(batch, seq_len, vocab_size, torch.device("cpu")) + input_ids = cpu_input_ids.to(device) - if compile_model: - model = torch.compile(model, dynamic=False) + # CPU version + model_cpu = copy.deepcopy(model).cpu().eval() + cpu_out = _extract_logits(model_cpu(cpu_input_ids)) - out = model(input_ids) - logits = out.logits + # NPU version + model_npu = 
copy.deepcopy(model_cpu).to(device).eval() + if compile_model: + model_npu = torch.compile(model_npu, dynamic=False) + npu_out = _extract_logits(model_npu(input_ids)) + + # Campare results + test_result( + "DeepSeek V3 Base", + npu_out, + cpu_out, + rtol=3e-1, + atol=2e-1, + ) - print("logits shape:", tuple(logits.shape)) - print("logits dtype:", logits.dtype) - print("logits max:", logits.max().item()) - if __name__ == "__main__": parser = argparse.ArgumentParser(description="DeepSeek V3 download-based test") @@ -181,7 +286,7 @@ def run_deep_seek_v3_base_test( parser.add_argument("--revision", type=str, default=None) parser.add_argument("--trust-remote-code", action="store_true", default=True) parser.add_argument("--init-mode", type=str, default="config-random", choices=["config-random", "pretrained"]) - parser.add_argument("--preset", type=str, default="tiny", choices=["none", "tiny", "small", "medium"]) + parser.add_argument("--preset", type=str, default="small", choices=["none", "tiny", "small", "medium"]) parser.add_argument("--scale", type=float, default=1.0) parser.add_argument("--max-layers", type=int, default=None) parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"]) @@ -190,6 +295,7 @@ def run_deep_seek_v3_base_test( parser.add_argument("--use-tokenizer", action="store_true") parser.add_argument("--prompt", type=str, default="Hello, DeepSeek V3") parser.add_argument("--compile", action="store_true", default=True) + parser.add_argument("--test", type=str, default="e2e", choices=["all", "e2e", "cat"]) args = parser.parse_args() @@ -203,18 +309,22 @@ def run_deep_seek_v3_base_test( device = torch.device("npu:0") - run_deep_seek_v3_base_test( - model_id=args.model_id, - device=device, - init_mode=args.init_mode, - scale=args.scale, - max_layers=args.max_layers, - dtype=args.dtype, - batch=args.batch, - seq_len=args.seq_len, - use_tokenizer=args.use_tokenizer, - prompt=args.prompt, - 
trust_remote_code=args.trust_remote_code, - revision=args.revision, - compile_model=args.compile, - ) + if args.test in ("all", "cat"): + test_cat_default(device) + test_cat_out(device) + if args.test in ("all", "e2e"): + run_deepseek_v3_base( + model_id=args.model_id, + device=device, + init_mode=args.init_mode, + scale=args.scale, + max_layers=args.max_layers, + dtype=args.dtype, + batch=args.batch, + seq_len=args.seq_len, + use_tokenizer=args.use_tokenizer, + prompt=args.prompt, + trust_remote_code=args.trust_remote_code, + revision=args.revision, + compile_model=args.compile, + ) diff --git a/tests/test_cat.py b/tests/test_cat.py new file mode 100644 index 00000000..32573a05 --- /dev/null +++ b/tests/test_cat.py @@ -0,0 +1,89 @@ +import argparse +from pathlib import Path + +import torch + + +def _test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + return + + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + raise RuntimeError(f"{name} mismatch") + + +def _togsim_log_count() -> int: + log_dir = Path("togsim_results") + if not log_dir.exists(): + return 0 + return len(list(log_dir.glob("*.log"))) + + +def _assert_simulation_happened(before_count: int, case_name: str): + after_count = _togsim_log_count() + if after_count <= before_count: + raise RuntimeError( + f"{case_name}: TOGSim log count did not increase " + f"(before={before_count}, after={after_count})" + ) + print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})") + + +def test_cat_default(device): + def cat_default_fn(a, b): + return torch.cat([a, b], dim=0) + + x = torch.randn(8, 16, device=device) + y = torch.randn(6, 16, device=device) + opt_fn = 
torch.compile(dynamic=False)(cat_default_fn) + + before = _togsim_log_count() + out = opt_fn(x, y) + _assert_simulation_happened(before, "cat.default") + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) + _test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_out(device): + def cat_out_fn(a, b, out): + return torch.ops.aten.cat.out([a, b], 0, out=out) + + x = torch.randn(8, 16, device=device) + y = torch.randn(6, 16, device=device) + out_buf = torch.empty(14, 16, device=device) + opt_fn = torch.compile(dynamic=False)(cat_out_fn) + + before = _togsim_log_count() + out = opt_fn(x, y, out_buf) + _assert_simulation_happened(before, "cat.out") + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) + _test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run cat simulation tests") + parser.add_argument( + "--case", + choices=["default", "out", "all"], + default="all", + help="Which cat case to run", + ) + args = parser.parse_args() + + device = torch.device("npu:0") + + if args.case in ("default", "all"): + test_cat_default(device) + if args.case in ("out", "all"): + test_cat_out(device) diff --git a/tests/test_sort.py b/tests/test_sort.py new file mode 100644 index 00000000..2b070223 --- /dev/null +++ b/tests/test_sort.py @@ -0,0 +1,112 @@ +import argparse +import torch +import torch._dynamo +import torch.utils.cpp_extension + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out:", out.cpu()) + print("cpu out:", cpu_out) + raise SystemExit(1) + + +def test_equal(name, out, cpu_out): + if torch.equal(out.cpu(), cpu_out): + message = f"|{name} Test 
Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out:", out.cpu()) + print("cpu out:", cpu_out) + raise SystemExit(1) + + +def _normalize_dim(dim: int, rank: int) -> int: + d = dim if dim >= 0 else rank + dim + if d < 0 or d >= rank: + raise ValueError(f"dim out of range: dim={dim}, rank={rank}") + return d + + +def test_sort_stable(device, size=(128, 128), dim=-1, descending=False): + _normalize_dim(dim, len(size)) + + def sort_stable_fn(x): + return torch.sort(x, stable=True, dim=dim, descending=descending) + + x = torch.randn(size, dtype=torch.float32) + x_npu = x.to(device=device) + + opt_sort = torch.compile(dynamic=False)(sort_stable_fn) + out_values, out_indices = opt_sort(x_npu) + + ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending) + + test_result("Sort.stable/values", out_values, ref_values) + test_equal("Sort.stable/indices", out_indices, ref_indices) + + +def test_sort_values_stable(device, size=(128, 128), dim=-1, descending=False): + _normalize_dim(dim, len(size)) + + def sort_out_fn(x): + out_values = torch.empty_like(x, device=x.device) + out_indices = torch.empty_like(x, dtype=torch.int64, device=x.device) + return torch.sort(x, stable=True, dim=dim, descending=descending, out=(out_values, out_indices)) + + x = torch.randn(size, dtype=torch.float32) + x_npu = x.to(device=device) + + opt_sort = sort_out_fn# torch.compile(dynamic=False)(sort_out_fn) + out_values, out_indices = opt_sort(x_npu) + + ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending) + + test_result("Sort.values_stable/values", out_values, ref_values) + test_equal("Sort.values_stable/indices", out_indices, ref_indices) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run sort tests") + parser.add_argument("--shape", type=str, 
default="(128,128)") + parser.add_argument("--dim", type=int, default=0) + parser.add_argument("--descending", action="store_true") + parser.add_argument( + "--mode", + type=str, + default="all", + choices=["all", "default", "values"], + ) + args = parser.parse_args() + + shape = tuple(map(int, args.shape.strip("()").split(","))) + + from Scheduler.scheduler import PyTorchSimRunner + + module = PyTorchSimRunner.setup_device() + device = module.custom_device() + + # Register recursive-compile bridge only when values_stable path is explicitly tested. + if args.mode in ("all", "values"): + torch.npu.register_eager_to_compile([ + "aten::sort.values_stable", + ]) + + if args.mode in ("all", "default"): + test_sort_stable(device, size=shape, dim=args.dim, descending=args.descending) + if args.mode in ("all", "values"): + test_sort_values_stable(device, size=shape, dim=args.dim, descending=args.descending) From f615178ae581236a1b4d1018f9b458b2c552179f Mon Sep 17 00:00:00 2001 From: jung-min Date: Wed, 4 Mar 2026 07:57:47 +0000 Subject: [PATCH 110/194] [Fix] Prevent fallback to eager mode after reaching compilation limit (7) --- tests/test_sdpa.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py index 9c921eb4..6ffd6f2e 100644 --- a/tests/test_sdpa.py +++ b/tests/test_sdpa.py @@ -14,6 +14,7 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): print("-" * len(message)) print(message) print("-" * len(message)) + pass else: print("custom out: ", out.cpu()) print("cpu out: ", cpu_out) @@ -31,35 +32,25 @@ def test_scaled_dot_product_attention(device, backends="flash"): for n_token in n_token_list: for head_dim in head_dim_list: # Inputs + clear_caches() query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) + # 
With NPU query = query.to(device=device) key = key.to(device=device) value = value.to(device=device) - # With NPU - if backends == "flash": - backends = [SDPBackend.FLASH_ATTENTION] - elif backends == "math": - backends = [SDPBackend.MATH] - elif backends == "memory_efficient": - backends = [SDPBackend.EFFICIENT_ATTENTION] - else: - backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION] - - with sdpa_kernel(backends=backends): - opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) - out = opt_fn(query, key, value) - + opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) + out = opt_fn(query, key, value) out = out.to(device) # With CPU - device = torch.device('cpu') - query = query.to(device=device) - key = key.to(device=device) - value = value.to(device=device) + cpu_device = torch.device('cpu') + query = query.to(device=cpu_device) + key = key.to(device=cpu_device) + value = value.to(device=cpu_device) cpu_out = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False) name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})" @@ -76,9 +67,7 @@ def clear_caches(): os.environ["TORCHINDUCTOR_CACHE"] = "0" FxGraphCache.clear() -if __name__ == "__main__": - clear_caches() - +if __name__ == "__main__": device = torch.device('npu:0') test_scaled_dot_product_attention(device, backends="flash") \ No newline at end of file From 8ca5d02d599d06725b90963ee44701cb50e8f444 Mon Sep 17 00:00:00 2001 From: jung-min Date: Wed, 4 Mar 2026 08:09:28 +0000 Subject: [PATCH 111/194] [FIX] Add idx_map to the first matmul for logical consistency --- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index b3d88cc6..49c6c6bb 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ 
b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -339,6 +339,7 @@ def patched_scaled_dot_product_attention( // key @ query.t and scaling. linalg.matmul + { idx_map = array } ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }}) @@ -451,7 +452,7 @@ def render(self, prologue_nodes: Optional[List[IRNode]] = None, tile_info = None, **kwargs): - + # Except for kernel, other arguments are usually None. query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) From 41288bc2d300305d91559ae49a67f11984f789c0 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 3 Mar 2026 16:40:57 +0900 Subject: [PATCH 112/194] [Template] Polish template kernel of cat operation --- .../torch_openreg/openreg/__init__.py | 49 --- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 3 + PyTorchSimFrontend/mlir/mlir_cat_template.py | 369 ++++++++++++------ PyTorchSimFrontend/mlir/mlir_conv_common.py | 3 + PyTorchSimFrontend/mlir/mlir_gemm_template.py | 3 + PyTorchSimFrontend/mlir/mlir_lowering.py | 118 +----- PyTorchSimFrontend/mlir/mlir_scheduling.py | 22 +- PyTorchSimFrontend/mlir/mlir_template.py | 43 +- tests/test_cat.py | 143 +++++-- 9 files changed, 424 insertions(+), 329 deletions(-) diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index 5603a4f7..f5aabc18 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -256,52 +256,6 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs): from .random import * # noqa: F403 from .amp import * -def _precheck_cat_out_args(args, kwargs): - tensors = args[0] if len(args) > 0 else kwargs.get("tensors") - dim = args[1] if 
len(args) > 1 else kwargs.get("dim", 0) - out = kwargs.get("out", args[2] if len(args) > 2 else None) - - if out is None: - return - if not isinstance(tensors, (list, tuple)) or len(tensors) == 0: - raise RuntimeError("aten::cat.out requires non-empty tensor list") - if not all(isinstance(t, torch.Tensor) for t in tensors): - raise RuntimeError("aten::cat.out tensors must be Tensor values") - if not isinstance(out, torch.Tensor): - raise RuntimeError("aten::cat.out out must be a Tensor") - - rank = tensors[0].dim() - if rank == 0: - raise RuntimeError("aten::cat.out does not support scalar inputs") - if dim < 0: - dim += rank - if dim < 0 or dim >= rank: - raise RuntimeError(f"aten::cat.out dim out of range: dim={dim}, rank={rank}") - if any(t.dim() != rank for t in tensors): - raise RuntimeError("aten::cat.out inputs must have the same rank") - if any(t.dtype != tensors[0].dtype for t in tensors): - raise RuntimeError("aten::cat.out inputs must have the same dtype") - if out.dim() != rank: - raise RuntimeError("aten::cat.out out rank mismatch") - - for d in range(rank): - if d == dim: - continue - base = tensors[0].shape[d] - if any(t.shape[d] != base for t in tensors[1:]): - raise RuntimeError( - f"aten::cat.out non-concatenated dimension mismatch at dim={d}" - ) - if out.shape[d] != base: - raise RuntimeError(f"aten::cat.out out shape mismatch at dim={d}") - - expected = sum(t.shape[dim] for t in tensors) - if out.shape[dim] != expected: - raise RuntimeError( - f"aten::cat.out out concatenated dimension mismatch at dim={dim}: " - f"expected {expected}, got {out.shape[dim]}" - ) - def eager_to_compile(op_name): """ Register an eager mode operation as a graph-based implementation using torch.compile(). 
@@ -313,9 +267,6 @@ def eager_to_compile(op_name): torch.npu.eager_to_compile("aten::mul.Tensor") """ def wrapper(*args, **kwargs): - if op_name == "aten::cat.out": - _precheck_cat_out_args(args, kwargs) - @torch.compile(dynamic=False) def dummy_graph(*args, **kwargs): # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 178ea987..9398f90c 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -154,6 +154,9 @@ class MLIRBMMTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None): super().__init__("kernel", input_nodes, layout, input_reorder) + self.support_epilogue_fusion = True + self.support_prologue_fusion = True + self.support_reduction_fusion = True def render(self, kernel: MLIRTemplateKernel, diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 996af1de..d68af7d4 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -1,8 +1,9 @@ -from typing import List, Optional, cast +from typing import List, Optional +import math +import itertools import sympy -from torch._inductor.ir import Buffer, IRNode -from torch._inductor.virtualized import V +from torch._inductor.ir import IRNode from PyTorchSimFrontend.mlir import mlir_common from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel @@ -10,40 +11,28 @@ TEMPLATE = r""" {{kernel.def_global_vars()}} - -func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X0, X1], outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} { - {{ kernel.def_sram_buffer("X0", X0_TILE_DESC, id=0, indent_size=2) }} - {{ kernel.def_sram_buffer("X1", X1_TILE_DESC, id=1, indent_size=2) }} - {{ kernel.def_sram_buffer(OUT_DVAR, Y_TILE_DESC, id=2, indent_size=2) }} +func.func @{{ KERNEL_NAME }} 
{{kernel.def_kernel(inputs=INPUT_NAMES, outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} { +{%- for buffer_name, tile_desc in UNIQUE_BUFFER_TILE_DESCS.items() %} + {{ kernel.def_sram_buffer(buffer_name, tile_desc, indent_size=2) }} +{%- endfor %} {{ kernel.def_local_vars(indent_size=2) }} affine.for %cat_block = 0 to 1 step 1 { -{% if DIM == 0 %} - affine.for %index0 = 0 to {{ X0_ROWS }} step 1 { - affine.for %index1 = 0 to {{ COLS }} step 1 { - {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }} - } - } - - affine.for %index2 = 0 to {{ X1_ROWS }} step 1 { - affine.for %index3 = 0 to {{ COLS }} step 1 { - {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }} - } - } -{% else %} - affine.for %index0 = 0 to {{ ROWS }} step 1 { - affine.for %index1 = 0 to {{ X0_COLS }} step 1 { - {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }} - } - affine.for %index3 = 0 to {{ X1_COLS }} step 1 { - {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }} - } - } -{% endif %} +{%- for d in range(RANK-1) %} + affine.for %index{{ OUTPUT_DIM[d] }} = 0 to {{ OUTPUT_SIZES[d] }} step {{ TILE_SIZES[d] }} { +{%- endfor %} +{%- for i in range(NUM_INPUTS) %} + // Input tensor{{ i }} + affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} { + %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }}) + {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} + {{ kernel.def_dma_op("MVOUT", 
OUT_DVAR, OUTPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} + } { inner_loop=true } +{%- endfor %} + +{%- for d in range(RANK-1) %} + } { outer_loop=true } +{%- endfor %} } { outer_loop=true } return } @@ -51,8 +40,8 @@ class MLIRCatTemplate(MLIRTemplate): - def __init__(self, input_nodes, layout, dim, input_reorder=None): - super().__init__("kernel", input_nodes, layout, input_reorder) + def __init__(self, input_nodes, layout, dim): + super().__init__("kernel", input_nodes, layout) self.dim = dim def render( @@ -66,87 +55,248 @@ def render( is_out_variant = template_buffer_node is not None if is_out_variant: self.output_node = template_buffer_node - # cat template currently emits a single output buffer and does not - # support epilogue output remapping. - - def _unwrap_node(n): - return n.node if hasattr(n, "node") else n - - x0 = _unwrap_node(self.input_nodes[0]) - x1 = _unwrap_node(self.input_nodes[1]) - y = _unwrap_node(self.output_node) - - def _as_int(v): - try: - return int(v) - except Exception: - return int(V.graph.sizevars.size_hint(v)) - - x0_rows = _as_int(x0.get_size()[0]) - x1_rows = _as_int(x1.get_size()[0]) - x0_cols = _as_int(x0.get_size()[1]) - x1_cols = _as_int(x1.get_size()[1]) - y_cols = _as_int(y.get_size()[1]) - kernel.loop_size = None - - # 2D cat template with contiguous layout. - x0_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - x0_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - x0_tile_desc.set_name("x0_cat_tile") - x1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - x1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - x1_tile_desc.set_name("x1_cat_tile") - y_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - y_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - y_tile_desc.set_name("y_cat_tile") - if self.dim == 0: - # Flattened offsets for dim=0 cat. 
- x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")] - x1_idx = [sympy.Symbol("index2") * x1_cols, sympy.Symbol("index3")] - y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")] - y1_idx = [(sympy.Symbol("index2") + x0_rows) * y_cols, sympy.Symbol("index3")] - else: - # Flattened offsets for dim=1 cat. - x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")] - x1_idx = [sympy.Symbol("index0") * x1_cols, sympy.Symbol("index3")] - y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")] - y1_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index3") + x0_cols] + # Extract info + input_nodes = self.input_nodes + y = self.output_node + num_inputs = len(self.input_nodes) + rank = len(y.get_size()) + + input_sizes = [x.get_size() for x in input_nodes] + output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim] + output_dim = [dim for dim, sz in enumerate(y.get_size()) if dim != self.dim] + tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes) + output_strides = y.get_layout().stride + + # Calculate input tile sizes + input_tile_sizes_dim = self._calculate_input_tile_sizes( + kernel, input_sizes, tile_sizes, num_inputs, rank + ) + buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes) + input_tile_descs, unique_tile_descs = self._build_tile_descriptors( + kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names + ) + y_tile_desc = self._build_output_tile_desc( + kernel, input_tile_sizes_dim, tile_sizes, rank + ) + + input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions( + input_nodes, input_sizes, output_strides, rank, num_inputs + ) + + # Map unique buffer names to their tile descriptors for template + unique_buffer_tile_descs = {} + for actual_name, template_name in buffer_name_to_template_name.items(): + if actual_name in unique_tile_descs: + 
unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name] + + names_str = ", ".join(input_buffer_names + ["out_ptr1" if is_out_variant else "Y"]) + indent_size = 2 + (rank - 1) * 2 + 4 kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, - X0=x0, - X1=x1, Y=y, OUT_DVAR="out_ptr1" if is_out_variant else "Y", - NAMES_STR="X0, X1, out_ptr1" if is_out_variant else "X0, X1, Y", + NAMES_STR=names_str, + INPUT_NAMES=input_nodes, + INPUT_BUFFER_NAMES=input_buffer_names, + NUM_INPUTS=num_inputs, + RANK=rank, DIM=self.dim, - X0_ROWS=x0_rows, - X1_ROWS=x1_rows, - ROWS=x0_rows, - X0_COLS=x0_cols, - X1_COLS=x1_cols, - COLS=x0_cols, - X0_TILE_DESC=x0_tile_desc, - X1_TILE_DESC=x1_tile_desc, - Y_TILE_DESC=y_tile_desc, - X0_IDX=x0_idx, - X1_IDX=x1_idx, - Y0_IDX=y0_idx, - Y1_IDX=y1_idx, + INPUT_SIZES=input_sizes, + OUTPUT_SIZES=output_sizes, + OUTPUT_DIM=output_dim, + TILE_SIZES=tile_sizes, + INPUT_TILE_SIZES_DIM=input_tile_sizes_dim, + INPUT_TILE_DESCS=input_tile_descs, + UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs, + INPUT_IDXS=input_idxs, + OUTPUT_IDXS=output_idxs, + CUMULATIVE_OFFSETS=cumulative_offsets, + INDENT_SIZE=indent_size, input_reorder=self.input_reorder, ) - # Needed when epilogue fusion requests set_ranges(). - kernel.dim_aliasing = {"index0": "index0", "index1": "index1"} - if hasattr(self.output_node, "node") and hasattr(self.output_node.node, "get_name"): - output_node_name = self.output_node.node.get_name() - elif hasattr(self.output_node, "get_name"): - output_node_name = self.output_node.get_name() - else: - output_node_name = self.output_node.name + self._setup_epilogue_info(kernel, y) + code = self._template_from_string(TEMPLATE).render(**kernel.render_options) + return code + + def get_tile_candidates( + self, + kernel: MLIRTemplateKernel, + template_buffer_node=None, + epilogue_nodes: Optional[List[IRNode]] = None, + **kwargs, + ): + """Generate tile candidates for cat operation. 
Concat dimension always has tile size 1.""" + if template_buffer_node is not None: + self.output_node = template_buffer_node + + y = self.output_node + num_inputs = len(self.input_nodes) + output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim] + num_non_dim_dims = len(output_sizes) + + if num_non_dim_dims == 0: + return [[1]] + + tile_candidates = [] + dim_tile_candidates = [] + + for dim_size in output_sizes: + dim_candidates = [] + max_tile = min(dim_size, kernel.spad_info["spad_size"] // (kernel.vector_lane * kernel.precision * 2 * num_inputs)) + + for mult in range(1, max_tile // kernel.vector_lane + 1): + tile = mult * kernel.vector_lane + if tile <= dim_size: + dim_candidates.append(tile) + if max_tile > 0: + for exp in range(int(math.log2(max_tile)) + 1): + tile = 2 ** exp + if tile <= dim_size and tile not in dim_candidates: + dim_candidates.append(tile) + + if dim_size not in dim_candidates: + dim_candidates.append(dim_size) + + dim_tile_candidates.append(sorted(set(dim_candidates))[:5]) + + for tile_combo in itertools.product(*dim_tile_candidates): + total_elements = math.prod(tile_combo) + total_spad_needed = total_elements * (num_inputs + 1) * kernel.precision + + if total_spad_needed <= kernel.spad_info["spad_size"] * kernel.vector_lane: + tile_candidates.append(list(tile_combo)) + + if not tile_candidates: + tile_candidates = [[1] * num_non_dim_dims] + + tile_candidates.sort(key=lambda x: -math.prod(x)) + return tile_candidates[:4] + + def _calculate_input_tile_sizes( + self, kernel, input_sizes, tile_sizes, num_inputs, rank + ): + """Calculate tile sizes for concat dimension for each input.""" + non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1 + non_dim_tile_spad = non_dim_tile_elements * kernel.precision + max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2 + extra_concat_input = math.ceil(max_spad_per_input / non_dim_tile_spad) - num_inputs + + input_tile_sizes_dim = [] + for i in 
range(num_inputs): + input_dim_size = input_sizes[i][self.dim] + if extra_concat_input > 0 and non_dim_tile_elements > 0: + max_tile_dim = min(input_dim_size, extra_concat_input) + extra_concat_input -= max_tile_dim + else: + max_tile_dim = 1 + input_tile_sizes_dim.append(max_tile_dim) + return input_tile_sizes_dim + + def _build_buffer_mapping(self, input_nodes): + """Map actual buffer names to template buffer names """ + buffer_name_to_template_name = {} + input_buffer_names = [] + for x in input_nodes: + actual_name = x.get_name() + template_name = buffer_name_to_template_name.setdefault( + actual_name, f"X{len(buffer_name_to_template_name)}" + ) + input_buffer_names.append(template_name) + return buffer_name_to_template_name, input_buffer_names + + def _build_tile_descriptors( + self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names + ): + """Build tile descriptors for each input.""" + input_tile_descs = [] + unique_tile_descs = {} + + for i, x in enumerate(input_nodes): + # Build full tile size list for this input + full_tile_sizes = [] + tile_size_idx = 0 + for d in range(rank): + if d != self.dim: + full_tile_sizes.append(tile_sizes[tile_size_idx]) + tile_size_idx += 1 + else: + full_tile_sizes.append(input_tile_sizes_dim[i]) + + tile_desc = mlir_common.MLIRMultiDimTile( + full_tile_sizes, + kernel.vector_lane, + vlane_split_axis=rank - 1, + vlane_stride=1 + ) + tile_desc.set_tile_size(full_tile_sizes) + template_buffer_name = input_buffer_names[i] + tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") + input_tile_descs.append(tile_desc) + + # Store unique tile desc by actual buffer name + actual_name = x.get_name() + if actual_name not in unique_tile_descs: + unique_tile_descs[actual_name] = tile_desc + + return input_tile_descs, unique_tile_descs + + def _build_index_expressions( + self, input_nodes, input_sizes, output_strides, rank, num_inputs + ): + """Build index expressions for input and 
output.""" + input_idxs = [] + output_idxs = [] + cumulative_offsets = [0] + for i in range(num_inputs - 1): + cumulative_offsets.append(cumulative_offsets[-1] + input_sizes[i][self.dim]) + + for i, x in enumerate(input_nodes): + x_stride = x.get_layout().stride + input_idx = [] + output_idx = [] + for d in range(rank): + if d != self.dim: + input_idx_symbol = sympy.Symbol(f"index{d}") + output_idx_symbol = sympy.Symbol(f"index{d}") + else: + input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}") + output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}") + input_idx.append(input_idx_symbol * x_stride[d]) + output_idx.append(output_idx_symbol * output_strides[d]) + input_idxs.append(input_idx) + output_idxs.append(output_idx) + + return input_idxs, output_idxs, cumulative_offsets + + def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank): + """Build output tile descriptor.""" + max_output_tile_dim = max(input_tile_sizes_dim) if input_tile_sizes_dim else 1 + output_full_tile_sizes = [] + tile_size_idx = 0 + for d in range(rank): + if d != self.dim: + output_full_tile_sizes.append(tile_sizes[tile_size_idx]) + tile_size_idx += 1 + else: + output_full_tile_sizes.append(max_output_tile_dim) + + y_tile_desc = mlir_common.MLIRMultiDimTile( + output_full_tile_sizes, + kernel.vector_lane, + vlane_split_axis=rank - 1, + vlane_stride=1 + ) + y_tile_desc.set_tile_size(output_full_tile_sizes) + y_tile_desc.set_name("y_cat_tile") + return y_tile_desc + + def _setup_epilogue_info(self, kernel, y): + """Setup epilogue information.""" if hasattr(y, "get_numel"): y_numel = y.get_numel() elif hasattr(y, "node") and hasattr(y.node, "get_numel"): @@ -154,14 +304,5 @@ def _as_int(v): else: y_numel = None - kernel.epilogue_info = dict( - output_node=output_node_name, - sram_var="y_cat_tile", - dram_var=kernel.render_options["OUT_DVAR"], - dram_tile_desc=y_tile_desc, - ) if y_numel is not None: kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = 
{"numel": y_numel} - - code = self._template_from_string(TEMPLATE).render(**kernel.render_options) - return code diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index f8566b6d..f72a7663 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -12,6 +12,9 @@ class MLIRConvCommonTemplate(MLIRTemplate): WRAPPER_TEMPLATE = None def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): super().__init__("kernel", input_nodes, layout, input_reorder) + self.support_epilogue_fusion = True + self.support_prologue_fusion = False + self.support_reduction_fusion = False self.stride = kwargs["stride"] self.padding = kwargs["padding"] self.dilation = kwargs["dilation"] diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 0158caa6..5b116807 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -105,6 +105,9 @@ class MLIRGemmTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, input_reorder=None): super().__init__("kernel", input_nodes, layout, input_reorder) + self.support_epilogue_fusion = True + self.support_prologue_fusion = True + self.support_reduction_fusion = True def render(self, kernel: MLIRTemplateKernel, diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 0f28f03b..d7aee715 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -202,48 +202,9 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout: stride, ) - -def _can_use_cat_template(tensors: Sequence[TensorBox], dim: int) -> bool: - # Current template specialization: 2 inputs, rank-2, dim in {0, 1}. 
- if len(tensors) != 2: - return False - if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors): - return False - if tensors[0].get_dtype() != tensors[1].get_dtype(): - return False - rank0 = len(tensors[0].get_size()) - rank1 = len(tensors[1].get_size()) - if rank0 != 2 or rank1 != 2: - return False - if dim < 0: - dim += rank0 - if dim not in (0, 1): - return False - - if dim == 0: - cols0 = tensors[0].get_size()[1] - cols1 = tensors[1].get_size()[1] - return V.graph.sizevars.statically_known_equals(cols0, cols1) - - rows0 = tensors[0].get_size()[0] - rows1 = tensors[1].get_size()[0] - return V.graph.sizevars.statically_known_equals(rows0, rows1) - - -def _cat_fallback(reason: str, tensors: Sequence[TensorBox], dim: int): - # Non-template cases delegate to the original lowering path. - return _orig_cat_default_lowering(tensors, dim) - - -def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0): - if _orig_cat_default_lowering is None: - raise RuntimeError("Original aten.cat.default lowering is missing") - if len(tensors) > 0: - rank = len(tensors[0].get_size()) - if dim < 0: - dim += rank - if not _can_use_cat_template(tensors, dim): - return _cat_fallback("default-path", tensors, dim) +def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): + if tensors and dim < 0: + dim += len(tensors[0].get_size()) for t in tensors: t.realize() @@ -251,75 +212,6 @@ def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0): mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim) return mlir_template.generate().output_node() - -def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): - return _custom_cat_impl(tensors, dim) - - -def custom_cat_out(tensors: Sequence[TensorBox], dim: int = 0, out: Optional[TensorBox] = None): - if _orig_cat_out_lowering is None: - raise RuntimeError("Original aten.cat.out lowering is missing") - if out is None: - return _orig_cat_out_lowering(tensors, 
dim, out) - - copy_default_lowering = lowerings.get(aten.copy_.default) - slice_tensor_lowering = lowerings.get(aten.slice.Tensor) - if copy_default_lowering is None or slice_tensor_lowering is None: - raise RuntimeError("cat.out lowering requires aten.copy_.default and aten.slice.Tensor lowerings") - - # Lower cat.out as a sequence of slice+copy ops so each piece still runs - # through the existing compiled/simulated kernel path. - if len(tensors) == 0: - raise RuntimeError("cat.out requires at least one input tensor") - if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors): - raise RuntimeError("cat.out inputs must be tensor-like values") - rank = len(tensors[0].get_size()) - if rank == 0: - raise RuntimeError("cat.out does not support scalar inputs") - if dim < 0: - dim = dim + rank - if dim < 0 or dim >= rank: - raise RuntimeError(f"cat.out dim out of range: dim={dim}, rank={rank}") - if any(len(t.get_size()) != rank for t in tensors): - raise RuntimeError("cat.out inputs must have the same rank") - if any(t.get_dtype() != tensors[0].get_dtype() for t in tensors): - raise RuntimeError("cat.out inputs must have the same dtype") - # cat semantics: all non-cat dimensions must be equal. - for i in range(rank): - if i == dim: - continue - base = tensors[0].get_size()[i] - if any(not V.graph.sizevars.statically_known_equals(base, t.get_size()[i]) for t in tensors[1:]): - raise RuntimeError(f"cat.out non-concatenated dimension mismatch at dim={i}") - - # Output shape must match concatenated shape. 
- if not hasattr(out, "get_size"): - raise RuntimeError("cat.out output must be tensor-like") - out_sizes = list(out.get_size()) - if len(out_sizes) != rank: - raise RuntimeError("cat.out output rank mismatch") - for i in range(rank): - if i == dim: - continue - if not V.graph.sizevars.statically_known_equals(out_sizes[i], tensors[0].get_size()[i]): - raise RuntimeError(f"cat.out output shape mismatch at dim={i}") - expected_cat = sum(t.get_size()[dim] for t in tensors) - if not V.graph.sizevars.statically_known_equals(out_sizes[dim], expected_cat): - raise RuntimeError(f"cat.out output concatenated dimension mismatch at dim={dim}") - - if isinstance(out, TensorBox): - out.realize() - - offset = 0 - for src in tensors: - src.realize() - end = offset + src.get_size()[dim] - dst_view = slice_tensor_lowering(out, dim, offset, end, 1) - copy_default_lowering(dst_view, src) - offset = end - return out - - def _custom_sort_values_impl( self: TensorBox, dim: int = -1, @@ -459,9 +351,7 @@ def custom_sort_values_stable( lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) - -lowerings.update({aten.cat.default: custom_cat_default}) -lowerings.update({aten.cat.out: custom_cat_out}) +lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()}) lowerings.update({aten.sort.stable: custom_sort_stable}) lowerings.update({aten.sort.values_stable: custom_sort_values_stable}) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index af960533..2f9c9704 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -44,12 +44,10 @@ def can_fuse_with_exceptions(self, 
node1: BaseSchedulerNode, node2: BaseSchedule # Case 3: Prologue(Pointwise) + Tempalte if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE: - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate - target_node = base_template_node2[0].node - # Currently only BMM, MM support prologue fusion - if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + + # Check if template supports prologue fusion + if not getattr(target_node.template, 'support_prologue_fusion', False): return False if len(node1.read_writes.writes) != 1: @@ -129,12 +127,14 @@ def can_fuse_horizontal(self, node1, node2): if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction(): # Don't fuse maxpool template code from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate template_node = base_template_node1[0] epilogue_node = node2 + # Check if template supports epilogue fusion + if not getattr(template_node.node.template, 'support_epilogue_fusion', False): + return False + if isinstance(template_node.node.template, MLIRMaxPoolTemplate): return False @@ -161,7 +161,7 @@ def can_fuse_horizontal(self, node1, node2): # Revert act_node.group : simplify_and_reorder() modified _body, _size, group if template_node.group != epilogue_node.group: # We don't fuse this case... 
- if (isinstance(template_node.node.template, MLIRBMMTemplate) or isinstance(template_node.node.template, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1: + if getattr(template_node.node.template, 'support_prologue_fusion', False) and template_node.group[1][0][0] == 1: return False if list(template_node.group[1][0]) != list(epilogue_node.get_nodes()[0].node.data.get_size()): @@ -171,10 +171,10 @@ def can_fuse_horizontal(self, node1, node2): # Case 2: Tempalte + Reduction fusion if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE: - from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate - from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate target_node = base_template_node1[0].node - if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)): + + # Check if template supports reduction fusion + if not getattr(target_node.template, 'support_reduction_fusion', False): return False size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 76b0ef71..04d327f8 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -14,7 +14,7 @@ from unittest.mock import patch from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine -from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller +from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller, ir_node_to_tensor from torch._inductor.select_algorithm import PartialRender from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller from torch._inductor.autotune_process import 
TensorMeta @@ -124,6 +124,7 @@ def __init__(self, self.epilogue_buffer_group = IndentedBufferGroup(self, prefix="epilogue_") self.global_vars = IndentedBuffer() self.exception_nodes = {} + self.epilogue_info = {} # Reduction data structure self.reduction_epilogue_suffix = IndentedBuffer() self.reduction_fusion = False @@ -403,7 +404,7 @@ def call_kernel(self, kernel_name): _, call_args, _, _ = self.kernel_group.args.mlir_argdefs() # generate the code to call this wrapper.generate_kernel_call( - kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args) + kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args) def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info): with self as kernel: @@ -460,11 +461,11 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_ } node.codegen((vars, reduction_vars)) - # Codegen epilogue nodes - tile_desc = kernel.set_tile_size(kernel.epilogue_info) - kernel.kernel_group.set_tile_info(tile_desc) - kernel.call_ranges = None if epilogue_nodes: + # Codegen epilogue nodes + tile_desc = kernel.set_tile_size(kernel.epilogue_info) + kernel.kernel_group.set_tile_info(tile_desc) + kernel.call_ranges = None with kernel.epilogue_buffer_group.as_local(): _, (group, reduction_group) = max( epilogue_nodes, key=lambda x: int(x.is_reduction()) @@ -625,7 +626,9 @@ def def_kernel( extra_node[node.get_name()] = node.node else: extra_node[node.get_name()] = node - self.buffer_names[node.get_name()] = self.epilogue_info['sram_var'] + + if 'sram_var' in self.epilogue_info: + self.buffer_names[node.get_name()] = self.epilogue_info['sram_var'] def hook(): arg_defs, call_args, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node) @@ -688,7 +691,8 @@ def def_conv_kernel( self.kernel_group.args.output_buffers[node.get_name()] = name self.store_buffer_names.add(node.get_name()) #TODO: Is this enough not 
calling store() in mlir_common.py? self.extra_node[node.get_name()] = node - self.buffer_names[node.get_name()] = self.epilogue_info['sram_var'] #TODO: Buffer name fixed + if 'sram_var' in self.epilogue_info: + self.buffer_names[node.get_name()] = self.epilogue_info['sram_var'] #TODO: Buffer name fixed def kernel_hook(): arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node) @@ -1146,6 +1150,15 @@ def set_tile_size(self, template_fusion_info, prologue=False): return tile_desc class MLIRTemplateCaller(CUDATemplateCaller): + def __init__(self, name, category, input_nodes, layout, make_kernel_render, supports_epilogue_fusion, template, info_kwargs, description): + bmreq = MLIRBenchmarkRequest( + kernel_name=name, + input_tensor_meta=list(), + output_tensor_meta=list(), + extra_args=[], + source_code="", + ) + super().__init__(name, category, input_nodes, layout, make_kernel_render, bmreq, supports_epilogue_fusion, template, info_kwargs, description) def __str__(self): return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})" @@ -1173,6 +1186,10 @@ def __init__(self, name, input_nodes, layout, input_reorder = None): self.output_nodes = [self.output_node] self.input_reorder = input_reorder self.layout = layout + # Fusion support flags (default to False) + self.support_epilogue_fusion = False + self.support_prologue_fusion = False + self.support_reduction_fusion = False def generate(self, **kwargs) -> ChoiceCaller: kernel_name = f"mlir_{self.name}" @@ -1184,18 +1201,9 @@ def generate(self, **kwargs) -> ChoiceCaller: code = self.render(kernel=kernel, **kwargs) kernel_hash_name = f"mlir_{self.name}_{next(self.index_counter)}" - extra_args = [] # create the BenchmarkRequest output_nodes = getattr(self, "output_nodes", None) or [self.output_node] - bmreq = MLIRBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(output_nodes), - 
extra_args=extra_args, - source_code=code, - ) - def make_kernel_render( template_node: TemplateBuffer, prologue_nodes: Optional[List[IRNode]] = None, @@ -1236,7 +1244,6 @@ def make_kernel_render( self.input_nodes, self.output_node.get_layout(), make_kernel_render, - bmreq, False, # supports_epilogue_fusion self, kwargs, diff --git a/tests/test_cat.py b/tests/test_cat.py index 32573a05..62de6759 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -20,24 +20,6 @@ def _test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): print("cpu out: ", cpu_out) raise RuntimeError(f"{name} mismatch") - -def _togsim_log_count() -> int: - log_dir = Path("togsim_results") - if not log_dir.exists(): - return 0 - return len(list(log_dir.glob("*.log"))) - - -def _assert_simulation_happened(before_count: int, case_name: str): - after_count = _togsim_log_count() - if after_count <= before_count: - raise RuntimeError( - f"{case_name}: TOGSim log count did not increase " - f"(before={before_count}, after={after_count})" - ) - print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})") - - def test_cat_default(device): def cat_default_fn(a, b): return torch.cat([a, b], dim=0) @@ -46,9 +28,7 @@ def cat_default_fn(a, b): y = torch.randn(6, 16, device=device) opt_fn = torch.compile(dynamic=False)(cat_default_fn) - before = _togsim_log_count() out = opt_fn(x, y) - _assert_simulation_happened(before, "cat.default") cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) _test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4) @@ -63,19 +43,122 @@ def cat_out_fn(a, b, out): out_buf = torch.empty(14, 16, device=device) opt_fn = torch.compile(dynamic=False)(cat_out_fn) - before = _togsim_log_count() out = opt_fn(x, y, out_buf) - _assert_simulation_happened(before, "cat.out") cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) _test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4) +def test_cat_4d_dim0(device): + def cat_4d_dim0_fn(a, b): + return torch.cat([a, b], dim=0) + + 
x = torch.randn(2, 3, 4, 5, device=device) + y = torch.randn(3, 3, 4, 5, device=device) + opt_fn = torch.compile(dynamic=False)(cat_4d_dim0_fn) + + out = opt_fn(x, y) + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0) + _test_result("cat.4d.dim0", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_4d_dim1(device): + def cat_4d_dim1_fn(a, b): + return torch.cat([a, b], dim=1) + + x = torch.randn(2, 3, 4, 5, device=device) + y = torch.randn(2, 5, 4, 5, device=device) + opt_fn = torch.compile(dynamic=False)(cat_4d_dim1_fn) + + out = opt_fn(x, y) + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=1) + _test_result("cat.4d.dim1", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_4d_dim2(device): + def cat_4d_dim2_fn(a, b): + return torch.cat([a, b], dim=2) + + x = torch.randn(2, 3, 4, 5, device=device) + y = torch.randn(2, 3, 6, 5, device=device) + opt_fn = torch.compile(dynamic=False)(cat_4d_dim2_fn) + + out = opt_fn(x, y) + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=2) + _test_result("cat.4d.dim2", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_4d_dim3(device): + def cat_4d_dim3_fn(a, b): + return torch.cat([a, b], dim=3) + + x = torch.randn(2, 3, 4, 5, device=device) + y = torch.randn(2, 3, 4, 7, device=device) + opt_fn = torch.compile(dynamic=False)(cat_4d_dim3_fn) + + out = opt_fn(x, y) + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=3) + _test_result("cat.4d.dim3", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_three_inputs(device): + def cat_three_inputs_fn(a, b, c): + return torch.cat([a, b, c], dim=0) + + x = torch.randn(4, 16, device=device) + y = torch.randn(5, 16, device=device) + z = torch.randn(3, 16, device=device) + opt_fn = torch.compile(dynamic=False)(cat_three_inputs_fn) + + out = opt_fn(x, y, z) + + cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=0) + _test_result("cat.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_four_inputs(device): + def cat_four_inputs_fn(a, b, c, d): + return torch.cat([a, b, c, 
d], dim=0) + + x = torch.randn(3, 16, device=device) + y = torch.randn(4, 16, device=device) + z = torch.randn(5, 16, device=device) + w = torch.randn(2, 16, device=device) + opt_fn = torch.compile(dynamic=False)(cat_four_inputs_fn) + + out = opt_fn(x, y, z, w) + + cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu(), w.cpu()], dim=0) + _test_result("cat.four_inputs", out, cpu_out, rtol=1e-4, atol=1e-4) + + +def test_cat_4d_three_inputs(device): + def cat_4d_three_inputs_fn(a, b, c): + return torch.cat([a, b, c], dim=1) + + x = torch.randn(2, 3, 4, 5, device=device) + y = torch.randn(2, 4, 4, 5, device=device) + z = torch.randn(2, 5, 4, 5, device=device) + opt_fn = torch.compile(dynamic=False)(cat_4d_three_inputs_fn) + + out = opt_fn(x, y, z) + + cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=1) + _test_result("cat.4d.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4) + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run cat simulation tests") parser.add_argument( "--case", - choices=["default", "out", "all"], + choices=[ + "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3", + "three_inputs", "four_inputs", "4d_three_inputs", "all" + ], default="all", help="Which cat case to run", ) @@ -87,3 +170,17 @@ def cat_out_fn(a, b, out): test_cat_default(device) if args.case in ("out", "all"): test_cat_out(device) + if args.case in ("4d_dim0", "all"): + test_cat_4d_dim0(device) + if args.case in ("4d_dim1", "all"): + test_cat_4d_dim1(device) + if args.case in ("4d_dim2", "all"): + test_cat_4d_dim2(device) + if args.case in ("4d_dim3", "all"): + test_cat_4d_dim3(device) + if args.case in ("three_inputs", "all"): + test_cat_three_inputs(device) + if args.case in ("four_inputs", "all"): + test_cat_four_inputs(device) + if args.case in ("4d_three_inputs", "all"): + test_cat_4d_three_inputs(device) From 434bbb10793a68172e49e107bc3b639fd3b86264 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 4 Mar 2026 20:02:14 +0900 Subject: [PATCH 
113/194] [WIP] --- PyTorchSimFrontend/mlir/mlir_cat_template.py | 13 ------------- PyTorchSimFrontend/mlir/mlir_template.py | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index d68af7d4..5062e629 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -118,7 +118,6 @@ def render( input_reorder=self.input_reorder, ) - self._setup_epilogue_info(kernel, y) code = self._template_from_string(TEMPLATE).render(**kernel.render_options) return code @@ -294,15 +293,3 @@ def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank y_tile_desc.set_tile_size(output_full_tile_sizes) y_tile_desc.set_name("y_cat_tile") return y_tile_desc - - def _setup_epilogue_info(self, kernel, y): - """Setup epilogue information.""" - if hasattr(y, "get_numel"): - y_numel = y.get_numel() - elif hasattr(y, "node") and hasattr(y.node, "get_numel"): - y_numel = y.node.get_numel() - else: - y_numel = None - - if y_numel is not None: - kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": y_numel} diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 04d327f8..59610228 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -813,7 +813,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com if dram_var in self.exception_nodes: numel = self.exception_nodes[dram_var]["numel"] else: - numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel() + numel = self.named_nodes[dram_var].get_numel() mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype] dram_shape = f"memref<{numel}x{mlir_dtype}>" dram_stride = [] From 5295dfb5a16e21fda57b12d73906c1bd290c4f94 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 4 Mar 2026 22:13:26 +0900 Subject: [PATCH 114/194] 
[Template] Delay def_dma_op codegen def_dma_op find data node using dram_var. But it can't locate the proper node when output buffer has not been created. --- PyTorchSimFrontend/mlir/mlir_template.py | 146 +++++++++++++---------- 1 file changed, 81 insertions(+), 65 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 59610228..7c52bfe6 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -112,7 +112,8 @@ def __init__(self, self.outer_func_name = outer_func_name self.outer_func_render = outer_func_render self.kernel_arg_attributes = kernel_arg_attributes - self.render_hooks = OrderedDict() + self.render_hooks = OrderedDict() # Stores {key: (priority, hook)} + self.dma_op_counter = itertools.count() # Add counter for unique DMA op keys self.buffer_names = dict() self.render_options = dict() self.tile_size = [] @@ -555,7 +556,7 @@ def template_store(): dram_var = self.epilogue_info["dram_var"] index_list = self.epilogue_info["dram_idx"] tile_desc = self.epilogue_info["dram_tile_desc"] - code = self.def_dma_op("MVOUT", dram_var, index_list, tile_desc) + code = self.def_dma_op("MVOUT", dram_var, index_list, tile_desc, lazy_mode=False) self.cse.generate(self.dma_stores, code, assignment = False) body = IndentedBuffer() @@ -653,7 +654,7 @@ def hook(): return f"({', '.join(renamed_arg_defs)})" assert "" not in self.render_hooks - self.render_hooks[""] = hook + self.render_hooks[""] = (5, hook) # Default priority 5 return "" # This function is a temporal function for convolution because currently convolution kernel is not considering padding. @@ -700,7 +701,7 @@ def kernel_hook(): return f"({', '.join(arg_defs)})" assert "" not in self.render_hooks - self.render_hooks[""] = kernel_hook + self.render_hooks[""] = (5, kernel_hook) # Default priority 5 return "" # This function is for convolution wrapper function finalizing. 
@@ -711,7 +712,7 @@ def wrapper_hook(): return f"({', '.join(wrapper_arg_defs)})" if "" not in self.render_hooks: - self.render_hooks[""] = wrapper_hook + self.render_hooks[""] = (5, wrapper_hook) # Default priority 5 return "" def get_conv_inputs(self): @@ -720,15 +721,15 @@ def get_conv_inputs(self): def get_conv_outputs(self): return {k: v for k, v in self.kernel_group.args.output_buffers.items() if v != 'REMOVED'} - def load_input(self, indent_size: int = 0): + def load_input(self, indent_size: int = 0, priority: int = 1): def hook(): code = IndentedBuffer() prologue_code = self.codegen_prologue_body() if prologue_code.getvalue(): input_dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"], - self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False) + self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False, lazy_mode=False) weight_dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], self.prologue_info["weight_idx"], - self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False) + self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False, lazy_mode=False) if (self.prologue_info["is_input_fused"]): code.splice(input_dma_code) code.splice(prologue_code) @@ -739,58 +740,63 @@ def hook(): code.splice(input_dma_code) else: dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"], - self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False) + self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False, lazy_mode=False) code.splice(dma_code) dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], 
self.prologue_info["weight_idx"], - self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False) + self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False, lazy_mode=False) code.splice(dma_code) code = textwrap.indent(code.getvalue(), " "*indent_size).strip() return code assert "" not in self.render_hooks - self.render_hooks[""] = hook - self.render_hooks.move_to_end("", last=False) # Force order to be triggered first + self.render_hooks[""] = (priority, hook) return "" - def store_output(self, indent_size: int = 0): + def store_output(self, indent_size: int = 0, priority: int = 1): def hook(): epilogue_code = self.codegen_epilogue_body() return textwrap.indent(epilogue_code.getvalue(), " "*indent_size).strip() assert "" not in self.render_hooks - self.render_hooks[""] = hook - self.render_hooks.move_to_end("", last=False) # Force order to be triggered first + self.render_hooks[""] = (priority, hook) return "" - def reduction_output(self, indent_size: int = 0): + def reduction_output(self, indent_size: int = 0, priority: int = 5): def hook(): return textwrap.indent(self.reductions_suffix.getvalue(), " "*indent_size).strip() assert "" not in self.render_hooks - self.render_hooks[""] = hook + self.render_hooks[""] = (priority, hook) return "" + def _sort_hooks_by_priority(self): + """Sort hooks by priority (lower priority executes first).""" + sorted_hooks = OrderedDict() + for key, (priority, hook) in sorted(self.render_hooks.items(), key=lambda x: x[1][0]): + sorted_hooks[key] = hook + return sorted_hooks + def def_function(self): _, call_args, _, _ = self.kernel_group.args.python_argdefs() if self.outer_func_render is not None: partial_code, function_name = self.outer_func_render(input_args=call_args) + return PartialRender( partial_code, - self.render_hooks, + self._sort_hooks_by_priority(), ), function_name else: return None, None - def 
def_global_vars(self): + def def_global_vars(self, priority: int = 10): key = "" def hook(): return textwrap.indent(self.global_vars.getvalue(), "").strip() - assert key not in self.render_hooks - self.render_hooks[key] = hook + self.render_hooks[key] = (priority, hook) return key - def def_local_vars(self, indent_size=0): + def def_local_vars(self, indent_size=0, priority: int = 10): key = "" def hook(): code = IndentedBuffer() @@ -799,52 +805,62 @@ def hook(): code.splice(self.alloc_buffer) return textwrap.indent(code.getvalue(), " "*indent_size).strip() - assert key not in self.render_hooks - self.render_hooks[key] = hook + self.render_hooks[key] = (priority, hook) return key def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile, - subtile_size:list=[], async_type=None, indent_size=0): - # Prepare code block - local_code = IndentedBuffer() - with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse): - index_var = self.parse_index_list(index_list, offset=tile_desc.offset) - node_layout = self.named_nodes[dram_var].get_layout() - if dram_var in self.exception_nodes: - numel = self.exception_nodes[dram_var]["numel"] - else: - numel = self.named_nodes[dram_var].get_numel() - mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype] - dram_shape = f"memref<{numel}x{mlir_dtype}>" - dram_stride = [] - for idx in index_list: - if idx.is_Mul: - dram_stride.append(int(idx.args[0])) - elif idx == sympy.Symbol("c0"): - dram_stride.append(0) - elif not idx.is_Number: - dram_stride.append(1) + subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True): + def generate_dma_code(): + """Internal method to generate DMA code directly.""" + local_code = IndentedBuffer() + with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse): + index_var = self.parse_index_list(index_list, offset=tile_desc.offset) + node_layout = self.named_nodes[dram_var].get_layout() + if 
dram_var in self.exception_nodes: + numel = self.exception_nodes[dram_var]["numel"] else: - dram_stride.append(0) - - sram_var = tile_desc.get_name() - tile_shape = tile_desc.get_mlir_shape(mlir_dtype) - tile_stride = tile_desc.get_tile_stride() - vlane_split_axis = tile_desc.vmap.vlane_split_axis - vlane_stride = tile_desc.vmap.vlane_stride - - zero_cse = self.get_const_cse(0, "index") - sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) - - attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"] - if subtile_size: - attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}") - attribute = " {" + ", ".join(attribute_parts) + "}" - code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, - dram_shape, tile_shape, "") - local_code.writeline(code) - local_code.writeline(attribute) - return textwrap.indent(local_code.getvalue(), " "*indent_size).strip() + numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel() + mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype] + dram_shape = f"memref<{numel}x{mlir_dtype}>" + dram_stride = [] + for idx in index_list: + if idx.is_Mul: + dram_stride.append(int(idx.args[0])) + elif idx == sympy.Symbol("c0"): + dram_stride.append(0) + elif not idx.is_Number: + dram_stride.append(1) + else: + dram_stride.append(0) + + sram_var = tile_desc.get_name() + tile_shape = tile_desc.get_mlir_shape(mlir_dtype) + tile_stride = tile_desc.get_tile_stride() + vlane_split_axis = tile_desc.vmap.vlane_split_axis + vlane_stride = tile_desc.vmap.vlane_stride + + zero_cse = self.get_const_cse(0, "index") + sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) + + attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"] + if subtile_size: + attribute_parts.append(f"subtile_size={subtile_size}, 
async={int(async_type) if async_type is not None else 1}") + attribute = " {" + ", ".join(attribute_parts) + "}" + code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, + dram_shape, tile_shape, "") + local_code.writeline(code) + local_code.writeline(attribute) + return textwrap.indent(local_code.getvalue(), " "*indent_size).strip() + + if not lazy_mode: + # Immediate mode: generate code directly and return it + return generate_dma_code() + + # Lazy mode: register hook and return key + dma_op_id = next(self.dma_op_counter) + key = f"" + self.render_hooks[key] = (priority, generate_dma_code) + return key def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0): # Prepare code block @@ -862,7 +878,7 @@ def render(self, template, kwargs, define_function=None): return PartialRender( code, - self.render_hooks, + self._sort_hooks_by_priority(), ) def get_spad_size_per_lane(self, tile_m, tile_n): From 61caebd5708ca21a88950d4d5073445891ea32f1 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Mar 2026 00:12:49 +0900 Subject: [PATCH 115/194] [Template/Cat] Fix apply offset setting --- PyTorchSimFrontend/mlir/mlir_cat_template.py | 80 +++++++++----------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 5062e629..5aaf3e71 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -26,7 +26,7 @@ affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} { %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }}) {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], INPUT_TILE_DESCS[i], 
indent_size=INDENT_SIZE) }} + {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], OUTPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} } { inner_loop=true } {%- endfor %} @@ -52,10 +52,6 @@ def render( tile_info=None, **kwargs, ): - is_out_variant = template_buffer_node is not None - if is_out_variant: - self.output_node = template_buffer_node - # Extract info input_nodes = self.input_nodes y = self.output_node @@ -73,11 +69,8 @@ def render( kernel, input_sizes, tile_sizes, num_inputs, rank ) buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes) - input_tile_descs, unique_tile_descs = self._build_tile_descriptors( - kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names - ) - y_tile_desc = self._build_output_tile_desc( - kernel, input_tile_sizes_dim, tile_sizes, rank + input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors( + kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y ) input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions( @@ -90,14 +83,14 @@ def render( if actual_name in unique_tile_descs: unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name] - names_str = ", ".join(input_buffer_names + ["out_ptr1" if is_out_variant else "Y"]) + names_str = ", ".join(input_buffer_names + ["Y"]) indent_size = 2 + (rank - 1) * 2 + 4 kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, Y=y, - OUT_DVAR="out_ptr1" if is_out_variant else "Y", + OUT_DVAR="Y", NAMES_STR=names_str, INPUT_NAMES=input_nodes, INPUT_BUFFER_NAMES=input_buffer_names, @@ -110,6 +103,7 @@ def render( TILE_SIZES=tile_sizes, INPUT_TILE_SIZES_DIM=input_tile_sizes_dim, INPUT_TILE_DESCS=input_tile_descs, + OUTPUT_TILE_DESCS=output_tile_descs, UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs, INPUT_IDXS=input_idxs, OUTPUT_IDXS=output_idxs, @@ -209,14 +203,16 @@ def _build_buffer_mapping(self, input_nodes): 
return buffer_name_to_template_name, input_buffer_names def _build_tile_descriptors( - self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names + self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node ): - """Build tile descriptors for each input.""" + """Build tile descriptors for each input and output.""" input_tile_descs = [] + output_tile_descs = [] unique_tile_descs = {} + output_offset = output_node.get_layout().offset for i, x in enumerate(input_nodes): - # Build full tile size list for this input + x_offset = x.get_layout().offset full_tile_sizes = [] tile_size_idx = 0 for d in range(rank): @@ -226,23 +222,37 @@ def _build_tile_descriptors( else: full_tile_sizes.append(input_tile_sizes_dim[i]) - tile_desc = mlir_common.MLIRMultiDimTile( + # Input tile descriptor + input_tile_desc = mlir_common.MLIRMultiDimTile( full_tile_sizes, kernel.vector_lane, vlane_split_axis=rank - 1, vlane_stride=1 ) - tile_desc.set_tile_size(full_tile_sizes) + input_tile_desc.set_tile_size(full_tile_sizes) template_buffer_name = input_buffer_names[i] - tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") - input_tile_descs.append(tile_desc) + input_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") + input_tile_desc.offset = x_offset + input_tile_descs.append(input_tile_desc) + + # Output tile descriptor (same as input but with output offset) + output_tile_desc = mlir_common.MLIRMultiDimTile( + full_tile_sizes, + kernel.vector_lane, + vlane_split_axis=rank - 1, + vlane_stride=1 + ) + output_tile_desc.set_tile_size(full_tile_sizes) + output_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") + output_tile_desc.offset = output_offset + output_tile_descs.append(output_tile_desc) # Store unique tile desc by actual buffer name actual_name = x.get_name() if actual_name not in unique_tile_descs: - unique_tile_descs[actual_name] = tile_desc + 
unique_tile_descs[actual_name] = input_tile_desc - return input_tile_descs, unique_tile_descs + return input_tile_descs, output_tile_descs, unique_tile_descs def _build_index_expressions( self, input_nodes, input_sizes, output_strides, rank, num_inputs @@ -256,6 +266,12 @@ def _build_index_expressions( for i, x in enumerate(input_nodes): x_stride = x.get_layout().stride + x_offset = x.get_layout().offset + if hasattr(x, 'data') and hasattr(x.data, 'dims'): + # In case of PermuteView, the stride is permuted + perm_dims = x.data.dims + x_stride = [x_stride[perm_dims[d]] for d in range(rank)] + input_idx = [] output_idx = [] for d in range(rank): @@ -271,25 +287,3 @@ def _build_index_expressions( output_idxs.append(output_idx) return input_idxs, output_idxs, cumulative_offsets - - def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank): - """Build output tile descriptor.""" - max_output_tile_dim = max(input_tile_sizes_dim) if input_tile_sizes_dim else 1 - output_full_tile_sizes = [] - tile_size_idx = 0 - for d in range(rank): - if d != self.dim: - output_full_tile_sizes.append(tile_sizes[tile_size_idx]) - tile_size_idx += 1 - else: - output_full_tile_sizes.append(max_output_tile_dim) - - y_tile_desc = mlir_common.MLIRMultiDimTile( - output_full_tile_sizes, - kernel.vector_lane, - vlane_split_axis=rank - 1, - vlane_stride=1 - ) - y_tile_desc.set_tile_size(output_full_tile_sizes) - y_tile_desc.set_name("y_cat_tile") - return y_tile_desc From 47684a75942bf9d35e19a7a79a1862418c5649a6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Mar 2026 17:44:32 +0900 Subject: [PATCH 116/194] [TOGSim] Add help print --- TOGSim/src/DMA.cc | 2 +- TOGSim/src/helper/CommandLineParser.cc | 6 +++++- TOGSim/src/helper/CommandLineParser.h | 8 +++++++- TOGSim/src/main.cc | 13 +++++++++---- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc index f8f21025..fefee6d2 100644 --- a/TOGSim/src/DMA.cc +++ 
b/TOGSim/src/DMA.cc @@ -12,7 +12,7 @@ void DMA::issue_tile(std::shared_ptr inst) { _current_inst = std::move(inst); std::vector& tile_size = _current_inst->get_tile_size(); if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) { - spdlog::error("[DMA {}] issued tile is not supported format..", _id); + spdlog::error("[DMA {}] issued tile is not supported format.. tile.size: {}, tile_size: [{}]", _id, tile_size.size(), fmt::join(tile_size, ", ")); exit(EXIT_FAILURE); } _finished = false; diff --git a/TOGSim/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc index 66aebbe1..9cd177ac 100644 --- a/TOGSim/src/helper/CommandLineParser.cc +++ b/TOGSim/src/helper/CommandLineParser.cc @@ -12,9 +12,13 @@ void CommandLineParser::parse(int argc, char **argv) noexcept(false) { po::notify(variables_map); } +void CommandLineParser::print_help_message() const noexcept { + std::cout << options_description << std::endl; +} + void CommandLineParser::print_help_message_if_required() const noexcept { if (variables_map.count("help") > 0) { - std::cout << options_description << std::endl; + print_help_message(); exit(0); } } diff --git a/TOGSim/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h index 39174d5d..b41eabf3 100644 --- a/TOGSim/src/helper/CommandLineParser.h +++ b/TOGSim/src/helper/CommandLineParser.h @@ -19,7 +19,7 @@ class CommandLineParser { * Command Line Parser constructor */ CommandLineParser() noexcept { - options_description.add_options()("help", "Prints help message"); + options_description.add_options()("help,h", "Prints help message"); } /** @@ -38,6 +38,12 @@ class CommandLineParser { */ void print_help_message_if_required() const noexcept; + /** + * Prints the help message. + * (Can be called to show help for invalid options) + */ + void print_help_message() const noexcept; + /** * Add a new command line argument option. 
* (Should be called before `parse` method is called) diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 7c596af5..cda8f986 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -96,19 +96,24 @@ int main(int argc, char** argv) { // parse command line argumnet CommandLineParser cmd_parser = CommandLineParser(); cmd_parser.add_command_line_option( - "config", "Path for hardware configuration file"); + "config", "Path for hardware configuration file (.yml)"); cmd_parser.add_command_line_option( - "models_list", "Path for the models list file (can be FIFO or regular file)"); + "models_list", "Path for the trace file (.trace)"); cmd_parser.add_command_line_option( "log_level", "Set for log level [trace, debug, info], default = info"); try { cmd_parser.parse(argc, argv); } catch (const CommandLineParser::ParsingError& e) { spdlog::error( - "Command line argument parrsing error captured. Error message: {}", + "Command line argument parsing error captured. Error message: {}", e.what()); - throw(e); + std::cerr << std::endl; + cmd_parser.print_help_message(); + exit(1); } + + // Check if help was requested + cmd_parser.print_help_message_if_required(); std::string level = "info"; cmd_parser.set_if_defined("log_level", &level); From a24f1f1081a4ce7e5e09a59f61763850d11d994f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Mar 2026 17:45:00 +0900 Subject: [PATCH 117/194] [Template/Cat] Limit maximum rank of tile --- PyTorchSimFrontend/mlir/mlir_cat_template.py | 52 +++++++++++++++----- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 5aaf3e71..2a00ce95 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -64,17 +64,30 @@ def render( tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes) output_strides = y.get_layout().stride + excluded_dims = list() + max_tiled_dims 
= 4 - 1 + if len(tile_sizes) > max_tiled_dims: + # Create index:tile_size dictionary and sort by tile_size + dim_tile_dict = {idx: sz for idx, sz in enumerate(tile_sizes)} + sorted_dims = sorted(dim_tile_dict.items(), key=lambda x: x[1], reverse=True) + # Keep top 4 dimensions, exclude the rest + excluded_dims = [idx for idx, _ in sorted_dims[max_tiled_dims:]] + for idx in excluded_dims: + tile_sizes[idx] = 1 + # Calculate input tile sizes input_tile_sizes_dim = self._calculate_input_tile_sizes( kernel, input_sizes, tile_sizes, num_inputs, rank ) buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes) input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors( - kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y + kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y, + excluded_dims=excluded_dims ) input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions( - input_nodes, input_sizes, output_strides, rank, num_inputs + input_nodes, input_sizes, output_strides, rank, num_inputs, + excluded_dims=excluded_dims ) # Map unique buffer names to their tile descriptors for template @@ -203,9 +216,12 @@ def _build_buffer_mapping(self, input_nodes): return buffer_name_to_template_name, input_buffer_names def _build_tile_descriptors( - self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node + self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node, excluded_dims=None ): """Build tile descriptors for each input and output.""" + if excluded_dims is None: + excluded_dims = set() + input_tile_descs = [] output_tile_descs = [] unique_tile_descs = {} @@ -217,16 +233,21 @@ def _build_tile_descriptors( tile_size_idx = 0 for d in range(rank): if d != self.dim: - full_tile_sizes.append(tile_sizes[tile_size_idx]) 
+ # Skip excluded dimensions + if tile_size_idx not in excluded_dims: + full_tile_sizes.append(tile_sizes[tile_size_idx]) tile_size_idx += 1 else: full_tile_sizes.append(input_tile_sizes_dim[i]) + # Calculate vlane_split_axis for reduced dimensions + vlane_split_axis = len(full_tile_sizes) - 1 + # Input tile descriptor input_tile_desc = mlir_common.MLIRMultiDimTile( full_tile_sizes, kernel.vector_lane, - vlane_split_axis=rank - 1, + vlane_split_axis=vlane_split_axis, vlane_stride=1 ) input_tile_desc.set_tile_size(full_tile_sizes) @@ -239,7 +260,7 @@ def _build_tile_descriptors( output_tile_desc = mlir_common.MLIRMultiDimTile( full_tile_sizes, kernel.vector_lane, - vlane_split_axis=rank - 1, + vlane_split_axis=vlane_split_axis, vlane_stride=1 ) output_tile_desc.set_tile_size(full_tile_sizes) @@ -255,9 +276,12 @@ def _build_tile_descriptors( return input_tile_descs, output_tile_descs, unique_tile_descs def _build_index_expressions( - self, input_nodes, input_sizes, output_strides, rank, num_inputs + self, input_nodes, input_sizes, output_strides, rank, num_inputs, excluded_dims=None ): """Build index expressions for input and output.""" + if excluded_dims is None: + excluded_dims = set() + input_idxs = [] output_idxs = [] cumulative_offsets = [0] @@ -274,15 +298,21 @@ def _build_index_expressions( input_idx = [] output_idx = [] + tile_size_idx = 0 for d in range(rank): if d != self.dim: - input_idx_symbol = sympy.Symbol(f"index{d}") - output_idx_symbol = sympy.Symbol(f"index{d}") + # Skip excluded dimensions + if tile_size_idx not in excluded_dims: + input_idx_symbol = sympy.Symbol(f"index{d}") + output_idx_symbol = sympy.Symbol(f"index{d}") + input_idx.append(input_idx_symbol * x_stride[d]) + output_idx.append(output_idx_symbol * output_strides[d]) + tile_size_idx += 1 else: input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}") output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}") - input_idx.append(input_idx_symbol * x_stride[d]) - 
output_idx.append(output_idx_symbol * output_strides[d]) + input_idx.append(input_idx_symbol * x_stride[d]) + output_idx.append(output_idx_symbol * output_strides[d]) input_idxs.append(input_idx) output_idxs.append(output_idx) From 4e4300e2cda61dcc5eeec103c91fe5ef13ff3a73 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Mar 2026 20:22:10 +0900 Subject: [PATCH 118/194] [Template/Cat] Refactor cat + Support explicit dram+stride in def_dma_op --- .github/workflows/pytorchsim_test.yml | 21 + PyTorchSimFrontend/mlir/mlir_cat_template.py | 401 ++++++++++--------- PyTorchSimFrontend/mlir/mlir_template.py | 48 ++- tests/test_cat.py | 16 +- 4 files changed, 288 insertions(+), 198 deletions(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 9589384b..eaaa7e50 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -163,6 +163,27 @@ jobs: -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py + test_cat: + name: Run test_cat.py + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_cat.py + run: | + echo "Running test_cat.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cat.py + test_matmul: name: Run test_matmul.py runs-on: self-hosted diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 2a00ce95..6eb60198 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from 
typing import List, Optional, Set import math import itertools @@ -23,10 +23,12 @@ {%- endfor %} {%- for i in range(NUM_INPUTS) %} // Input tensor{{ i }} - affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} { - %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }}) - {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], OUTPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }} + affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUTS[i].sizes[DIM] }} step {{ INPUTS[i].tile_size_dim }} { + %index{{ DIM }}_{{ i }} = affine.apply affine_map<(d0) -> (d0 + {{ INPUTS[i].cum_offset }})> (%index_local{{ DIM }}_{{ i }}) + %input_dram_offset_{{ i }} = affine.apply {{ INPUTS[i].offset_map }}({{ INPUTS[i].offset_vars }}) + %output_dram_offset_{{ i }} = affine.apply {{ OUTPUTS[i].offset_map }}({{ OUTPUTS[i].offset_vars }}) + {{ kernel.def_dma_op("MVIN", INPUTS[i].dram_name, [], INPUTS[i].tile_desc, indent_size=INDENT_SIZE, dram_stride=INPUTS[i].dram_strides, dram_offset="input_dram_offset_" ~ i) }} + {{ kernel.def_dma_op("MVOUT", "Y", [], OUTPUTS[i].tile_desc, indent_size=INDENT_SIZE, dram_stride=OUTPUTS[i].dram_strides, dram_offset="output_dram_offset_" ~ i) }} } { inner_loop=true } {%- endfor %} @@ -52,81 +54,84 @@ def render( tile_info=None, **kwargs, ): - # Extract info input_nodes = self.input_nodes y = self.output_node - num_inputs = len(self.input_nodes) + num_inputs = len(input_nodes) rank = len(y.get_size()) input_sizes = [x.get_size() for x in input_nodes] - output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim] - output_dim = [dim for dim, sz in enumerate(y.get_size()) if dim != self.dim] - tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes) + output_sizes = [sz for d, sz in 
enumerate(y.get_size()) if d != self.dim] + output_dim = [d for d, _ in enumerate(y.get_size()) if d != self.dim] output_strides = y.get_layout().stride - excluded_dims = list() - max_tiled_dims = 4 - 1 - if len(tile_sizes) > max_tiled_dims: - # Create index:tile_size dictionary and sort by tile_size - dim_tile_dict = {idx: sz for idx, sz in enumerate(tile_sizes)} - sorted_dims = sorted(dim_tile_dict.items(), key=lambda x: x[1], reverse=True) - # Keep top 4 dimensions, exclude the rest - excluded_dims = [idx for idx, _ in sorted_dims[max_tiled_dims:]] - for idx in excluded_dims: - tile_sizes[idx] = 1 - - # Calculate input tile sizes + tile_sizes = list(tile_info) if tile_info is not None else [1] * len(output_sizes) + excluded_dims = self._compute_excluded_dims(tile_sizes) + input_tile_sizes_dim = self._calculate_input_tile_sizes( kernel, input_sizes, tile_sizes, num_inputs, rank ) - buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes) + buffer_name_to_template_name, input_dram_names = self._build_buffer_mapping(input_nodes) input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors( - kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y, - excluded_dims=excluded_dims + kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, + input_dram_names, y, excluded_dims=excluded_dims ) - - input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions( - input_nodes, input_sizes, output_strides, rank, num_inputs, - excluded_dims=excluded_dims + (input_offset_maps, input_offset_var_strs, input_dram_strides, + output_offset_maps, output_offset_var_strs, output_dram_strides, + cumulative_offsets) = self._build_dma_info( + input_nodes, input_sizes, output_strides, input_tile_descs, output_tile_descs, + rank, num_inputs, excluded_dims=excluded_dims ) - # Map unique buffer names to their tile descriptors for template - unique_buffer_tile_descs 
= {} - for actual_name, template_name in buffer_name_to_template_name.items(): - if actual_name in unique_tile_descs: - unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name] - - names_str = ", ".join(input_buffer_names + ["Y"]) + unique_buffer_tile_descs = { + buffer_name_to_template_name[name]: desc + for name, desc in unique_tile_descs.items() + } + names_str = ", ".join(input_dram_names + ["Y"]) indent_size = 2 + (rank - 1) * 2 + 4 + inputs_info = [ + dict( + dram_name = input_dram_names[i], + sizes = input_sizes[i], + tile_size_dim= input_tile_sizes_dim[i], + tile_desc = input_tile_descs[i], + offset_map = input_offset_maps[i], + offset_vars = input_offset_var_strs[i], + dram_strides = input_dram_strides[i], + cum_offset = cumulative_offsets[i], + ) + for i in range(num_inputs) + ] + outputs_info = [ + dict( + tile_desc = output_tile_descs[i], + offset_map = output_offset_maps[i], + offset_vars = output_offset_var_strs[i], + dram_strides = output_dram_strides[i], + ) + for i in range(num_inputs) + ] + kernel.render_options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - Y=y, - OUT_DVAR="Y", - NAMES_STR=names_str, - INPUT_NAMES=input_nodes, - INPUT_BUFFER_NAMES=input_buffer_names, - NUM_INPUTS=num_inputs, - RANK=rank, - DIM=self.dim, - INPUT_SIZES=input_sizes, - OUTPUT_SIZES=output_sizes, - OUTPUT_DIM=output_dim, - TILE_SIZES=tile_sizes, - INPUT_TILE_SIZES_DIM=input_tile_sizes_dim, - INPUT_TILE_DESCS=input_tile_descs, - OUTPUT_TILE_DESCS=output_tile_descs, - UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs, - INPUT_IDXS=input_idxs, - OUTPUT_IDXS=output_idxs, - CUMULATIVE_OFFSETS=cumulative_offsets, - INDENT_SIZE=indent_size, - input_reorder=self.input_reorder, + KERNEL_NAME = self.name, + kernel = kernel, + NUM_INPUTS = num_inputs, + NAMES_STR = names_str, + Y = y, + INPUT_NAMES = input_nodes, + RANK = rank, + DIM = self.dim, + OUTPUT_SIZES = output_sizes, + OUTPUT_DIM = output_dim, + TILE_SIZES = tile_sizes, + UNIQUE_BUFFER_TILE_DESCS = 
unique_buffer_tile_descs, + INPUTS = inputs_info, + OUTPUTS = outputs_info, + INDENT_SIZE = indent_size, + input_reorder = self.input_reorder, ) - code = self._template_from_string(TEMPLATE).render(**kernel.render_options) - return code + return self._template_from_string(TEMPLATE).render(**kernel.render_options) def get_tile_candidates( self, @@ -141,179 +146,217 @@ def get_tile_candidates( y = self.output_node num_inputs = len(self.input_nodes) - output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim] - num_non_dim_dims = len(output_sizes) + output_sizes = [sz for d, sz in enumerate(y.get_size()) if d != self.dim] - if num_non_dim_dims == 0: + if not output_sizes: return [[1]] - tile_candidates = [] - dim_tile_candidates = [] + max_tile_total = kernel.spad_info["spad_size"] // ( + kernel.vector_lane * kernel.precision * 2 * num_inputs + ) + dim_tile_candidates = [] for dim_size in output_sizes: - dim_candidates = [] - max_tile = min(dim_size, kernel.spad_info["spad_size"] // (kernel.vector_lane * kernel.precision * 2 * num_inputs)) - + max_tile = min(dim_size, max_tile_total) + candidates = set() for mult in range(1, max_tile // kernel.vector_lane + 1): - tile = mult * kernel.vector_lane - if tile <= dim_size: - dim_candidates.append(tile) - + t = mult * kernel.vector_lane + if t <= dim_size: + candidates.add(t) if max_tile > 0: for exp in range(int(math.log2(max_tile)) + 1): - tile = 2 ** exp - if tile <= dim_size and tile not in dim_candidates: - dim_candidates.append(tile) - - if dim_size not in dim_candidates: - dim_candidates.append(dim_size) - - dim_tile_candidates.append(sorted(set(dim_candidates))[:5]) - - for tile_combo in itertools.product(*dim_tile_candidates): - total_elements = math.prod(tile_combo) - total_spad_needed = total_elements * (num_inputs + 1) * kernel.precision - - if total_spad_needed <= kernel.spad_info["spad_size"] * kernel.vector_lane: - tile_candidates.append(list(tile_combo)) + t = 2 ** exp + if t <= dim_size: 
+ candidates.add(t) + candidates.add(dim_size) + dim_tile_candidates.append(sorted(candidates)[:5]) + + tile_candidates = [ + list(combo) + for combo in itertools.product(*dim_tile_candidates) + if math.prod(combo) * (num_inputs + 1) * kernel.precision + <= kernel.spad_info["spad_size"] * kernel.vector_lane + ] if not tile_candidates: - tile_candidates = [[1] * num_non_dim_dims] + tile_candidates = [[1] * len(output_sizes)] tile_candidates.sort(key=lambda x: -math.prod(x)) return tile_candidates[:4] - def _calculate_input_tile_sizes( - self, kernel, input_sizes, tile_sizes, num_inputs, rank - ): - """Calculate tile sizes for concat dimension for each input.""" + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _compute_excluded_dims(self, tile_sizes: list) -> list: + """Return non-tiled dimension indices when rank exceeds the 4-dim limit.""" + max_tiled = 3 + if len(tile_sizes) <= max_tiled: + return [] + sorted_dims = sorted(enumerate(tile_sizes), key=lambda x: x[1], reverse=True) + excluded = [idx for idx, _ in sorted_dims[max_tiled:]] + for idx in excluded: + tile_sizes[idx] = 1 + return excluded + + def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank): + """Calculate tile sizes along the concat dimension for each input.""" non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1 - non_dim_tile_spad = non_dim_tile_elements * kernel.precision max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2 - extra_concat_input = math.ceil(max_spad_per_input / non_dim_tile_spad) - num_inputs + extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * kernel.precision)) - num_inputs input_tile_sizes_dim = [] for i in range(num_inputs): - input_dim_size = input_sizes[i][self.dim] - if extra_concat_input > 0 and non_dim_tile_elements > 0: - max_tile_dim = min(input_dim_size, 
extra_concat_input) - extra_concat_input -= max_tile_dim + if extra_concat > 0 and non_dim_tile_elements > 0: + tile_dim = min(input_sizes[i][self.dim], extra_concat) + extra_concat -= tile_dim else: - max_tile_dim = 1 - input_tile_sizes_dim.append(max_tile_dim) + tile_dim = 1 + input_tile_sizes_dim.append(tile_dim) return input_tile_sizes_dim def _build_buffer_mapping(self, input_nodes): - """Map actual buffer names to template buffer names """ - buffer_name_to_template_name = {} - input_buffer_names = [] + """Map actual buffer names to short template names (X0, X1, ...).""" + name_map = {} + template_names = [] for x in input_nodes: - actual_name = x.get_name() - template_name = buffer_name_to_template_name.setdefault( - actual_name, f"X{len(buffer_name_to_template_name)}" - ) - input_buffer_names.append(template_name) - return buffer_name_to_template_name, input_buffer_names + actual = x.get_name() + template = name_map.setdefault(actual, f"X{len(name_map)}") + template_names.append(template) + return name_map, template_names def _build_tile_descriptors( - self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node, excluded_dims=None + self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, + input_buffer_names, output_node, excluded_dims=None ): - """Build tile descriptors for each input and output.""" + """Build tile descriptors for every input (and its paired output).""" if excluded_dims is None: excluded_dims = set() - input_tile_descs = [] - output_tile_descs = [] - unique_tile_descs = {} + def make_tile_desc(tile_sz, vector_lane, name, offset): + desc = mlir_common.MLIRMultiDimTile( + tile_sz, vector_lane, + vlane_split_axis=len(tile_sz) - 1, + vlane_stride=1 + ) + desc.set_tile_size(tile_sz) + desc.set_name(name) + desc.offset = offset + return desc + output_offset = output_node.get_layout().offset + input_tile_descs, output_tile_descs, unique_tile_descs = [], [], {} for i, x in 
enumerate(input_nodes): - x_offset = x.get_layout().offset - full_tile_sizes = [] - tile_size_idx = 0 + # Collect tile sizes for tiled dimensions only (skip excluded non-concat dims) + tile_sz = [] + tile_idx = 0 for d in range(rank): if d != self.dim: - # Skip excluded dimensions - if tile_size_idx not in excluded_dims: - full_tile_sizes.append(tile_sizes[tile_size_idx]) - tile_size_idx += 1 + if tile_idx not in excluded_dims: + tile_sz.append(tile_sizes[tile_idx]) + tile_idx += 1 else: - full_tile_sizes.append(input_tile_sizes_dim[i]) + tile_sz.append(input_tile_sizes_dim[i]) - # Calculate vlane_split_axis for reduced dimensions - vlane_split_axis = len(full_tile_sizes) - 1 + sram_name = f"{input_buffer_names[i].lower()}_cat_tile" + input_tile_descs.append(make_tile_desc(tile_sz, kernel.vector_lane, sram_name, x.get_layout().offset)) + output_tile_descs.append(make_tile_desc(tile_sz, kernel.vector_lane, sram_name, output_offset)) - # Input tile descriptor - input_tile_desc = mlir_common.MLIRMultiDimTile( - full_tile_sizes, - kernel.vector_lane, - vlane_split_axis=vlane_split_axis, - vlane_stride=1 - ) - input_tile_desc.set_tile_size(full_tile_sizes) - template_buffer_name = input_buffer_names[i] - input_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") - input_tile_desc.offset = x_offset - input_tile_descs.append(input_tile_desc) - - # Output tile descriptor (same as input but with output offset) - output_tile_desc = mlir_common.MLIRMultiDimTile( - full_tile_sizes, - kernel.vector_lane, - vlane_split_axis=vlane_split_axis, - vlane_stride=1 - ) - output_tile_desc.set_tile_size(full_tile_sizes) - output_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile") - output_tile_desc.offset = output_offset - output_tile_descs.append(output_tile_desc) - - # Store unique tile desc by actual buffer name actual_name = x.get_name() if actual_name not in unique_tile_descs: - unique_tile_descs[actual_name] = input_tile_desc + unique_tile_descs[actual_name] = 
input_tile_descs[-1] return input_tile_descs, output_tile_descs, unique_tile_descs - def _build_index_expressions( - self, input_nodes, input_sizes, output_strides, rank, num_inputs, excluded_dims=None + def _build_dma_info( + self, input_nodes, input_sizes, output_strides, + input_tile_descs, output_tile_descs, + rank, num_inputs, excluded_dims=None ): - """Build index expressions for input and output.""" + """Build per-input DRAM offset affine maps and tile strides. + + Three stride concepts are maintained: + + * layout_strides (internal) - raw DRAM buffer strides for every rank + dimension, used to compute the flat base-address affine map. + These reflect how the tensor is physically laid out in DRAM. + * dram_strides (returned, ``def_dma_op dram_stride=``) - stride in + DRAM per *tiled* dimension (excluded dims removed). The DMA engine + uses these to walk DRAM when loading/storing a tile. + * sram_strides (inside ``def_dma_op``, from tile_desc) - stride in + SRAM per tiled dimension. The DMA engine uses these to place data + into the SRAM tile buffer. 
+ + Returns: + input_offset_maps, input_offset_var_strs, input_dram_strides, + output_offset_maps, output_offset_var_strs, output_dram_strides, + cumulative_offsets + """ if excluded_dims is None: excluded_dims = set() - input_idxs = [] - output_idxs = [] + def make_affine_map(idx_syms, strides, layout_offset): + terms = [] + for j, s in enumerate(strides): + s = int(s) + if s == 1: + terms.append(f"d{j}") + elif s != 0: + terms.append(f"d{j} * {s}") + try: + off = int(layout_offset) + except (TypeError, ValueError): + off = 0 + if off: + terms.append(str(off)) + dim_str = ", ".join(f"d{j}" for j in range(len(idx_syms))) + return f"affine_map<({dim_str}) -> ({' + '.join(terms) if terms else '0'})>" + cumulative_offsets = [0] for i in range(num_inputs - 1): cumulative_offsets.append(cumulative_offsets[-1] + input_sizes[i][self.dim]) + input_offset_maps, input_offset_var_strs, input_dram_strides = [], [], [] + output_offset_maps, output_offset_var_strs, output_dram_strides = [], [], [] + for i, x in enumerate(input_nodes): x_stride = x.get_layout().stride - x_offset = x.get_layout().offset if hasattr(x, 'data') and hasattr(x.data, 'dims'): - # In case of PermuteView, the stride is permuted - perm_dims = x.data.dims - x_stride = [x_stride[perm_dims[d]] for d in range(rank)] + # PermuteView: re-order strides according to the permutation + perm = x.data.dims + x_stride = [x_stride[perm[d]] for d in range(rank)] + + in_syms, in_layout_strides, in_dram_strides = [], [], [] + out_syms, out_layout_strides, out_dram_strides = [], [], [] + tile_idx = 0 - input_idx = [] - output_idx = [] - tile_size_idx = 0 for d in range(rank): if d != self.dim: - # Skip excluded dimensions - if tile_size_idx not in excluded_dims: - input_idx_symbol = sympy.Symbol(f"index{d}") - output_idx_symbol = sympy.Symbol(f"index{d}") - input_idx.append(input_idx_symbol * x_stride[d]) - output_idx.append(output_idx_symbol * output_strides[d]) - tile_size_idx += 1 + 
in_syms.append(sympy.Symbol(f"index{d}")) + in_layout_strides.append(int(x_stride[d])) + out_syms.append(sympy.Symbol(f"index{d}")) + out_layout_strides.append(int(output_strides[d])) + if tile_idx not in excluded_dims: + in_dram_strides.append(int(x_stride[d])) + out_dram_strides.append(int(output_strides[d])) + tile_idx += 1 else: - input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}") - output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}") - input_idx.append(input_idx_symbol * x_stride[d]) - output_idx.append(output_idx_symbol * output_strides[d]) - input_idxs.append(input_idx) - output_idxs.append(output_idx) - - return input_idxs, output_idxs, cumulative_offsets + in_syms.append(sympy.Symbol(f"index_local{self.dim}_{i}")) + in_layout_strides.append(int(x_stride[d])) + out_syms.append(sympy.Symbol(f"index{self.dim}_{i}")) + out_layout_strides.append(int(output_strides[d])) + in_dram_strides.append(int(x_stride[d])) + out_dram_strides.append(int(output_strides[d])) + + input_offset_maps.append(make_affine_map(in_syms, in_layout_strides, input_tile_descs[i].offset)) + input_offset_var_strs.append(", ".join(f"%{s}" for s in in_syms)) + input_dram_strides.append(in_dram_strides) + + output_offset_maps.append(make_affine_map(out_syms, out_layout_strides, output_tile_descs[i].offset)) + output_offset_var_strs.append(", ".join(f"%{s}" for s in out_syms)) + output_dram_strides.append(out_dram_strides) + + return (input_offset_maps, input_offset_var_strs, input_dram_strides, + output_offset_maps, output_offset_var_strs, output_dram_strides, + cumulative_offsets) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 7c52bfe6..9cc79e0a 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -809,12 +809,18 @@ def hook(): return key def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile, - subtile_size:list=[], 
async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True): + subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True, + dram_stride:list=None, dram_offset=None): + # Todo. Remove legacy behavior (i.e., index_list parsing) def generate_dma_code(): """Internal method to generate DMA code directly.""" local_code = IndentedBuffer() with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse): - index_var = self.parse_index_list(index_list, offset=tile_desc.offset) + if dram_offset is not None: + # Use explicitly provided offset (pre-computed MLIR SSA variable name) + index_var = dram_offset + else: + index_var = self.parse_index_list(index_list, offset=tile_desc.offset) node_layout = self.named_nodes[dram_var].get_layout() if dram_var in self.exception_nodes: numel = self.exception_nodes[dram_var]["numel"] @@ -822,27 +828,33 @@ def generate_dma_code(): numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel() mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype] dram_shape = f"memref<{numel}x{mlir_dtype}>" - dram_stride = [] - for idx in index_list: - if idx.is_Mul: - dram_stride.append(int(idx.args[0])) - elif idx == sympy.Symbol("c0"): - dram_stride.append(0) - elif not idx.is_Number: - dram_stride.append(1) - else: - dram_stride.append(0) - sram_var = tile_desc.get_name() - tile_shape = tile_desc.get_mlir_shape(mlir_dtype) - tile_stride = tile_desc.get_tile_stride() - vlane_split_axis = tile_desc.vmap.vlane_split_axis - vlane_stride = tile_desc.vmap.vlane_stride + if dram_stride is not None: + # Use explicitly provided dram_stride + _dram_stride = dram_stride + else: + # Extract dram_stride from index_list (legacy behavior) + _dram_stride = [] + for idx in index_list: + if idx.is_Mul: + _dram_stride.append(int(idx.args[0])) + elif idx == sympy.Symbol("c0"): + _dram_stride.append(0) + elif not idx.is_Number: + _dram_stride.append(1) + else: + _dram_stride.append(0) + + 
sram_var = tile_desc.get_name() + tile_shape = tile_desc.get_mlir_shape(mlir_dtype) + sram_strides = tile_desc.get_tile_stride() + vlane_split_axis = tile_desc.vmap.vlane_split_axis + vlane_stride = tile_desc.vmap.vlane_stride zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) - attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"] + attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", "padding=0"] if subtile_size: attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}") attribute = " {" + ", ".join(attribute_parts) + "}" diff --git a/tests/test_cat.py b/tests/test_cat.py index 62de6759..97fcc754 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -150,13 +150,25 @@ def cat_4d_three_inputs_fn(a, b, c): cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=1) _test_result("cat.4d.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4) +def test_cat_5d(device, dim=0): + def cat_5d_fn(a, b): + return torch.cat([a, b], dim=dim) + + x = torch.randn(2, 3, 4, 5, 6, device=device) + y = torch.randn(3, 3, 4, 5, 6, device=device) + opt_fn = torch.compile(dynamic=False)(cat_5d_fn) + + out = opt_fn(x, y) + + cpu_out = torch.cat([x.cpu(), y.cpu()], dim=dim) + _test_result("cat.5d.dim0", out, cpu_out, rtol=1e-4, atol=1e-4) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run cat simulation tests") parser.add_argument( "--case", choices=[ - "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3", + "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3", "5d" "three_inputs", "four_inputs", "4d_three_inputs", "all" ], default="all", @@ -184,3 +196,5 @@ def cat_4d_three_inputs_fn(a, b, c): test_cat_four_inputs(device) if args.case in ("4d_three_inputs", "all"): test_cat_4d_three_inputs(device) + if args.case in ("5d", "all"): + test_cat_5d(device) From 
3d9cb387b2ba27853efb983241fa4450c3174d9d Mon Sep 17 00:00:00 2001 From: jung-min Date: Thu, 5 Mar 2026 11:45:36 +0000 Subject: [PATCH 119/194] [Frontend/template] Connect SDPA template to NPU using Torch OpenReg --- PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp | 34 +--- PyTorchSimDevice/csrc/aten/native/Extra.cpp | 51 +---- .../torch_openreg/openreg/__init__.py | 4 +- PyTorchSimFrontend/mlir/mlir_lowering.py | 14 +- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 186 +----------------- 5 files changed, 14 insertions(+), 275 deletions(-) diff --git a/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp index 04ba6d48..f048f878 100644 --- a/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp +++ b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -40,36 +41,6 @@ void wrapper_quantize_tensor_per_tensor_affine_stub( rtensor, qtensor, scale, zero_point); } -std::tuple< - at::Tensor, - at::Tensor, - at::Tensor, - at::Tensor, - c10::SymInt, - c10::SymInt, - at::Tensor, - at::Tensor, - at::Tensor> -wrapper__scaled_dot_product_fused_attention_overrideable( - const at::Tensor& query, - const at::Tensor& key, - const at::Tensor& value, - const std::optional& attn_bias, - double dropout_p, - bool is_causal, - bool return_debug_mask, - std::optional scale) { - return at::native::openreg::_scaled_dot_product_fused_attention_overrideable( - query, - key, - value, - attn_bias, - dropout_p, - is_causal, - return_debug_mask, - scale); -} - std::tuple wrapper_scaled_dot_product_fused_attention_overrideable_backward( const at::Tensor& grad_out, @@ -172,9 +143,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) { m.impl("abs.out", &wrapper_abs_out); m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor); m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice); - m.impl( - "_scaled_dot_product_fused_attention_overrideable", - &wrapper__scaled_dot_product_fused_attention_overrideable); m.impl( 
"_scaled_dot_product_fused_attention_overrideable_backward", &wrapper_scaled_dot_product_fused_attention_overrideable_backward); diff --git a/PyTorchSimDevice/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp index 711d114c..aaf28e1a 100644 --- a/PyTorchSimDevice/csrc/aten/native/Extra.cpp +++ b/PyTorchSimDevice/csrc/aten/native/Extra.cpp @@ -19,7 +19,8 @@ int64_t _fused_sdp_choice( bool is_causal, std::optional scale, bool enable_gqa) { - auto backend = sdp::SDPBackend::math; + + auto backend = sdp::SDPBackend::overrideable; return static_cast(backend); } @@ -29,54 +30,6 @@ void quantize_tensor_per_tensor_affine_stub( double scale, int64_t zero_point) {} -std::tuple< - at::Tensor, - at::Tensor, - at::Tensor, - at::Tensor, - c10::SymInt, - c10::SymInt, - at::Tensor, - at::Tensor, - at::Tensor> -_scaled_dot_product_fused_attention_overrideable( - const at::Tensor& query, - const at::Tensor& key, - const at::Tensor& value, - const std::optional& attn_bias, - double dropout_p, - bool is_causal, - bool return_debug_mask, - std::optional scale) { - const int64_t batch_size = query.size(0); - const int64_t num_heads = query.size(1); - const int64_t head_dim_v = value.size(3); - const int64_t max_seqlen_q = query.size(2); - const int64_t max_seqlen_kv = key.size(2); - - auto opts = query.options(); - auto output = - at::empty({batch_size, num_heads, max_seqlen_q, head_dim_v}, opts); - auto logsumexp = - at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); - auto debug_attn_mask = at::empty( - {batch_size, num_heads, max_seqlen_q, max_seqlen_kv}, - opts.dtype(at::kFloat)); - auto philox_seed = at::empty({}, at::dtype(at::kLong)); - auto philox_offset = at::empty({}, at::dtype(at::kLong)); - - return std::make_tuple( - output, - logsumexp, - at::Tensor(), - at::Tensor(), - max_seqlen_q, - max_seqlen_kv, - philox_seed, - philox_offset, - debug_attn_mask); -} - std::tuple _scaled_dot_product_fused_attention_overrideable_backward( 
const at::Tensor& grad_out, diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index 5a0de6c3..9d10f90e 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -66,8 +66,8 @@ def _lazy_init(): return # Replace the global C++ binding with our custom dispatcher patch - from PyTorchSimFrontend.mlir.mlir_sdpa_template import patched_scaled_dot_product_attention - torch._C._nn.scaled_dot_product_attention = patched_scaled_dot_product_attention + # from PyTorchSimFrontend.mlir.mlir_sdpa_template import patched_scaled_dot_product_attention + # torch._C._nn.scaled_dot_product_attention = patched_scaled_dot_product_attention torch_openreg._C._init() register_interface_for_device(custom_device(), ExtensionDeviceInterface) diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index e09dcf57..a6b2478c 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -15,7 +15,7 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args +from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args, calculate_scale from PyTorchSimFrontend import extension_config aten = torch.ops.aten @@ -44,14 +44,16 @@ def tuned_flash_sdpa( query : TensorBox, key : TensorBox, value : TensorBox, - scale : float, + attn_bias : Optional[TensorBox] = None, dropout_p : float = 0.0, is_causal : bool = False, - return_debug_mask : bool =False) -> tuple: + return_debug_mask : bool = False, + scale : Optional[float] = None) -> tuple: - print("Enter tuned_flash_sdpa") - 
+ + scale = calculate_scale(query, scale) N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value) + mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale) # _scaled_dot_product_flash_attention has to return a tuple which has 9 values @@ -211,4 +213,4 @@ def custom_unsafe_index(x, indices): if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template -lowerings.update({getattr(aten._scaled_dot_product_flash_attention, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_flash_attention.overloads()}) \ No newline at end of file +lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()}) \ No newline at end of file diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index 49c6c6bb..05030f27 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -73,121 +73,6 @@ def flash_sdpa_args( ) return [n, hq, h, l, s, e, ev, layout, query, key, value] - -def validate_sdpa_input( - query : torch.Tensor, - key : torch.Tensor, - value : torch.Tensor, - attn_mask : torch.Tensor = None, - dropout_p : float = 0.0, - is_casual : bool = False, - scale : float = None, - enable_gqa : bool = False) -> None: - """ - Validates input tensors and parameters for Scaled Dot Product Attention (SDPA). 
- This function's logic can be found in: - https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp(504 line) - https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html - """ - - # Tensor class, dtype, and device consistency - # Ensure all primary inputs are torch.Tensors - if not all(isinstance(t, torch.Tensor) for t in [query, key, value]): - raise TypeError( - f"Expected query, key and value to be Tensors, but got " - f"{type(query).__name__}, {type(key).__name__}, and {type(value).__name__}." - ) - - # Check for dtype mismatch - if query.dtype != key.dtype or query.dtype != value.dtype: - raise TypeError( - f"Expected query, key, and value to have the same dtype, " - f"but got {query.dtype}, {key.dtype}, and {value.dtype}." - ) - - # Check for device mismatch (e.g., mixing CPU and NPU) - if query.device != key.device or query.device != value.device: - raise ValueError( - f"Expected query, key, and value to be on the same device, " - f"but got {query.device}, {key.device}, and {value.device}." - ) - - # Shape and dimension validation - # SDPA typically expects 4D (B, H, S, D), but we check for at least 2D here - if any(t.dim() < 2 for t in [query, key, value]): - raise ValueError( - f"Expected query, key, and value to be at least 2D, " - f"but got Q:{query.dim()}D, K:{key.dim()}D, V:{value.dim()}D." 
- ) - - # Attention mask validation - if attn_mask is not None: - if not isinstance(attn_mask, torch.Tensor): - raise TypeError(f"Expected attn_mask to be a Tensor, but got {type(attn_mask).__name__}.") - - # Dtype check: floating point masks must match query dtype; bool masks are also allowed - if attn_mask.dtype.is_floating_point: - if attn_mask.dtype != query.dtype: - raise TypeError(f"Floating point attn_mask must match query dtype ({query.dtype}), but got {attn_mask.dtype}.") - elif attn_mask.dtype != torch.bool: - raise TypeError(f"attn_mask must be floating point or bool, but got {attn_mask.dtype}.") - - # Nested tensor limitation with explicit masking - if query.is_nested or key.is_nested: - raise ValueError("Nested tensors are not supported when an explicit attn_mask is set.") - - # Dropout and causal flag validation (added) - # Dropout probability must be in the range [0, 1) - if not (0.0 <= dropout_p < 1.0): - raise ValueError(f"Expected dropout_p to be in [0, 1), but got {dropout_p}.") - - # Mutual exclusivity: cannot use both explicit mask and causal flag (added) - if is_casual and attn_mask is not None: - raise ValueError("Both attn_mask and is_casual cannot be set at the same time.") - - # Scaling factor validation (added) - if scale is not None and scale <= 0.0: - raise ValueError(f"Expected scale to be a positive number, but got {scale}.") - - # GQA (Grouped Query Attention) constraints (added) - n_head_q = query.size(1) - n_head_k = key.size(1) - n_head_v = value.size(1) - - # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter. - # Instead, the Flash SDPA implementation infers GQA usage by checking if n_head_q != n_head_k. 
- if not enable_gqa and n_head_q != n_head_k: - raise ValueError(f"Query and Key must have the same number of heads when enable_gqa is false (Q:{n_head_q} vs K:{n_head_k}).") - - if enable_gqa: - if n_head_q == n_head_k: - raise ValueError(f"enable_gqa Query and Key ") - - if n_head_k != n_head_v: - raise ValueError(f"Key and Value must have the same number of heads (K:{n_head_k} vs V:{n_head_v}).") - - # Query heads must be an integer multiple of key heads for grouping - if n_head_q % n_head_k != 0: - raise ValueError( - f"Number of query heads ({n_head_q}) must be divisible by " - f"number of key heads ({n_head_k}) for GQA." - ) - -def convert_boolean_attn_mask(attn_mask: torch.Tensor, target_dtype: torch.dtype) -> float: - """ - Equivalent to the C++ 'convert_boolean_attn_mask' function. - Converts a boolean mask to a floating-point mask for SDPA. - """ - - if attn_mask is not None and attn_mask.dtype == torch.bool: - - new_mask = torch.zeros_like(attn_mask, dtype=target_dtype) - minus_inf = torch.finfo(target_dtype).min - new_mask.masked_fill_(attn_mask.logical_not(), minus_inf) - - return new_mask - - return attn_mask def calculate_scale(query: torch.Tensor, scale: float) -> float: """ @@ -195,79 +80,10 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: Otherwise, use the provided scale. """ if scale is None: - return 1.0 / math.sqrt(query.size(-1)) + return 1.0 / math.sqrt(query.layout.size[-1]) else: return scale -def patched_scaled_dot_product_attention( - query_ : torch.Tensor, - key : torch.Tensor, - value : torch.Tensor, - dropout_p : float = 0.0, - is_casual : bool = False, - attn_mask_ : torch.Tensor = None, - scale_ : float = None, - enable_gqa : bool = None, - orig_fn = torch._C._nn.scaled_dot_product_attention) -> torch.Tensor : - """ - Custom patch for Scaled Dot Product Attention (SDPA) to intercept high-level calls. - For NPU devices, it redirects execution to specific ATen kernels based on global flags. 
- For all devices, it maintains parity with the original dispatcher logic found in: - https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp - - This function acts as a custom override that replaces the default PyTorch SDPA implementation, - invoked via 'PyTorchSim/PyTorchSimDevice/torch_openreg/openreg/__init__.py'. - """ - - # Device-specific Dispatching: redirect to specialized kernels if on NPU - if "npu" in str(query_.device): - - validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_casual, scale_, enable_gqa) - attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype) - - # Kernel selection logic: emulate C++ dispatcher priority - # Selection priority(can be changed): flash attention > memory efficient > math (cuDNN is not supported) - aten = torch.ops.aten - scale = calculate_scale(query_, scale_) - - if flash_sdp_enabled(): - # Skip padding query, key and value for alignment. - dispatch_kwargs = { - "dropout_p" : dropout_p, - "is_causal" : is_casual, - "return_debug_mask" : False, - "scale" : scale - } - - out_lse_softmax = aten._scaled_dot_product_flash_attention( - query_, key, value, **dispatch_kwargs - ) - - return out_lse_softmax[0] - elif mem_efficient_sdp_enabled(): - # out_and_lse = aten._scaled_dot_product_efficient_attention(...) 
- # return out_and_lse[0] - raise NotImplementedError("Memory efficient SDPA is not implemented yet.") - else: - dispatch_kwargs = { - "attn_mask" : attn_mask, - "dropout_p" : dropout_p, - "is_causal" : is_casual, - "dropout_mask" : None, - "scale": scale, - "enable_gqa" : enable_gqa - } - - out_lse_softmax = aten._scaled_dot_product_attention_math( - query_, - key, - value, - **dispatch_kwargs) - - return out_lse_softmax[0] - else: - # Fallback: Delegate to the original C++ Dispatcher for other devices - return orig_fn(query_, key, value) FLASH_SDPA_TEMPLATE = r""" // SDPA kernel From 591e8a98cdb7a734f58c3e2afff6b252f5b86bee Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 5 Mar 2026 23:16:40 +0900 Subject: [PATCH 120/194] [Templte/Cat] Apply copy operation when node has view --- PyTorchSimFrontend/mlir/mlir_cat_template.py | 11 +++------- PyTorchSimFrontend/mlir/mlir_lowering.py | 23 +++++++++++++++++--- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 6eb60198..7bee54ac 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -161,14 +161,14 @@ def get_tile_candidates( candidates = set() for mult in range(1, max_tile // kernel.vector_lane + 1): t = mult * kernel.vector_lane - if t <= dim_size: + if t <= dim_size and dim_size % t == 0: candidates.add(t) if max_tile > 0: for exp in range(int(math.log2(max_tile)) + 1): t = 2 ** exp - if t <= dim_size: + if t <= dim_size and dim_size % t == 0: candidates.add(t) - candidates.add(dim_size) + candidates.add(dim_size) # dim_size always divides itself dim_tile_candidates.append(sorted(candidates)[:5]) tile_candidates = [ @@ -322,11 +322,6 @@ def make_affine_map(idx_syms, strides, layout_offset): for i, x in enumerate(input_nodes): x_stride = x.get_layout().stride - if hasattr(x, 'data') and hasattr(x.data, 'dims'): - # PermuteView: re-order strides 
according to the permutation - perm = x.data.dims - x_stride = [x_stride[perm[d]] for d in range(rank)] - in_syms, in_layout_strides, in_dram_strides = [], [], [] out_syms, out_layout_strides, out_dram_strides = [], [], [] tile_idx = 0 diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index d7aee715..e5df4b78 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -1,3 +1,4 @@ +import math from typing import List, Optional, Sequence import torch @@ -205,11 +206,27 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout: def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): if tensors and dim < 0: dim += len(tensors[0].get_size()) - + copy_default_lowering = lowerings.get(aten.copy_.default) + empty_strided_lowering = lowerings.get(aten.empty_strided.default) + new_tensors = [] for t in tensors: t.realize() - layout = _cat_layout(tensors, dim) - mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim) + # If the tensor is backed by a view (ReinterpretView, PermuteView, etc.), + # materialise it into a fresh contiguous FixedLayout buffer so the cat + # kernel always receives plain, dense strides. 
+ if isinstance(t.data, ir.BaseView): + sizes = list(t.get_size()) + strides = [math.prod(sizes[i + 1:]) for i in range(len(sizes))] + new_buf = empty_strided_lowering( + sizes, strides, dtype=t.get_dtype(), device=t.get_device() + ) + tt = copy_default_lowering(new_buf, t) + else: + tt = t + new_tensors.append(tt) + + layout = _cat_layout(new_tensors, dim) + mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim) return mlir_template.generate().output_node() def _custom_sort_values_impl( From dab34954d61d5558658684dcb1415fa75c3c6935 Mon Sep 17 00:00:00 2001 From: jung-min Date: Sat, 7 Mar 2026 10:11:57 +0000 Subject: [PATCH 121/194] [Refactor] Refactored TopK test code for the OpenReg device --- tests/test_topk.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/test_topk.py b/tests/test_topk.py index c8565310..caf56779 100644 --- a/tests/test_topk.py +++ b/tests/test_topk.py @@ -31,21 +31,11 @@ def topk_fn(a): opt_topk = torch.compile(dynamic=False)(topk_fn) res_values, res_indices = opt_topk(x) - ref_values, ref_indices = torch.topk(x.cpu(), k, dim=dim, largest=largest, sorted=sorted) test_result("TopK/values", res_values, ref_values) test_result("TopK/indices", res_indices, ref_indices) if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape") - parser.add_argument('--shape', type=str, default="(512,768)") - args = parser.parse_args() - shape = tuple(map(int, args.shape.strip('()').split(','))) - - from Scheduler.scheduler import ExecutionEngine - module = ExecutionEngine.setup_device() - device = module.custom_device() + device = torch.device('npu:0') test_topk(device, (128, 128), k=2, dim=-1) \ No newline at end of file From a15f5d2128429c5fa9580e8eb2b1f625a55f054d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 11 Mar 2026 11:03:29 +0900 Subject: [PATCH 122/194] [Template/Sort] Add template code for Bitonic sort --- 
PyTorchSimFrontend/mlir/mlir_lowering.py | 133 +--- PyTorchSimFrontend/mlir/mlir_ops.py | 76 ++- PyTorchSimFrontend/mlir/mlir_sort_template.py | 627 ++++++++++++------ tests/test_sort.py | 128 ++-- 4 files changed, 591 insertions(+), 373 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index e5df4b78..36e9955b 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -17,13 +17,11 @@ from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate -from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate +from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate from PyTorchSimFrontend import extension_config aten = torch.ops.aten aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") -_orig_cat_default_lowering = lowerings.get(aten.cat.default) -_orig_cat_out_lowering = lowerings.get(aten.cat.out) _orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable) def tuned_mm(mat1, mat2, * ,layout=None): @@ -229,48 +227,35 @@ def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim) return mlir_template.generate().output_node() -def _custom_sort_values_impl( - self: TensorBox, +def custom_sort_default( + value: TensorBox, dim: int = -1, descending: bool = False, - values: Optional[TensorBox] = None, - indices: Optional[TensorBox] = None, stable: Optional[bool] = None, ): - if values is None or indices is None: - raise RuntimeError("sort.values* lowering requires both out tensors: values, indices") + if dim < 0: + dim += len(value.get_size()) - def _normalize_dim(rank: int, d: int) -> int: - return d + rank if d < 0 else d + 
value.realize() - if not hasattr(self, "get_size"): - raise RuntimeError("sort.values* lowering requires TensorBox input") - - rank = len(self.get_size()) - norm_dim = _normalize_dim(rank, dim) - if norm_dim < 0 or norm_dim >= rank: - raise RuntimeError(f"sort.values* dim out of range: dim={dim}, rank={rank}") - if rank != 2: - raise RuntimeError(f"sort.values* lowering currently supports rank-2 only, got rank={rank}") - if norm_dim not in (0, 1): - raise RuntimeError(f"sort.values* lowering currently supports dim in {{0,1}} only, got dim={norm_dim}") - - self.realize() - if isinstance(values, TensorBox): - values.realize() - if isinstance(indices, TensorBox): - indices.realize() - - value_layout, _ = _sort_layouts(self, norm_dim, descending) - mlir_template = MLIRSortTemplate( - [self], + value_layout, index_layout = _sort_layouts(value, dim, descending) + empty_strided_lowering = lowerings.get(aten.empty_strided.default) + indices = empty_strided_lowering( + value.get_size(), + index_layout.stride, + dtype=torch.int64, + device=value.get_device(), + ) + stable_required = True if stable is None else stable + sort_template_cls = MLIRStableSortTemplate if stable_required else MLIRSortTemplate + mlir_template = sort_template_cls( + [value, indices], value_layout, - dim=norm_dim, + dim=dim, descending=descending, - stable=True if stable is None else stable, - indices_node=indices, + stable=stable_required, ) - sorted_values = mlir_template.generate(template_buffer_node=values, epilogue_nodes=[indices]).output_node() + sorted_values = mlir_template.generate(template_buffer_node=value).output_node() return sorted_values, indices @@ -290,78 +275,6 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool): index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride) return value_layout, index_layout - -def custom_sort_stable( - self: TensorBox, - *, - stable: Optional[bool] = None, - dim: int = -1, - descending: bool = False, -): - 
empty_strided_lowering = lowerings.get(aten.empty_strided.default) - if empty_strided_lowering is None: - if _orig_sort_values_stable_lowering is None: - raise RuntimeError("sort.stable lowering requires aten.empty_strided.default") - return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True) - - rank = len(self.get_size()) if hasattr(self, "get_size") else 0 - norm_dim = dim + rank if dim < 0 else dim - if rank > 0 and (norm_dim < 0 or norm_dim >= rank): - raise RuntimeError(f"sort.stable dim out of range: dim={dim}, rank={rank}") - - # Template specialization supports rank-2 and dim in {0,1}. - if rank == 2 and norm_dim not in (0, 1): - if _orig_sort_values_stable_lowering is None: - raise RuntimeError("Original aten.sort.values_stable lowering is missing") - return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True) - - try: - value_layout, index_layout = _sort_layouts(self, norm_dim, descending) - values = empty_strided_lowering( - list(value_layout.size), - list(value_layout.stride), - dtype=value_layout.dtype, - device=self.get_device(), - ) - indices = empty_strided_lowering( - list(index_layout.size), - list(index_layout.stride), - dtype=index_layout.dtype, - device=self.get_device(), - ) - return _custom_sort_values_impl( - self=self, - dim=dim, - descending=descending, - values=values, - indices=indices, - stable=True if stable is None else stable, - ) - except Exception: - if _orig_sort_values_stable_lowering is None: - raise - return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=stable) - - -def custom_sort_values_stable( - self: TensorBox, - *, - stable: Optional[bool] = None, - dim: int = -1, - descending: bool = False, - values: Optional[TensorBox] = None, - indices: Optional[TensorBox] = None, -): - return _custom_sort_values_impl( - self=self, - dim=dim, - descending=descending, - values=values, - indices=indices, - stable=stable, - ) - - 
lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()}) lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()}) lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()}) @@ -369,9 +282,7 @@ def custom_sort_values_stable( lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()}) - -lowerings.update({aten.sort.stable: custom_sort_stable}) -lowerings.update({aten.sort.values_stable: custom_sort_values_stable}) +lowerings.update({getattr(aten.sort, overload): custom_sort_default for overload in aten.sort.overloads()}) if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 9edd2e44..ace4f9ea 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -182,7 +182,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs): # Case A: Integer -> Float if src_type_char == "i" and dst_type_char == "f": - op_str = f"arith.sitofp %{operand} : {src_shape} to {shape}" + op_str = f"arith.uitofp %{operand} : {src_shape} to {shape}" # Case B: Float -> Integer elif src_type_char == "f" and dst_type_char == "i": op_str = f"arith.fptosi %{operand} : {src_shape} to {shape}" @@ -1142,6 +1142,80 @@ def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_nam line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, 
reduced_shape=final_reduced_shape) return line, [red_size, type_name] + @staticmethod + def vector_shuffle(operand, indices, operand2=None, *args, **kwargs): + tile_size1, dtype1 = V.kernel.var_info[operand] + if operand2 is None: + operand2 = operand + tile_size2, dtype2 = V.kernel.var_info[operand2] + if dtype1 != dtype2: + raise ValueError( + f"vector_shuffle expects same element type, got {dtype1} and {dtype2}" + ) + total_size = tile_size1 + tile_size2 + for idx in indices: + if idx < -1 or idx >= total_size: + raise ValueError( + f"vector_shuffle index out of range: {idx}, expected in [-1, {total_size - 1}]" + ) + vt1 = f"vector<{tile_size1}x{dtype1}>" + vt2 = f"vector<{tile_size2}x{dtype1}>" + idx_str = ", ".join(str(i) for i in indices) + op_str = f"vector.shuffle %{operand}, %{operand2} [{idx_str}]" + return format_mlir_op(op_str, f"{vt1}, {vt2}", **kwargs), [len(indices), dtype1] + + @staticmethod + def constant_mask(select_min, N, *args, **kwargs): + vals = ", ".join("true" if x else "false" for x in select_min) + op_str = f"arith.constant dense<[{vals}]>" + return format_mlir_op(op_str, f"vector<{N}xi1>", **kwargs), [N, "i1"] + + @staticmethod + def bitonic_sort(operand, descending=False, *args, **kwargs): + def _compute_bitonic_stages(N: int, descending: bool): + assert N >= 2 and (N & (N - 1)) == 0, "N must be power-of-2 >= 2" + stages = [] + size = 2 + while size <= N: + stride = size // 2 + while stride >= 1: + merged_shuffle = list(range(N)) + merged_mask = [None] * N + + for start in range(0, N, size): + blk_dir = "ASCENDING" if (start // size) % 2 == 0 else "DESCENDING" + for i in range(start, start + size - stride, stride * 2): + for j in range(stride): + a, b = i + j, i + j + stride + merged_shuffle[a] = b + merged_shuffle[b] = a + if blk_dir == "ASCENDING": + merged_mask[a] = True # a = min + merged_mask[b] = False # b = max + else: + merged_mask[a] = False # a = max + merged_mask[b] = True # b = min + select_min = [bool(x) if x is not None 
else False for x in merged_mask] + if descending: + select_min = [not x for x in select_min] + stages.append({ + "shuffle": merged_shuffle, + "select_min": select_min, + }) + stride //= 2 + size *= 2 + return stages + + tile_size, _ = V.kernel.var_info[operand] + cur = operand + for stage in _compute_bitonic_stages(tile_size, descending): + mask = ops.constant_mask(stage["select_min"], tile_size) + shuffled = ops.vector_shuffle(cur, stage["shuffle"]) + vmin = ops.minimum(cur, shuffled) + vmax = ops.maximum(cur, shuffled) + cur = ops.where(mask, vmin, vmax) + return cur, V.kernel.var_info[cur] + @staticmethod def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, **kwargs): if compute_vec_size == 1: diff --git a/PyTorchSimFrontend/mlir/mlir_sort_template.py b/PyTorchSimFrontend/mlir/mlir_sort_template.py index d12c7570..24b3a460 100644 --- a/PyTorchSimFrontend/mlir/mlir_sort_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sort_template.py @@ -1,130 +1,189 @@ from typing import List, Optional +import contextlib -import sympy -from torch._inductor.ir import IRNode -from torch._inductor.virtualized import V +from torch._inductor.ir import Buffer, IRNode +from torch._inductor.virtualized import _ops as ops +from torch._inductor.codegen import common from PyTorchSimFrontend.mlir import mlir_common from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel +from PyTorchSimFrontend.mlir.mlir_common import LoopLevel + +VECTOR_SIZE = 16 TEMPLATE = r""" {{kernel.def_global_vars()}} +// chunk index -> element index +#map_chunk_to_elem = affine_map<(d0) -> (d0 * {{ VECTOR_SIZE }})> -func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, YI], outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} { - {{ kernel.def_sram_buffer("YI", YI_TILE_DESC, id=1, indent_size=2) }} - {{ kernel.def_sram_buffer(OUT_DVAR, YV_TILE_DESC, id=2, indent_size=2) }} +func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, XI], 
outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} { + {{ kernel.def_sram_buffer("X", X_TILE_DESC, id=0, indent_size=2) }} + {{ kernel.def_sram_buffer("XI", XI_TILE_DESC, id=1, indent_size=2) }} + {{ kernel.def_sram_buffer("YV", YV_TILE_DESC, id=2, indent_size=2) }} {{ kernel.def_local_vars(indent_size=2) }} - %c0 = arith.constant 0 : index - %c_cols = arith.constant {{ COLS }} : index affine.for %sort_block = 0 to 1 step 1 { - // Initialize output value/index buffers. - affine.for %row = 0 to {{ ROWS }} step 1 { - affine.for %col = 0 to {{ COLS }} step 1 { - {{ kernel.def_dma_op("MVIN", "X", INIT_X_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, INIT_YV_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} -{% if DIM == 1 %} - %idx_i64 = arith.index_cast %col : index to {{ YI_ELEM_TYPE }} -{% else %} - %idx_i64 = arith.index_cast %row : index to {{ YI_ELEM_TYPE }} -{% endif %} - memref.store %idx_i64, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", "YI", INIT_YI_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }} - } - } - -{% if DIM == 1 %} - // Stable bubble sort on each row (dim=1). 
- affine.for %row = 0 to {{ ROWS }} step 1 { - affine.for %pass = 0 to {{ COLS }} step 1 { - affine.for %j = 0 to {{ COLS_MINUS1 }} step 1 { - {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} - %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - - {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} - %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - -{% if DESCENDING %} - %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }} -{% else %} - %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }} -{% endif %} - scf.if %need_swap { - memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - {{ kernel.def_dma_op("MVIN", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - - {{ kernel.def_dma_op("MVIN", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - - memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - } - } - } - } -{% else %} - // Stable bubble sort on each column (dim=0). 
- affine.for %col = 0 to {{ COLS }} step 1 { - affine.for %pass = 0 to {{ ROWS }} step 1 { - affine.for %i = 0 to {{ ROWS_MINUS1 }} step 1 { - {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} - %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - - {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }} - %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - -{% if DESCENDING %} - %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }} -{% else %} - %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }} -{% endif %} - scf.if %need_swap { - memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - {{ kernel.def_dma_op("MVIN", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - - {{ kernel.def_dma_op("MVIN", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - - memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - - memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }} - {{ kernel.def_dma_op("MVOUT", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }} - } - } - } - } -{% endif %} + {%- for d in range(RANK-1) %} + affine.for %index{{ OUTPUT_DIM[d] }} = 0 to {{ 
OUTPUT_SIZES[d] }} step {{ STEP_SIZES[d] }} { + {%- endfor %} + + %x_dram_offset = affine.apply {{ X_OFFSET_MAP }}({{ OUTER_VARS }}) + %xi_dram_offset = affine.apply {{ XI_OFFSET_MAP }}({{ OUTER_VARS }}) + %yv_dram_offset = affine.apply {{ YV_OFFSET_MAP }}({{ OUTER_VARS }}) + {{ kernel.def_dma_op("MVIN", "X", [], X_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=X_DRAM_STRIDE, dram_offset="x_dram_offset") }} + + // SIMD local sort + loop-based chunk merge. +{{ BITONIC_BODY }} + + {{ kernel.def_dma_op("MVOUT", "XI", [], XI_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=XI_DRAM_STRIDE, dram_offset="xi_dram_offset") }} + {{ kernel.def_dma_op("MVOUT", "YV", [], YV_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=YV_DRAM_STRIDE, dram_offset="yv_dram_offset") }} + {%- for d in range(RANK-1) %} + } { outer_loop=true } + {%- endfor %} } { outer_loop=true } return } """ +def _make_offset_map(outer_dims, all_strides, layout_offset): + """Build an affine_map over outer-dim loop variables that computes the flat DRAM offset.""" + terms = [] + for j, d in enumerate(outer_dims): + s = int(all_strides[d]) + if s == 1: + terms.append(f"d{j}") + elif s != 0: + terms.append(f"d{j} * {s}") + try: + off = int(layout_offset) + except (TypeError, ValueError): + off = 0 + if off: + terms.append(str(off)) + nd = len(outer_dims) + dim_str = ", ".join(f"d{j}" for j in range(nd)) + expr = " + ".join(terms) if terms else "0" + return f"affine_map<({dim_str}) -> ({expr})>" + + +def _compute_bitonic_stages(n: int, descending: bool): + stages = [] + size = 2 + while size <= n: + stride = size // 2 + while stride >= 1: + merged_shuffle = list(range(n)) + merged_mask = [None] * n + for start in range(0, n, size): + blk_dir = "ASCENDING" if (start // size) % 2 == 0 else "DESCENDING" + for i in range(start, start + size - stride, stride * 2): + for j2 in range(stride): + a, b = i + j2, i + j2 + stride + merged_shuffle[a] = b + merged_shuffle[b] = a + if blk_dir == "ASCENDING": + merged_mask[a] = True 
+ merged_mask[b] = False + else: + merged_mask[a] = False + merged_mask[b] = True + select_min = [bool(x) if x is not None else False for x in merged_mask] + if descending: + select_min = [not x for x in select_min] + stages.append({"shuffle": merged_shuffle, "select_min": select_min}) + stride //= 2 + size *= 2 + return stages + + +def _pair_less_equal(left_v, right_v, left_i, right_i): + cmp_val = ops.lt(left_v, right_v) + cmp_eq = ops.eq(left_v, right_v) + cmp_idx = ops.le(left_i, right_i) + return ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx)) + + +def _pair_greater_equal(left_v, right_v, left_i, right_i): + cmp_val = ops.gt(left_v, right_v) + cmp_eq = ops.eq(left_v, right_v) + cmp_idx = ops.le(left_i, right_i) + return ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx)) + + +def _bitonic_sort_pair(values, indices, vector_size: int, descending: bool, stable_sort: bool): + cur_v = values + cur_i = indices + for stage_desc in _compute_bitonic_stages(vector_size, descending): + mask = ops.constant_mask(stage_desc["select_min"], vector_size) + shuf_v = ops.vector_shuffle(cur_v, stage_desc["shuffle"]) + shuf_i = ops.vector_shuffle(cur_i, stage_desc["shuffle"]) + if stable_sort: + # `cmp` drives the "min side" selection in the bitonic network. + # For descending stable sort, tie elements with smaller original index + # must stay earlier, so the min side should treat larger index as smaller. 
+ if descending: + cmp_val = ops.lt(cur_v, shuf_v) + cmp_eq = ops.eq(cur_v, shuf_v) + cmp_idx = ops.ge(cur_i, shuf_i) + cmp = ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx)) + else: + cmp = _pair_less_equal(cur_v, shuf_v, cur_i, shuf_i) + else: + cmp = ops.le(cur_v, shuf_v) + min_v = ops.where(cmp, cur_v, shuf_v) + min_i = ops.where(cmp, cur_i, shuf_i) + max_v = ops.where(cmp, shuf_v, cur_v) + max_i = ops.where(cmp, shuf_i, cur_i) + cur_v = ops.where(mask, min_v, max_v) + cur_i = ops.where(mask, min_i, max_i) + return cur_v, cur_i + + +def _merge_sorted_pair_vectors( + left_norm, + left_idx_norm, + right_norm, + right_idx_norm, + ascending: bool, + stable_sort: bool, + vector_size: int, + rev_indices, +): + right_pair = ops.vector_shuffle(right_norm, rev_indices, right_norm) + right_idx_pair = ops.vector_shuffle(right_idx_norm, rev_indices, right_idx_norm) + if ascending: + cmp = ( + _pair_less_equal(left_norm, right_pair, left_idx_norm, right_idx_pair) + if stable_sort + else ops.le(left_norm, right_pair) + ) + else: + cmp = ( + _pair_greater_equal(left_norm, right_pair, left_idx_norm, right_idx_pair) + if stable_sort + else ops.ge(left_norm, right_pair) + ) + left_merge = ops.where(cmp, left_norm, right_pair) + left_idx_merge = ops.where(cmp, left_idx_norm, right_idx_pair) + right_merge = ops.where(cmp, right_pair, left_norm) + right_idx_merge = ops.where(cmp, right_idx_pair, left_idx_norm) + return left_merge, left_idx_merge, right_merge, right_idx_merge + + class MLIRSortTemplate(MLIRTemplate): - def __init__(self, input_nodes, layout, dim, descending=False, stable=False, indices_node=None, input_reorder=None): + def __init__(self, input_nodes, layout, dim, descending=False, stable=False, input_reorder=None): super().__init__("kernel", input_nodes, layout, input_reorder) self.dim = dim self.descending = descending self.stable = stable - self.indices_node = indices_node + self.use_stable_sort = False + self.output_nodes = [ + Buffer(name="buf_out_values", 
layout=layout), + ] + self.output_node = self.output_nodes[0] def render( self, @@ -135,119 +194,281 @@ def render( **kwargs, ): if template_buffer_node is not None: + self.output_nodes[0] = template_buffer_node self.output_node = template_buffer_node - if self.indices_node is None: - raise RuntimeError("MLIRSortTemplate requires indices output node") x = self.input_nodes[0] - yv = self.output_node - yi = self.indices_node - - def _as_int(v): - try: - return int(v) - except Exception: - return int(V.graph.sizevars.size_hint(v)) - - x_size = x.get_size() - if len(x_size) != 2: - raise RuntimeError("MLIRSortTemplate currently supports rank-2 input only") - if self.dim not in (0, 1): - raise RuntimeError(f"MLIRSortTemplate currently supports dim in {{0,1}} only, got dim={self.dim}") - - rows = _as_int(x_size[0]) - cols = _as_int(x_size[1]) - cols_minus1 = max(0, cols - 1) - rows_minus1 = max(0, rows - 1) - - x_dtype = x.get_dtype() - yv_dtype = yv.get_dtype() - yi_dtype = yi.get_dtype() - if x_dtype != yv_dtype: - raise RuntimeError("sort template requires input/value dtype match") - - yi_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - yi_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - yi_tile_desc.set_name("yi_sort_tile") - yv_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - yv_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - yv_tile_desc.set_name("yv_sort_tile") - # Neighbor element descriptors use DRAM offset to preserve affine stride metadata. 
- yv_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - yv_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - yv_s1_tile_desc.set_name("yv_sort_tile") - yi_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1) - yi_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1]) - yi_s1_tile_desc.set_name("yi_sort_tile") - if int(self.dim) == 1: - yv_s1_tile_desc.offset = sympy.Integer(1) - yi_s1_tile_desc.offset = sympy.Integer(1) + xi = self.input_nodes[1] + yv = self.output_nodes[0] + # XI is updated in-place by the sort kernel, so mark it as an inout arg. + kernel.kernel_group.args.make_inplace(xi.get_name(), xi.get_name()) + sort_size = int(x.get_size()[self.dim]) + vector_size = VECTOR_SIZE + if sort_size <= 0: + raise NotImplementedError("Sort size must be > 0") + if sort_size < vector_size or sort_size % vector_size != 0: + raise NotImplementedError( + f"Sort size must be a multiple of vector size (sort_size={sort_size}, vector_size={vector_size})" + ) + num_chunks = sort_size // vector_size + if num_chunks & (num_chunks - 1): + raise NotImplementedError( + f"Loop-based bitonic chunk merge requires power-of-two chunk count (num_chunks={num_chunks})" + ) + + # --- N-D generalization: outer loops over all non-sort dims --- + rank = len(x.get_size()) + sort_dim = self.dim if self.dim >= 0 else self.dim + rank + if sort_dim < 0 or sort_dim >= rank: + raise NotImplementedError(f"Invalid sort dim for rank-{rank} tensor (dim={self.dim})") + x_layout = x.get_layout() + xi_layout = xi.get_layout() + yv_layout = yv.get_layout() + + if rank == 1: + # Edge case for 1D tensor + output_sizes = [1] + output_dim = [0] + step_sizes = [1] + tile_sizes = [1, sort_size] + x_dram_stride = [int(x_layout.stride[sort_dim]), int(x_layout.stride[sort_dim])] + xi_dram_stride = [int(xi_layout.stride[sort_dim]), int(xi_layout.stride[sort_dim])] + yv_dram_stride = 
[int(yv_layout.stride[sort_dim]), int(yv_layout.stride[sort_dim])] + template_rank = 2 else: - yv_s1_tile_desc.offset = sympy.Integer(cols) - yi_s1_tile_desc.offset = sympy.Integer(cols) - - row = sympy.Symbol("row") - col = sympy.Symbol("col") - i = sympy.Symbol("i") - j = sympy.Symbol("j") - - init_x_idx = [row * cols, col] - init_yv_idx = [row * cols, col] - init_yi_idx = [row * cols, col] + output_sizes = [sz for d, sz in enumerate(yv.get_size()) if d != sort_dim] + output_dim = [d for d, _ in enumerate(yv.get_size()) if d != sort_dim] + step_sizes = [1] * len(output_sizes) + + tile_dim = max(output_dim, key=lambda d: int(yv.get_size()[d])) + tile_sizes = [min(kernel.vector_lane, int(yv.get_size()[tile_dim])), sort_size] + step_sizes[output_dim.index(tile_dim)] = tile_sizes[0] + + x_dram_stride = [int(x_layout.stride[tile_dim]), int(x_layout.stride[sort_dim])] + xi_dram_stride = [int(xi_layout.stride[tile_dim]), int(xi_layout.stride[sort_dim])] + yv_dram_stride = [int(yv_layout.stride[tile_dim]), int(yv_layout.stride[sort_dim])] + template_rank = rank + + x_offset_map = _make_offset_map(output_dim, x_layout.stride, x_layout.offset) + xi_offset_map = _make_offset_map(output_dim, xi_layout.stride, xi_layout.offset) + yv_offset_map = _make_offset_map(output_dim, yv_layout.stride, yv_layout.offset) + outer_vars = ", ".join(f"%index{d}" for d in output_dim) + + # indent for DMA ops = 2 (inside func) + 2 per outer loop + indent_size = 2 + len(output_dim) * 2 + 4 + + vlane_stride = 1 + vlane_split_axis = 0 + x_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride) + x_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1]) + x_tile_desc.set_name("X_buffer") + x_tile_desc.offset = x_layout.offset + + xi_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride) + xi_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1]) + xi_tile_desc.set_name("XI_buffer") + 
xi_tile_desc.offset = xi_layout.offset + + yv_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride) + yv_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1]) + yv_tile_desc.set_name("YV_buffer") + yv_tile_desc.offset = yv_layout.offset + + data_stype = mlir_common.DTYPE_TO_MLIR[x.get_dtype()] + idx_stype = mlir_common.DTYPE_TO_MLIR[xi.get_dtype()] + + elem_memref_t = f"memref<1x{sort_size}x{data_stype}, 1>" + rev_indices = list(range(vector_size - 1, -1, -1)) + + bitonic_body = mlir_common.ParallelLoopBuffer(initial_indent=2) + bitonic_body.tabwidth = 2 + # 1) Local SIMD sort per chunk. + init_cse = common.CSE(kernel.newvar_prefix, kernel.suffix, name_prefix="sort_init") + with kernel, kernel.override_buffer_cse(buffer=bitonic_body, cse=init_cse): + bitonic_body.writelines(LoopLevel("chunk", num_chunks).lines()) + with bitonic_body.indent(attribute="{inner_loop=true}"): + bitonic_body.writeline("%elem = affine.apply #map_chunk_to_elem(%chunk)") + x_chunk = ops._load( + vector_size, + data_stype, + "X_buffer", + "%t_const0, %elem", + x_tile_desc.get_mlir_shape(data_stype), + ) + idx_step_index = kernel.register_var_cse("idx_step_index", vector_size, "index") + bitonic_body.writeline(f"%{idx_step_index} = vector.step : vector<{vector_size}xindex>") + idx_step = ops.index_cast(idx_step_index, idx_stype) + idx_base = kernel.register_var_cse("idx_base", 1, idx_stype) + bitonic_body.writeline(f"%{idx_base} = arith.index_cast %elem : index to {idx_stype}") + idx_base_vec = ops.broadcast(idx_base, vector_size) + idx_chunk = ops.add(idx_base_vec, idx_step) + yv_chunk, yi_chunk = _bitonic_sort_pair( + x_chunk, idx_chunk, vector_size, descending=self.descending, stable_sort=self.use_stable_sort + ) + ops._store( + yv_chunk, + "YV_buffer", + "%t_const0, %elem", + yv_tile_desc.get_mlir_shape(data_stype), + ) + ops._store( + yi_chunk, + "XI_buffer", + "%t_const0, %elem", + xi_tile_desc.get_mlir_shape(idx_stype), + ) + + # 
2) Chunk-level bitonic merge (loop form). + stage = 0 + k = 2 + while k <= num_chunks: + j = k // 2 + while j >= 1: + for block_start, is_even_block in ((0, True), (k, False)): + if block_start >= num_chunks: + continue + asc_dir = is_even_block if not self.descending else (not is_even_block) + stage_cse = common.CSE(kernel.newvar_prefix, kernel.suffix, name_prefix=f"sort_stage_{stage}") + with kernel, kernel.override_buffer_cse(buffer=bitonic_body, cse=stage_cse): + stage_loops = [ + LoopLevel("base", num_chunks, start=block_start, step=2 * k), + LoopLevel("p", k, step=2 * j), + LoopLevel("q", j), + ] + with contextlib.ExitStack() as stack: + for loop in stage_loops: + bitonic_body.writelines(loop.lines()) + stack.enter_context(bitonic_body.indent(attribute="{inner_loop=true}")) + + bitonic_body.writeline( + f"%left_elem = affine.apply affine_map<(d0, d1, d2) -> ((d0 + d1 + d2) * {vector_size})>(%base, %p, %q)" + ) + bitonic_body.writeline( + f"%right_elem = affine.apply affine_map<(d0, d1, d2) -> ((d0 + d1 + d2 + {j}) * {vector_size})>(%base, %p, %q)" + ) + + left_vec = ops._load( + vector_size, + data_stype, + "YV_buffer", + "%t_const0, %left_elem", + yv_tile_desc.get_mlir_shape(data_stype), + ) + right_vec = ops._load( + vector_size, + data_stype, + "YV_buffer", + "%t_const0, %right_elem", + yv_tile_desc.get_mlir_shape(data_stype), + ) + left_idx = ops._load( + vector_size, + idx_stype, + "XI_buffer", + "%t_const0, %left_elem", + xi_tile_desc.get_mlir_shape(idx_stype), + ) + right_idx = ops._load( + vector_size, + idx_stype, + "XI_buffer", + "%t_const0, %right_elem", + xi_tile_desc.get_mlir_shape(idx_stype), + ) + norm_desc = not asc_dir + left_norm, left_idx_norm = _bitonic_sort_pair( + left_vec, left_idx, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort + ) + right_norm, right_idx_norm = _bitonic_sort_pair( + right_vec, right_idx, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort + ) + left_merge, left_idx_merge, 
right_merge, right_idx_merge = _merge_sorted_pair_vectors( + left_norm, + left_idx_norm, + right_norm, + right_idx_norm, + ascending=asc_dir, + stable_sort=self.use_stable_sort, + vector_size=vector_size, + rev_indices=rev_indices, + ) + left_new, left_idx_new = _bitonic_sort_pair( + left_merge, left_idx_merge, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort + ) + right_new, right_idx_new = _bitonic_sort_pair( + right_merge, right_idx_merge, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort + ) + ops._store( + left_new, + "YV_buffer", + "%t_const0, %left_elem", + yv_tile_desc.get_mlir_shape(data_stype), + ) + ops._store( + right_new, + "YV_buffer", + "%t_const0, %right_elem", + yv_tile_desc.get_mlir_shape(data_stype), + ) + ops._store( + left_idx_new, + "XI_buffer", + "%t_const0, %left_elem", + xi_tile_desc.get_mlir_shape(idx_stype), + ) + ops._store( + right_idx_new, + "XI_buffer", + "%t_const0, %right_elem", + xi_tile_desc.get_mlir_shape(idx_stype), + ) + stage += 1 + j //= 2 + k *= 2 - d1_s0_idx = [row * cols, j] - d1_s1_idx = [row * cols, j] - - d0_s0_idx = [i * cols, col] - d0_s1_idx = [i * cols, col] - - kernel.loop_size = None - numel = rows * cols kernel.render_options = dict( KERNEL_NAME=self.name, + NAMES_STR="X, XI, YV", kernel=kernel, X=x, + XI=xi, YV=yv, - YI=yi, - OUT_DVAR="YV", - NAMES_STR="X, YI, YV", - ROWS=rows, - COLS=cols, - COLS_MINUS1=cols_minus1, - ROWS_MINUS1=rows_minus1, - DIM=int(self.dim), - DESCENDING=bool(self.descending), - YI_TILE_DESC=yi_tile_desc, + X_TILE_DESC=x_tile_desc, + XI_TILE_DESC=xi_tile_desc, YV_TILE_DESC=yv_tile_desc, - YI_S1_TILE_DESC=yi_s1_tile_desc, - YV_S1_TILE_DESC=yv_s1_tile_desc, - INIT_X_IDX=init_x_idx, - INIT_YV_IDX=init_yv_idx, - INIT_YI_IDX=init_yi_idx, - D1_S0_IDX=d1_s0_idx, - D1_S1_IDX=d1_s1_idx, - D0_S0_IDX=d0_s0_idx, - D0_S1_IDX=d0_s1_idx, - YV_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yv_dtype], - YI_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yi_dtype], - 
class MLIRStableSortTemplate(MLIRSortTemplate):
    """Sort template variant that always enables the index-aware (stable) comparisons."""

    def __init__(self, input_nodes, layout, dim, descending=False, stable=True, input_reorder=None):
        base_kwargs = {
            "input_nodes": input_nodes,
            "layout": layout,
            "dim": dim,
            "descending": descending,
            "stable": stable,
            "input_reorder": input_reorder,
        }
        super().__init__(**base_kwargs)
        # Set after the base constructor has run so the flag ends up True
        # regardless of what the base class assigned.
        self.use_stable_sort = True
def test_sort(device, size=(128, 128), dim=-1, descending=False, stable=True):
    """Compile ``torch.sort`` for ``device`` and compare it against the CPU result.

    Args:
        device: Target device for the compiled run.
        size: Shape of the random float32 input.
        dim: Dimension to sort along.
        descending: Sort direction.
        stable: Forwarded to ``torch.sort``; also selects how indices are checked.
    """
    def sort_test(x):
        return torch.sort(x, dim=dim, descending=descending, stable=stable)

    x = torch.randn(size, dtype=torch.float32)
    x_npu = x.to(device=device)

    opt_sort = torch.compile(dynamic=False)(sort_test)
    out_values, out_indices = opt_sort(x_npu)
    ref_values, ref_indices = torch.sort(x, stable=stable, dim=dim, descending=descending)

    prefix = "Sort.stable" if stable else "Sort.unstable"
    test_result(f"{prefix}/values size={size}, dim={dim}, desc={descending}", out_values, ref_values)
    if stable:
        # Indices are integers: compare them exactly, as test_sort_duplicate_cases
        # does, instead of through a relative tolerance that grows with the index value.
        test_equal(f"{prefix}/indices size={size}, dim={dim}, desc={descending}", out_indices, ref_indices)
    else:
        # Unstable sort does not guarantee tie ordering; validate index-value consistency instead.
        gathered = torch.gather(x, dim, out_indices.cpu())
        test_result(f"{prefix}/indices_gather size={size}, dim={dim}, desc={descending}", gathered, out_values.cpu())


def test_sort_stable_suite(device):
    """Run ``test_sort`` with ``stable=True`` over 1-D to 4-D shapes and several sort dims."""
    # Keep sort-axis sizes compatible with backend constraints (vector-size multiple).
    cases = [
        {"size": (64,), "dim": 0, "descending": False},  # 1D
        {"size": (4, 64), "dim": 1, "descending": True},  # 2D, last dim
        {"size": (2, 8, 32), "dim": 2, "descending": False},  # 3D, last dim
        {"size": (2, 16, 4), "dim": 1, "descending": True},  # 3D, middle dim
        {"size": (2, 4, 8, 32), "dim": 3, "descending": False},  # 4D, last dim
        {"size": (4, 2, 32, 8), "dim": 2, "descending": True},  # 4D, inner dim
    ]
    for case in cases:
        test_sort(
            device=device,
            size=case["size"],
            dim=case["dim"],
            descending=case["descending"],
            stable=True,
        )


def test_sort_duplicate_cases(device):
    """Check stable and unstable sort on inputs that contain many repeated values."""
    duplicate_cases = [
        {"size": (64,), "dim": 0, "descending": False},
        {"size": (4, 64), "dim": 1, "descending": True},
        {"size": (2, 8, 32), "dim": 2, "descending": False},
    ]
    for case in duplicate_cases:
        # Values cycle through 0..6 along the sort dimension, so ties are guaranteed.
        base = torch.arange(case["size"][case["dim"]], dtype=torch.int64) % 7
        view_shape = [1] * len(case["size"])
        view_shape[case["dim"]] = case["size"][case["dim"]]
        x = base.view(view_shape).expand(case["size"]).to(torch.float32)
        # The noise is scaled by 0.0, so the duplicates remain exact ties.
        noise = torch.randn(case["size"], dtype=torch.float32) * 0.0
        x = x + noise

        def sort_test(inp):
            return torch.sort(inp, dim=case["dim"], descending=case["descending"], stable=True)

        out_values, out_indices = torch.compile(dynamic=False)(sort_test)(x.to(device=device))
        ref_values, ref_indices = torch.sort(
            x, dim=case["dim"], descending=case["descending"], stable=True
        )
        test_result(f"Sort.dup/stable_values {case}", out_values, ref_values)
        test_equal(f"Sort.dup/stable_indices {case}", out_indices, ref_indices)

        def sort_test_unstable(inp):
            return torch.sort(inp, dim=case["dim"], descending=case["descending"], stable=False)

        out_values_u, out_indices_u = torch.compile(dynamic=False)(sort_test_unstable)(x.to(device=device))
        ref_values_u, _ = torch.sort(x, dim=case["dim"], descending=case["descending"], stable=False)
        test_result(f"Sort.dup/unstable_values {case}", out_values_u, ref_values_u)
        # Tie order is unspecified for the unstable sort, so only check that the
        # returned indices actually point at the returned values.
        gathered_u = torch.gather(x, case["dim"], out_indices_u.cpu())
        test_result(f"Sort.dup/unstable_gather {case}", gathered_u, out_values_u.cpu())
- if args.mode in ("all", "values"): - torch.npu.register_eager_to_compile([ - "aten::sort.values_stable", - ]) - - if args.mode in ("all", "default"): - test_sort_stable(device, size=shape, dim=args.dim, descending=args.descending) - if args.mode in ("all", "values"): - test_sort_values_stable(device, size=shape, dim=args.dim, descending=args.descending) + test_sort_stable_suite(device) + test_sort_duplicate_cases(device) \ No newline at end of file From 752cbb834df7705fe12ec18da281d5b76032034e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 11 Mar 2026 11:55:53 +0900 Subject: [PATCH 123/194] [Template] Use buffer type instead of hard-coded type --- PyTorchSimFrontend/extension_codecache.py | 21 +++------- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 39 +++++++++++-------- .../mlir/mlir_caller_codegen.py | 4 -- PyTorchSimFrontend/mlir/mlir_common.py | 2 +- PyTorchSimFrontend/mlir/mlir_conv_common.py | 6 +++ .../mlir/mlir_conv_mt_template.py | 18 +++++---- .../mlir/mlir_conv_sb_template.py | 18 +++++---- .../mlir/mlir_conv_sbs_template.py | 18 +++++---- PyTorchSimFrontend/mlir/mlir_conv_template.py | 18 +++++---- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 20 +++++++--- Simulator/simulator.py | 3 +- 11 files changed, 92 insertions(+), 75 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index d6b47123..8454dee6 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -67,9 +67,10 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256): f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \ -relocation-model=pic -march=riscv64 -O3 --stack-size-section \ - -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \ + -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \ + -filetype=obj \ {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \ - -O2 {filename}.ll -o {filename}.s + -O2 {filename}.ll -o {filename}.o """, 
).strip()] @@ -109,9 +110,10 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \ -relocation-model=pic -march=riscv64 -O3 --stack-size-section \ - -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \ + -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \ + -filetype=obj \ {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \ - -O2 {sample_filename}.ll -o {sample_filename}.s + -O2 {sample_filename}.ll -o {sample_filename}.o """, ).strip()] @@ -180,17 +182,6 @@ def load(cls, source_code, val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) - - stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb) - spad_size = val_llvm_caller.get_spad_size(validation_binary_path) - spad_usage = stack_size + spad_size # Spad usage per lane - if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage: - logger.debug( - f"Scratchpad size exceeded: required {spad_usage} bytes, " - f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available." 
- ) - raise SpadOverflowError() - # Skip if TOG file already exists if os.path.isfile(tog_path): return key diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 9398f90c..417d97cd 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -26,20 +26,20 @@ {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} {% if not Bias %} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} %c0 = arith.constant 0 : index {{ kernel.def_local_vars(indent_size=2) }} affine.for %index0 = 0 to {{ B }} { affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} { affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} { - %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1> - %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1> - %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1> + %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1> + %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ 
W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1> + %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1> {% if Bias -%} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }} {%- else -%} - affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} { @@ -74,20 +74,20 @@ {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} {% if not Bias %} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} %c0 = arith.constant 0 : index {{ kernel.def_local_vars(indent_size=2) }} affine.for %index0 = 0 to {{ B }} { affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} { affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} { - %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1> - %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1> - %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M 
}}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1> + %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> + %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> + %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> {% if Bias -%} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }} {%- else -%} - affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} { {{kernel.load_input(indent_size=10)}} @@ -120,21 +120,21 @@ {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} {% if not Bias %} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} %c0 = arith.constant 0 : index {{ kernel.def_local_vars(indent_size=2) }} affine.for %index0=0 to {{ B }} { affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} { affine.for 
%index1 = 0 to {{ M }} step {{ TILE_M }} { - %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1> - %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1> - %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1> + %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> + %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> + %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> {% if Bias -%} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }} // Why not N,M? Currently, dma-fine-grained pass assume M->N order... 
{%- else -%} - affine.vector_store %v0, %Y_buffer[0, 0, 0] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %Y_buffer[0, 0, 0] : memref<1x{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} { {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_K], indent_size=10) }} @@ -237,6 +237,7 @@ def render(self, else: Bias_idx = None + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -245,7 +246,7 @@ def render(self, SUB_TILE_M=SUB_TILE_M, SUB_TILE_N=SUB_TILE_N, SUB_TILE_K=SUB_TILE_K, - DATA_STYPE="f32", + DATA_STYPE=data_stype, X = X, W = W,Y = Y, Bias = Bias, X_idx = X_idx, W_idx = W_idx, @@ -319,6 +320,12 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): X, W = self.input_nodes[0], self.input_nodes[1] Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + dtype_infos = [("X", X.get_dtype()), ("W", W.get_dtype()), ("Y", Y.get_dtype())] + if Bias is not None: + dtype_infos.append(("Bias", Bias.get_dtype())) + if len({dtype for _, dtype in dtype_infos}) != 1: + dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) + raise NotImplementedError(f"Mixed dtype BMM is not implemented yet ({dtype_desc})") W_tensor = empty_strided(W.layout.size, W.layout.stride) X_tensor = empty_strided(X.layout.size, X.layout.stride) diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py index 06d41ea2..7c842272 100644 --- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py +++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py @@ -182,22 +182,18 @@ def add_extention(self, name, extension): def compile_wih_kernel(self, write_path, 
llvm_name, wrapper_name, binary_name, link_option=""): main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c')) main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o')) - kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's')) kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o')) main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}' - kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}' target = os.path.join(write_path, binary_name) link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}' main_compile_cmd = shlex.split(main_compile) - kernel_compile_cmd = shlex.split(kernel_compile) link_cmd = shlex.split(link) try: subprocess.check_call(main_compile_cmd) - subprocess.check_call(kernel_compile_cmd) subprocess.check_call(link_cmd) except subprocess.CalledProcessError as e: print("Command failed with exit code", e.returncode) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 256d7101..3c408681 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -67,7 +67,7 @@ DTYPE_TO_C = { torch.float32: "float", torch.float64: "double", - torch.float16: "half", + torch.float16: "uint16_t", torch.int64: "int64_t", torch.int32: "int32_t", torch.int16: "int16_t", diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index f72a7663..91e200a8 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -52,6 +52,12 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes): X, W = self.input_nodes[0], self.input_nodes[1] Y = self.output_node Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + dtype_infos = [("X", X.get_dtype()), 
("W", W.get_dtype()), ("Y", Y.get_dtype())] + if Bias is not None: + dtype_infos.append(("Bias", Bias.get_dtype())) + if len({dtype for _, dtype in dtype_infos}) != 1: + dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) + raise NotImplementedError(f"Mixed dtype Conv is not implemented yet ({dtype_desc})") if epilogue_nodes is not None: extra_node_rw = { diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index da2bc829..e91014fa 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -47,7 +47,7 @@ {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> %c0 = arith.constant 0 : index {{- kernel.def_local_vars(indent_size=2) }} @@ -59,7 +59,7 @@ {%- if BIAS %} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }} {%- else %} - affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} { affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} { @@ -71,16 +71,16 @@ affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order. 
affine.for %tile_k_w = 0 to 1 { %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w) - %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> affine.for %tile_o_h = 0 to {{ TILE_O_H }} { affine.for %tile_o_w = 0 to {{ TILE_O_W }} { %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h) %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_o_w) %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w) - %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1> - %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> - linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) - outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 
strided<[{{ TILE_K }}, 1], offset: ?>, 1> + %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) } { inner_loop=true } } { inner_loop=true } } { inner_loop=true } @@ -179,6 +179,8 @@ def render(self, if Bias is not None: Bias_tile_desc.offset = Bias.get_layout().offset + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] + kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -220,7 +222,7 @@ def render(self, X_idx = X_idx, W_idx = W_idx, Bias_idx = Bias_idx, - DATA_STYPE="f32", + DATA_STYPE=data_stype, input_reorder=self.input_reorder ) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index cc284522..db2c64db 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -48,7 +48,7 @@ {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> %c0 = arith.constant 0 : index {{- kernel.def_local_vars(indent_size=2) }} affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} { @@ -58,7 +58,7 @@ {%- if BIAS %} {{ 
kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }} {%- else %} - affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} { affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} { @@ -72,16 +72,16 @@ affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order. affine.for %tile_k_w = 0 to {{ TILE_K_W }} { %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w) - %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> affine.for %tile_o_h = 0 to {{ TILE_O_H }} { affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h) %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w) %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w) - %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1> - 
%Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> - linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) - outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1> + %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) } { inner_loop=true } } { inner_loop=true } } { inner_loop=true } @@ -178,6 +178,8 @@ def render(self, if Bias is not None: Bias_tile_desc.offset = Bias.get_layout().offset + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] + kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -219,7 +221,7 @@ def render(self, X_idx = X_idx, W_idx = W_idx, Bias_idx = Bias_idx, - DATA_STYPE="f32", + DATA_STYPE=data_stype, input_reorder=self.input_reorder ) diff --git 
a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 6d768bf2..95db53c3 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -48,7 +48,7 @@ {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> %c0 = arith.constant 0 : index {{- kernel.def_local_vars(indent_size=2) }} @@ -59,7 +59,7 @@ {%- if BIAS %} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }} {%- else %} - affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} { affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} { @@ -72,16 +72,16 @@ affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order. 
affine.for %tile_k_w = 0 to {{ TILE_K_W }} { %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w) - %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> affine.for %tile_o_h = 0 to {{ TILE_O_H }} { affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h) %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w) %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w) - %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1> - %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> - linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) - outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ 
X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1> + %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) } { inner_loop=true } } { inner_loop=true } } { inner_loop=true } @@ -179,6 +179,8 @@ def render(self, if Bias is not None: Bias_tile_desc.offset = Bias.get_layout().offset + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] + kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -220,7 +222,7 @@ def render(self, X_idx = X_idx, W_idx = W_idx, Bias_idx = Bias_idx, - DATA_STYPE="f32", + DATA_STYPE=data_stype, input_reorder=self.input_reorder ) diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index e2cd61fd..3666b3c9 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -48,7 +48,7 @@ {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> %c0 = arith.constant 0 : index {{ kernel.def_local_vars(indent_size=2) }} @@ -60,7 +60,7 @@ {%- 
if BIAS %} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }} {%- else %} - affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} { affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} { @@ -74,17 +74,17 @@ affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order. affine.for %tile_k_w = 0 to {{ TILE_K_W }} { %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w) - %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> affine.for %tile_o_h = 0 to {{ TILE_O_H }} { affine.for %tile_o_w = 0 to {{ TILE_O_W }} { %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h) %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w) %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w) %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w) - %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ 
TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1> - %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1> - linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) - outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1> + %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1> + linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) + outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>) } { inner_loop=true } } { inner_loop=true } } { inner_loop=true } @@ -183,6 +183,8 @@ def render(self, if Bias is not None: Bias_tile_desc.offset = Bias.get_layout().offset + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] + kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -224,7 +226,7 @@ def render(self, X_idx = X_idx, W_idx = W_idx, Bias_idx = Bias_idx, - DATA_STYPE="f32", + DATA_STYPE=data_stype, 
input_reorder=self.input_reorder ) diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 5b116807..eb391dba 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -27,14 +27,14 @@ {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} {% if not Bias %} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %} + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>{% endif %} {{ kernel.def_local_vars(indent_size=2) }} affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} { affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} { {%- if Bias %} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }} {%- else %} - affine.vector_store %v0, %Y_buffer[0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %Y_buffer[0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} { {% if prologue_nodes -%} @@ -77,16 +77,16 @@ {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }} {% if not Bias %} - %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {% endif %} {{ kernel.def_local_vars(indent_size=2) }} affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} { affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} { - %Y_bufferT = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ 
TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1> + %Y_bufferT = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> {%- if Bias %} {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }} {%- else %} - affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32> + affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}> {%- endif %} affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} { {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_K], indent_size=8) }} @@ -187,6 +187,8 @@ def render(self, else: Bias_idx = None + data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()] + kernel.render_options = dict( KERNEL_NAME=self.name, kernel=kernel, @@ -197,7 +199,7 @@ def render(self, SUB_TILE_M=SUB_TILE_M, SUB_TILE_N=SUB_TILE_N, SUB_TILE_K=SUB_TILE_K, - DATA_STYPE="f32", + DATA_STYPE=data_stype, X = X, W = W, Y = Y, Bias = Bias, X_idx = X_idx, @@ -280,6 +282,12 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): # Extract input arguments info X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node + dtype_infos = [("X", X.get_dtype()), ("W", W.get_dtype()), ("Y", Y.get_dtype())] + if len(self.input_nodes) > 2: + dtype_infos.append(("Bias", self.input_nodes[2].get_dtype())) + if len({dtype for _, dtype in dtype_infos}) != 1: + dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) + raise NotImplementedError(f"Mixed dtype GEMM is not implemented yet 
({dtype_desc})") X_tensor = empty_strided(X.layout.size, X.layout.stride) W_tensor = empty_strided(W.layout.size, W.layout.stride) if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2: diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 13f2b4f0..f24835ba 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -68,6 +68,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): torch.uint8: np.uint8, torch.bool: np.uint8, torch.bfloat16: np.float16, + torch.float16: np.float16, } class FunctionalSimulator(): @@ -143,7 +144,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size= base_path= f"--base-path={runtime_path}" os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True) os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True) - run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' + run = f'spike --isa rv64gcv_zfh --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}' if not silent_mode: logger.debug(f"[Spike] cmd> {run}") logger.info("[Spike] Running Spike simulator") From 7af91dedeca74703c35ec9446ec167fcb8e4ec88 Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Thu, 12 Mar 2026 10:09:40 +0900 Subject: [PATCH 124/194] [Frontend] Fix incorrect constant key usage and boolean scientific-notation edge case --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++----- PyTorchSimFrontend/mlir/mlir_ops.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index d6ddb025..43cb65a4 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1423,11 +1423,11 @@ def get_const_cse(self, value, 
dtype="index") -> common.CSEVariable: value = float(value) else: value = int(value) - - if value not in self.consts: - self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}") - self.register_var_info(self.consts[str(value)+dtype], [1, dtype]) - return self.consts[str(value)+dtype] + key = str(value)+dtype + if key not in self.consts: + self.consts[key] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}") + self.register_var_info(self.consts[key], [1, dtype]) + return self.consts[key] def get_tag_cse(self, value=None, shape="memref<1xi32>"): if value is None: diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index ace4f9ea..76a0e273 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -59,6 +59,10 @@ def constant(value, src_type, *args, **kwargs): str_val = str(value) if "inf" == str_val or "-inf" == str_val or "nan" == str_val: value = f"0x{mlir_common.MLIR_INF[str_val][src_type]:x}" + elif isinstance(value, bool): + value = 1 if value else 0 + if src_type[0] == "f": + value = format(float(value), ".20f") # scientific notation check elif "e" in str_val: value = format(float(value), ".20f") From 7bad17ae337873511a8b4e584d73767da56145bb Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 11 Mar 2026 19:51:41 +0900 Subject: [PATCH 125/194] [Fix] Refactor MLIR precision handling to be dtype-driven --- PyTorchSimFrontend/extension_config.py | 4 +- PyTorchSimFrontend/mlir/mlir_bmm_template.py | 10 +++-- PyTorchSimFrontend/mlir/mlir_cat_template.py | 20 +++++++--- PyTorchSimFrontend/mlir/mlir_common.py | 7 +++- PyTorchSimFrontend/mlir/mlir_conv_common.py | 11 +++--- .../mlir/mlir_conv_mt_template.py | 10 ++--- .../mlir/mlir_conv_sb_template.py | 8 ++-- .../mlir/mlir_conv_sbs_template.py | 8 ++-- PyTorchSimFrontend/mlir/mlir_conv_template.py | 8 ++-- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 10 +++-- 
PyTorchSimFrontend/mlir/mlir_template.py | 38 +++++++++---------- README.md | 1 - 12 files changed, 76 insertions(+), 59 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index eff6f573..fe8cc380 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -31,8 +31,6 @@ def __getattr__(name): "spad_size" : config_yaml["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane } - if name == "CONFIG_PRECISION": - return 4 # 32bit if name == "CONFIG_NUM_CORES": return config_yaml["num_cores"] if name == "vpu_vector_length_bits": @@ -132,7 +130,7 @@ def load_plan_from_module(module_path): CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) -CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=1)) def setup_logger(name=None, level=None): diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py index 417d97cd..c5fd902f 100644 --- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py @@ -166,8 +166,9 @@ def render(self, tile_info = None, **kwargs): X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype()) if tile_info is None: - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)[0] + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node, precision_bytes)[0] else: TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info @@ -350,10 +351,11 @@ def get_tile_candidates(self, prologue_nodes: Optional[List[IRNode]] = None, **kwargs): X, W, Y, 
Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) - return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node) + precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype()) + return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node, precision_bytes) - def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): - tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node) + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node, precision_bytes): + tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, precision_bytes=precision_bytes) for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py index 7bee54ac..7abdfee6 100644 --- a/PyTorchSimFrontend/mlir/mlir_cat_template.py +++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py @@ -56,6 +56,11 @@ def render( ): input_nodes = self.input_nodes y = self.output_node + dtype_infos = [("Y", y.get_dtype())] + [(f"X{i}", x.get_dtype()) for i, x in enumerate(input_nodes)] + if len({dtype for _, dtype in dtype_infos}) != 1: + dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) + raise NotImplementedError(f"Mixed dtype Cat is not implemented yet ({dtype_desc})") + precision_bytes = mlir_common.get_dtype_nbytes(y.get_dtype()) num_inputs = len(input_nodes) rank = len(y.get_size()) @@ -68,7 +73,7 @@ def render( excluded_dims = self._compute_excluded_dims(tile_sizes) input_tile_sizes_dim = self._calculate_input_tile_sizes( - kernel, input_sizes, tile_sizes, num_inputs, rank + kernel, 
input_sizes, tile_sizes, num_inputs, rank, precision_bytes ) buffer_name_to_template_name, input_dram_names = self._build_buffer_mapping(input_nodes) input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors( @@ -145,6 +150,11 @@ def get_tile_candidates( self.output_node = template_buffer_node y = self.output_node + dtype_infos = [("Y", y.get_dtype())] + [(f"X{i}", x.get_dtype()) for i, x in enumerate(self.input_nodes)] + if len({dtype for _, dtype in dtype_infos}) != 1: + dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) + raise NotImplementedError(f"Mixed dtype Cat is not implemented yet ({dtype_desc})") + precision_bytes = mlir_common.get_dtype_nbytes(y.get_dtype()) num_inputs = len(self.input_nodes) output_sizes = [sz for d, sz in enumerate(y.get_size()) if d != self.dim] @@ -152,7 +162,7 @@ def get_tile_candidates( return [[1]] max_tile_total = kernel.spad_info["spad_size"] // ( - kernel.vector_lane * kernel.precision * 2 * num_inputs + kernel.vector_lane * precision_bytes * 2 * num_inputs ) dim_tile_candidates = [] @@ -174,7 +184,7 @@ def get_tile_candidates( tile_candidates = [ list(combo) for combo in itertools.product(*dim_tile_candidates) - if math.prod(combo) * (num_inputs + 1) * kernel.precision + if math.prod(combo) * (num_inputs + 1) * precision_bytes <= kernel.spad_info["spad_size"] * kernel.vector_lane ] @@ -199,11 +209,11 @@ def _compute_excluded_dims(self, tile_sizes: list) -> list: tile_sizes[idx] = 1 return excluded - def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank): + def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank, precision_bytes): """Calculate tile sizes along the concat dimension for each input.""" non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1 max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2 - extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * 
kernel.precision)) - num_inputs + extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * precision_bytes)) - num_inputs input_tile_sizes_dim = [] for i in range(num_inputs): diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 3c408681..9f5dc6ab 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -90,6 +90,12 @@ "index": 64 } +def get_dtype_nbytes(dtype): + mlir_dtype = DTYPE_TO_MLIR.get(dtype) + if mlir_dtype is None or mlir_dtype not in MLIR_TO_BIT: + raise NotImplementedError(f"Unsupported dtype for precision calculation: {dtype}") + return MLIR_TO_BIT[mlir_dtype] // 8 + DTYPE_LOWP_FP = [ torch.bfloat16, torch.float16, @@ -579,7 +585,6 @@ def __init__(self): # Default HW setting self.vector_lane = extension_config.vpu_num_lanes self.spad_info = extension_config.CONFIG_SPAD_INFO - self.precision = extension_config.CONFIG_PRECISION self.num_cores = extension_config.CONFIG_NUM_CORES self.vlen = extension_config.vpu_vector_length_bits diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 91e200a8..386e9bd5 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -2,7 +2,7 @@ import math from typing import List, Optional -from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs +from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, get_dtype_nbytes from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel from torch._inductor.ir import IRNode @@ -40,7 +40,7 @@ def render(self, **kwargs): raise NotImplementedError() - def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes): raise NotImplementedError() def extract_info(self, kernel, 
template_buffer_node, epilogue_nodes): @@ -58,6 +58,7 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes): if len({dtype for _, dtype in dtype_infos}) != 1: dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos) raise NotImplementedError(f"Mixed dtype Conv is not implemented yet ({dtype_desc})") + precision_bytes = get_dtype_nbytes(X.get_dtype()) if epilogue_nodes is not None: extra_node_rw = { @@ -75,7 +76,7 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes): PADDING_W=self.padding[1] STRIDE_H=self.stride[0] STRIDE_W=self.stride[1] - return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W + return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W,precision_bytes def get_tile_candidates(self, kernel: MLIRTemplateKernel, @@ -83,8 +84,8 @@ def get_tile_candidates(self, epilogue_nodes: Optional[List[IRNode]] = None, **kwargs): # Extract input arguments info - X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) - return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W) + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes) def outer_func_render(self, kernel_name, input_args): X, W = self.input_nodes[0], self.input_nodes[1] diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py index e91014fa..8b8288a8 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py @@ -131,12 +131,12 @@ def render(self, 
tile_info = None, **kwargs): # Extract input arguments info - X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -170,7 +170,7 @@ def render(self, Y_tile_desc.set_name("output_buffer") Y_dim = [Symbol("tile_m"), Symbol("tile_n"), Symbol("o_h"), Symbol("o_w")] Y_idx = [Y_dim[0]*O_C*O_H*O_W, Y_dim[1]*O_H*O_W, Y_dim[2]*O_W, Y_dim[3]] - + # Extract Bias info Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)] Bias_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) @@ -239,8 +239,8 @@ def render(self, kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], 
kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes): + tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py index db2c64db..92efff66 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py @@ -132,12 +132,12 @@ def render(self, tile_info = None, **kwargs): # Extract input arguments info - X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, 
TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -238,8 +238,8 @@ def render(self, kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes): + tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) # TODO: implement K_W for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py index 95db53c3..dfd418d9 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py @@ -132,12 +132,12 @@ def render(self, tile_info = None, **kwargs): # Extract input arguments info - 
X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N @@ -239,8 +239,8 @@ def render(self, kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes): + tile_candidates = 
kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) # TODO: implement K_W for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py index 3666b3c9..178ba7c6 100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_template.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py @@ -136,12 +136,12 @@ def render(self, tile_info = None, **kwargs): # Extract input arguments info - X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes) + X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes) # Select tile size adn template conv_template = CONV_TEMPLATE if tile_info is None: - TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0] + TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0] else: TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info TOG_latency = BATCH if 
TILE_M > BATCH else TILE_M @@ -243,8 +243,8 @@ def render(self, kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]]) return code - def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): - tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) + def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes): + tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0] TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1] diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index eb391dba..9c61c3d9 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -117,8 +117,9 @@ def render(self, tile_info = None, **kwargs): X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) + precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype()) if tile_info is None: - TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)[0] + TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node, 
precision_bytes)[0] else: TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info @@ -274,7 +275,8 @@ def get_tile_candidates(self, prologue_nodes: Optional[List[IRNode]] = None, **kwargs): X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) - return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node) + precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype()) + return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node, precision_bytes) def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): if template_buffer_node is not None: @@ -307,7 +309,7 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1] return X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read) - def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node): + def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node, precision_bytes): data = {} gemm_shape = f"{M}_{N}_{K}" if "external" in extension_config.codegen_mapping_strategy: @@ -327,7 +329,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no else: # case 2: use heuristic mapping min_tile = (n_extra_node + n_prologue_node) == 0 - tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True) + tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True, precision_bytes=precision_bytes) # Edge case if (M == 0) or (N == 0) or (K == 0): diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 9cc79e0a..81b3d606 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -150,10 +150,10 @@ def 
add_loop_info(self, mat_size, tile_size): for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)): self.loop_info[f"index{idx}"] = [0, loop_size, stride] - def gemmini_gemm_mapping(self, M, N, K): + def gemmini_gemm_mapping(self, M, N, K, precision_bytes=4): spad_size = self.spad_info["spad_size"] * self.vector_lane num_cores = self.num_cores - precision = self.precision + precision = precision_bytes dim_I, dim_J, dim_K = M, N, K dim = self.vector_lane @@ -205,7 +205,7 @@ def gemmini_gemm_mapping(self, M, N, K): return inner_I, inner_J, inner_K - def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False): + def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False, precision_bytes=4): tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane @@ -233,11 +233,11 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p tile_M = i * self.vector_lane if M > self.vector_lane else M_padded for j in tile_N_range: tile_N = j * self.vector_lane if N > self.vector_lane else N_padded - used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision + used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * precision_bytes weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N) input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K) output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision + used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes check_spad_size = (used_spad_size < 
max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates" @@ -259,11 +259,11 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p tile_M = i * self.vector_lane if M > self.vector_lane else M_padded for j in tile_N_range: tile_N = j * self.vector_lane if N > self.vector_lane else N_padded - used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision + used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * precision_bytes weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N) input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K) output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision + used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes n_tile = math.ceil(M / max(tile_M, 128)) * math.ceil(N / max(tile_N, 128)) check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and max(tile_N, 128) // max(tile_M, 128) < 10: @@ -277,7 +277,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p tile_candidates = [v for _, v in tile_candidates] return tile_candidates - def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, precision_bytes=4): tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = 
spad_size_per_lane * self.vector_lane @@ -285,7 +285,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation max_spad_per_lane = spad_size_per_lane // 2 # double buffer max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] + M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0] max_k_h_w = 1 # maximize kernel size max_o_h_w = 1 # maximize output size K = min(K, self.vector_lane) @@ -298,11 +298,11 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation weight_size = k_w * k_h * K * N input_size = i_w * i_h * M * K output_size = o_w * o_h * M * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision + used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N) input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision + used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: tile_candidates.append((used_spad_size, (k_h, k_w, o_h, o_w, M, N, K))) @@ -318,7 +318,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation tile_candidates = [v for _, v in tile_candidates] return tile_candidates - def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, 
precision_bytes=4): tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane @@ -326,7 +326,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] + M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0] max_k_h_w = K_W for o_h in sympy.divisors(O_H): for o_w in sympy.divisors(O_W): @@ -336,11 +336,11 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, weight_size = 1 * k_h * K * N input_size = i_w * i_h * M * K output_size = o_w * o_h * M * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision + used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes weight_size_per_lane = self.get_spad_size_per_lane(1 * k_h * K, N) input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K) output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M * (1 + n_extra_node), N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision + used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: tile_candidates.append((used_spad_size, (k_h, K_W, o_h, o_w, M, N, K))) @@ -354,7 +354,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, tile_candidates = [v for _, v in tile_candidates] return tile_candidates - def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0): + def conv_single_batch_mapping(self, 
M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, precision_bytes=4): tile_candidates = [] spad_size_per_lane = self.spad_info["spad_size"] spad_size = spad_size_per_lane * self.vector_lane @@ -362,7 +362,7 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio max_spad_per_lane = spad_size_per_lane // 2 max_used_spad_size = 0 - M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0] + M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0] max_k_h_w = 1 for o_h in sympy.divisors(O_H): for k_h in sympy.divisors(K_H): @@ -372,11 +372,11 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio weight_size = k_w * k_h * K * N input_size = i_w * i_h * k_w * K output_size = M * o_h * N - used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision + used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N) input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K) output_size_per_lane = self.get_spad_size_per_lane(M * o_h * (1 + n_extra_node), N) - used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision + used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane) if check_spad_size: tile_candidates.append((used_spad_size, (k_h, k_w, o_h, M, M, N, K))) diff --git a/README.md b/README.md index 4a3ef145..f55995c9 100644 --- a/README.md +++ b/README.md @@ -396,7 +396,6 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per 
core "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path - "precision" : 4, // Element's precision in tensor (Byte) "scheduler" : "simple", // Scheduler type (Now, only support simple scheduler) "num_partition" : 2, // Multi-core Partitioning "partition": { // allocate request queue index From fadba78ef71f69992b321c9318a23a1377506121 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 12 Mar 2026 14:42:37 +0900 Subject: [PATCH 126/194] [Fix] malloc size align + fix origin info --- AsmParser/tog_generator.py | 4 ++-- PyTorchSimFrontend/extension_codecache.py | 21 +++++++++++++++++++ PyTorchSimFrontend/mlir/mlir_autotune.py | 2 +- .../mlir/mlir_codegen_backend.py | 3 ++- PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 ++++-- 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py index 5f586d99..a12460e3 100644 --- a/AsmParser/tog_generator.py +++ b/AsmParser/tog_generator.py @@ -37,7 +37,7 @@ class tog_generator: StonneTraceCompute= 6 StonneTraceLoad = 7 StonneTraceStore = 8 - def __init__(self, origins="Unknown") -> None: + def __init__(self, origins={"Unknown"}) -> None: self.module_name = "tile_operation_graph" self.module = None self.raw_graph = {} @@ -226,7 +226,7 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int, offset = w_offset if is_preload else x_offset iter_node.torchsim_overlapping_cycle = max(iter_node.torchsim_cycle - offset, 0) - origin_info = "_".join(map(str, self.origins)) + origin_info = self.origins if isinstance(self.origins, str) else "_".join(map(str, self.origins)) onnx_node_list = [node.to_onnx() for node in node_list] # Exclude root node dump_onnx_graph(name, onnx_node_list, vector_lane, origin_info, stonneGraph=stonneGraph) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 8454dee6..b1c457d3 100644 --- a/PyTorchSimFrontend/extension_codecache.py 
+++ b/PyTorchSimFrontend/extension_codecache.py @@ -72,6 +72,14 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256): {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \ -O2 {filename}.ll -o {filename}.o """, + ).strip(), + re.sub(r"[ \n]+", " ", + f""" + {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \ + -relocation-model=pic -march=riscv64 -O3 --stack-size-section \ + -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \ + -O2 {filename}.ll -o {filename}.s + """, ).strip()] def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256): @@ -168,11 +176,13 @@ def load(cls, source_code, opt_cmd = shlex.split(cmds[0]) translate_cmd = shlex.split(cmds[1]) llc_cmd = shlex.split(cmds[2]) + llc_asm_cmd = shlex.split(cmds[3]) with lock: try: subprocess.check_call(opt_cmd) subprocess.check_call(translate_cmd) subprocess.check_call(llc_cmd) + subprocess.check_call(llc_asm_cmd) except subprocess.CalledProcessError as e: logger.error(f"Command failed with exit code {e.returncode}") logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") @@ -182,6 +192,17 @@ def load(cls, source_code, val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name, new_link_option) + + stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb) + spad_size = val_llvm_caller.get_spad_size(validation_binary_path) + spad_usage = stack_size + spad_size # Spad usage per lane + if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage: + logger.debug( + f"Scratchpad size exceeded: required {spad_usage} bytes, " + f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available." 
+ ) + raise SpadOverflowError() + # Skip if TOG file already exists if os.path.isfile(tog_path): return key diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index 4503584c..caf4d6da 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -85,7 +85,7 @@ def cached_run_fn(*args, **kwargs): self.source_code, vectorlane_size=self.extra_args["vector_lane"], loop_size=None, spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"], - origins="Unknown", silent_mode=True, + origins=self.extra_args["origins"], silent_mode=True, autotune=self.extra_args['autotune']) args = [ diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 43cb65a4..24d6636a 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -285,7 +285,7 @@ def __init__(self, kernel_group, reason=None): self.gem5_header = IndentedBuffer() self.header.writeline("#include ") self.header.writeline("#include ") - self.header.writeline("void* __wrap_malloc(size_t size) { return sbrk(size); }") + self.header.writeline("void* __wrap_malloc(size_t size) { size = (size + 511UL) & ~511UL; return sbrk(size); }") # Align to 512 bytes self.header.writeline("void __wrap_free(void *ptr) { return; }") self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad") @@ -1060,6 +1060,7 @@ def run_bench(self, nodes, kernel_name, src_code): "vlen" : self.vlen, "arg_attributes" : arg_attributes, "autotune" : True, + "origins" : {str(i) for node in nodes for i in node.node.origins}, }, source_code=src_code, ) diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 2f9c9704..22d1011b 100644 --- 
a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -276,7 +276,7 @@ def codegen_node(self, _node): MLIRScheduling.count += 1 src_code, meta_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate) kernel_name = self.define_kernel(src_code, meta_code, kernel_name_candidate, ex_kernel.vector_lane, - ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins}) + ex_kernel.spad_info, origins={str(i) for node in nodes for i in node.node.origins}) ex_kernel.call_kernel(kernel_name) _, args, _, _ = ex_kernel.args.mlir_argdefs() args = ", ".join(args) @@ -332,8 +332,10 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes): src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes) with kernel: + all_nodes = [template_node] + (epilogue_nodes or []) + (prologue_nodes or []) + origins = {str(i) for n in all_nodes for i in n.node.origins} kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info, - kernel.loop_size, origins={str(i) for i in template_node.node.origins}) + kernel.loop_size, origins=origins) self.define_function(kernel) kernel.call_kernel(kernel_name) From 0189ab978fbe3ce02e72bb77f66c2bd10342babe Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 12 Mar 2026 15:06:28 +0900 Subject: [PATCH 127/194] [TOGSim] Fix local/remote memory stat --- TOGSim/src/Simulator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index b5b9c778..d7fe9f1b 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -121,7 +121,7 @@ void Simulator::icnt_cycle() { front->set_core_id(core_id); if (!_icnt->is_full(port_id, front)) { int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions; - if (core_id == node_id) + if (get_partition_id(core_id) == node_id) 
_cores[core_id]->inc_numa_local_access(); else _cores[core_id]->inc_numa_remote_access(); From 5268be2df8352f3470bee4e60739b9467fa07ca8 Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Thu, 12 Mar 2026 19:30:04 +0900 Subject: [PATCH 128/194] [Frontend/template] add SPDA decode GQA template imlementation --- .../mlir/mlir_codegen_backend.py | 7 +- PyTorchSimFrontend/mlir/mlir_lowering.py | 37 +- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 888 +++++++++++++++++- PyTorchSimFrontend/mlir/mlir_template.py | 4 +- tests/test_sdpa.py | 57 +- 5 files changed, 973 insertions(+), 20 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 24d6636a..38125e31 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -470,7 +470,12 @@ def parse_index_list(self, expr_list:list, offset=sympy.Number(0)) -> common.CSE new_expr_list[idx] = arg.subs(arg.args[1], dim_list[idx]) indices.append(str(new_arg)) elif not arg.is_number: - new_arg = sympy.Symbol(str(self.convert_index(arg))) + try: + new_arg = sympy.Symbol(str(self.convert_index(arg))) + #not implemented case + except NotImplementedError: + print(f"Not implemented case: {arg}") + raise NotImplementedError(f"Not implemented case: {arg}") new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx]) indices.append(str(new_arg)) else: diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 9d49f212..ac7eb853 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -16,9 +16,15 @@ from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate -from PyTorchSimFrontend.mlir.mlir_sdpa_template import 
MLIRFlashSDPATemplate, flash_sdpa_args, calculate_scale from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate +from PyTorchSimFrontend.mlir.mlir_sdpa_template import ( + MLIRFlashSDPATemplate, + MLIRDecodeGQASDPAPartialTemplate, + MLIRDecodeGQASDPAReduceTemplate, + flash_sdpa_args, + calculate_scale, +) from PyTorchSimFrontend import extension_config aten = torch.ops.aten @@ -58,6 +64,35 @@ def tuned_flash_sdpa( scale = calculate_scale(query, scale) N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value) + # Decode-only GQA fast path: q is (B,Hq,1,Dh), B==1, Hq!=H, Hq%H==0. + # Always use the 2-kernel decode path: + # 1) block partials over (kv head, sequence block) + # 2) reduce/merge across blocks + # This keeps KV shared across qsub, avoids dh0-outer duplication, and + # stores compact partials instead of full score/prob tensors in DRAM. + if L == 1 and Hq != H and N == 1 and (Hq % H) == 0: + g = Hq // H + vector_lane = extension_config.vpu_num_lanes + tile_e = vector_lane + dh_tiles = E // tile_e + decode_gqa_block_size = 512 + BlkS = decode_gqa_block_size if S >= decode_gqa_block_size else int(S) + # Padding-based tail handling: allow S not divisible by BlkS. 
+ nblk = (S + BlkS - 1) // BlkS + HgDhTiles = H * g * dh_tiles + tile_pack = tile_e * 2 + + partial_layout = ir.FixedLayout( + query.get_device(), + torch.float32, + [HgDhTiles, nblk, tile_pack], + ) + partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS) + partial = partial_tmpl.generate().output_node() + reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS) + out_node = reduce_tmpl.generate().output_node() + return (out_node, None, None, None, None, None, None, None, None) + mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale) # _scaled_dot_product_flash_attention has to return a tuple which has 9 values diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index 05030f27..1cd810e8 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -48,23 +48,28 @@ def flash_sdpa_args( s = V.graph.sizevars.guard_equals(sk, sv) e = V.graph.sizevars.guard_equals(eq, ek) - # While there are no theoretical requirements for e == ev, - # this implementation enforces e == ev for simplicity. - # Distinct notations are still maintained to ensure future compatibility and clarity. + # While there are no theoretical requirements for e == ev, + # this implementation currently enforces e == ev for simplicity. if e != ev: - raise NotImplementedError("Flash SDPA does not support mismatched head dimensions between query and value.") - - # Flash attention does not split tiles along the head dimension (e or ev). - # Therefore, the head dimension size must be less than or equal to the number of vlanes. 
- vector_lane = extension_config.vpu_num_lanes - if e > vector_lane or ev > vector_lane: - raise ValueError(f"The head dimension size must be less than or equal to the number of vlanes (e: {e}, ev: {ev}, vlanes: {vector_lane}).") + raise NotImplementedError( + "Flash SDPA currently requires matching head dimensions between query and value (e == ev)." + ) + + # Support head dimensions larger than vector lanes by tiling e/ev. + # For now, require multiples of vector lanes (covers 64/128 with vlanes=16). + vector_lane = extension_config.vpu_num_lanes + if (e % vector_lane) != 0: + raise NotImplementedError( + f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})." + ) - # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter. - # Instead, the Flash SDPA implementation infers GQA usage by checking if hq != hk. - # The Flash SDPA for GQA will be implemented after implementing its native version. - if hq != h : - raise NotImplementedError("Flash SDPA for GQA is not supported yet.") + # Minimal GQA support (single-batch only for now). + # We map each query head to a KV head by grouping: hq = g * h. 
+ if hq != h: + if n != 1: + raise NotImplementedError("Flash SDPA GQA is currently supported only for n == 1.") + if (hq % h) != 0: + raise NotImplementedError(f"Flash SDPA GQA requires hq % h == 0 (hq: {hq}, h: {h}).") layout = FixedLayout( query.get_device(), @@ -479,3 +484,856 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no tile_candidates[idx] = tile_l,tile_s,tile_e,subtile_l,subtile_s,subtile_e return tile_candidates + + +# --------------------------- +# Decode-only GQA SDPA (Lq == 1) +# --------------------------- + +DECODE_GQA_SDPA_TEMPLATE = r""" +// Decode GQA SDPA kernel (Lq == 1) +// B = {{ B }} +// Hq = {{ Hq }} +// H = {{ H }} +// g = {{ g }} +// S = {{ S }} +// Dh = {{ Dh }} +// BlkS = {{ BlkS }} +// tile_s = {{ tile_s }} +// tile_e = {{ tile_e }} +// dh_tiles = {{ dh_tiles }} +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} { + // IO buffers follow input dtype (fp16/bf16/f32) + {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }} + // Softmax output used for SV matmul (io dtype) + {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }} + // Accumulator in fp32 (stable) + {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} + // Temp output in io dtype for SV matmul result + {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }} + // Softmax running stats in fp32 + {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} + + %c0 = arith.constant 0.0 : {{ acc_stype }} + %c1 = arith.constant 
1.0 : {{ acc_stype }} + %c_scale = arith.constant {{ scale }} : {{ acc_stype }} + %c_neg_inf = arith.constant -1.0e+30 : {{ acc_stype }} + + %v0_e_acc = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ acc_stype }}> + %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}> + %v0_2x = arith.constant dense<0.0> : vector<2x{{ acc_stype }}> + %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ acc_stype }}> + %v0_s_acc = arith.constant dense<0.0> : vector<{{ tile_s }}x{{ acc_stype }}> + + %v_scale = vector.broadcast %c_scale : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> + + {{ kernel.def_local_vars(indent_size=2) }} + + // kv_head parallelism is the natural unit for GQA reuse + affine.for %kv = 0 to {{ H }} { + // Process S in blocks (BlkS). Sequential inside a core. + affine.for %blk = 0 to {{ S }} step {{ BlkS }} { + // Initialize per-qsub accumulators for this (kv, blk) + affine.for %qsub = 0 to {{ g }} { + affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + affine.for %dht = 0 to {{ dh_tiles }} { + affine.vector_store %v0_e_acc, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> + } + } + + affine.for %s0 = %blk to (%blk + {{ BlkS }}) step {{ tile_s }} { + // Accumulate score per qsub so K tiles can be shared across qsub. + affine.for %qsub = 0 to {{ g }} { + affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> + } + + affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { + // Load K slice once for all qsub. 
+ {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }} + %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> + + affine.for %qsub = 0 to {{ g }} { + {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }} + %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> + + // mul = k @ q -> (tile_s x 1) in io dtype, then upcast and accumulate. + linalg.matmul + { idx_map = array } + ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>) + outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) + + %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}> + %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> + %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}x{{ acc_stype }}> + affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> + } { accumulation_loop=true } + } { accumulation_loop=true } + + affine.for %qsub = 0 to {{ g }} { + %score_acc = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> + // scale after full Dh reduction + %scaled_mul_vec = arith.mulf %score_acc, %v_scale : vector<{{ tile_s }}x{{ acc_stype }}> + + // Online softmax update 
(max/sum/out) identical to FLASH_SDPA_TEMPLATE but specialized to Lq==1. + %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + // Reduce max over tile_s + %max_init = vector.broadcast %c_neg_inf : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> + %local_max_vec = arith.maximumf %scaled_mul_vec, %max_init : vector<{{ tile_s }}x{{ acc_stype }}> + %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> + %max_red1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}> + %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}> + %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2x{{ acc_stype }}> + %new_max = arith.maximumf %max_red2, %old_max : vector<2x{{ acc_stype }}> + affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + + // rescale = exp(old_max - new_max) + %max_diff = arith.subf %old_max, %new_max : vector<2x{{ acc_stype }}> + %max_diff_scalar = vector.extract %max_diff[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> + %rescale_e = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}> + %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}x{{ acc_stype }}> + %rescale_2 = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<2x{{ acc_stype }}> + %exp_rescale_2 = math.exp %rescale_2 : vector<2x{{ acc_stype }}> + + // out *= rescale + %old_out = affine.vector_load %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> + %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ acc_stype }}> + affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, 0, 0] : {{ 
out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> + + // sum *= rescale + %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ acc_stype }}> + + // exp(score - new_max) + %new_max_scalar = vector.extract %new_max[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> + %new_max_bcast = vector.broadcast %new_max_scalar : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> + %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}> + %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}> + // For SV matmul: downcast softmax output to io dtype (common in practice) + %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + + // sum += reduce(exp_scores) + %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> + %zero_2x = vector.broadcast %c0 : {{ acc_stype }} to vector<2x{{ acc_stype }}> + %sum_red1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}> + %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}> + %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2x{{ acc_stype }}> + %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2x{{ acc_stype }}> + affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + + } { accumulation_loop=true } + + // 2) SV accumulation: for each output dh tile, load V once and share across qsub. 
+ affine.for %dht = 0 to {{ dh_tiles }} { + %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) + {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }} + %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> + + affine.for %qsub = 0 to {{ g }} { + %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> + linalg.matmul + { idx_map = array } + ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }}) + outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) + + %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}> + %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> + %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}x{{ acc_stype }}> + affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ 
acc_stype }}> + } { accumulation_loop=true } + } { accumulation_loop=true } + } { accumulation_loop=true } + + // finalize per-qsub for this (kv, blk) and store out for all dh tiles + affine.for %qsub = 0 to {{ g }} { + %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> + %one_2x = vector.broadcast %c1 : {{ acc_stype }} to vector<2x{{ acc_stype }}> + %inv_sum_2x = arith.divf %one_2x, %final_sum : vector<2x{{ acc_stype }}> + %inv_sum = vector.extract %inv_sum_2x[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> + %inv_bcast = vector.broadcast %inv_sum : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}> + + affine.for %dht = 0 to {{ dh_tiles }} { + %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) + %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> + %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}> + %final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}> + affine.vector_store %final_out_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + {{ kernel.store_output(indent_size=10) }} + } + } { outer_loop=true } + } { outer_loop=true } + } { outer_loop=true } + + return +} +""" + + +class MLIRDecodeGQASDPATemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.scale = scale + self.BlkS = BlkS + + def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): + # Decode-only: q is (B,Hq,1,Dh) + query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], self.output_node + + # 
Materialize tensors for stride metadata + q_tensor4 = empty_strided(query.layout.size, query.layout.stride) + k_tensor4 = empty_strided(key.layout.size, key.layout.stride) + v_tensor4 = empty_strided(value.layout.size, value.layout.stride) + + B, Hq, Lq, Dh = q_tensor4.shape + Bk, H, S, Dhk = k_tensor4.shape + assert B == 1, "Decode GQA template currently supports B==1" + assert Lq == 1, "Decode GQA template requires Lq==1" + assert Dh == Dhk + g = Hq // H + BlkS = min(int(self.BlkS), int(S)) + + # Use 3D views to match the existing SDPA indexing scheme + # q: (Hq, 1, Dh), k/v: (H, S, Dh), out: (Hq, 1, Dh) + q_tensor = q_tensor4.view(Hq, 1, Dh) + k_tensor = k_tensor4.view(H, S, Dh) + v_tensor = v_tensor4.view(H, S, Dh) + + tile_s = kernel.vector_lane + tile_e = kernel.vector_lane + dh_tiles = int(Dh) // int(tile_e) + + io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()] + acc_stype = "f32" + + # SRAM tiles: q(1x1xtile_e), k/v(1xtile_sxtile_e), mul(tile_sx1) in io dtype. + # out_acc in f32; out_io temp in io dtype. 
+ vlane_stride = 1 + q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + q_tile_desc.set_name("q_buffer") + q_tile_desc.offset = query.get_layout().offset + + k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride) + k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s]) + k_tile_desc.set_name("k_buffer") + k_tile_desc.offset = key.get_layout().offset + + v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride) + v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1]) + v_tile_desc.set_name("v_buffer") + v_tile_desc.offset = value.get_layout().offset + + mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride) + mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1]) + mul_tile_desc.set_name("mul_buffer") + + score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + score_desc.set_name("score_buffer") + + prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + prob_desc.set_name("prob_buffer") + + # Per-qsub accumulators so KV tiles can be shared across qsub + out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) + out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) + out_acc_tile_desc.set_name("out_acc_buffer") + + out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + out_io_tile_desc.set_name("out_io_buffer") + + max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) 
+ max_desc.set_tile_size_stride([g, 2], [2, 1]) + max_desc.set_name("max_buffer") + + sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) + sum_desc.set_tile_size_stride([g, 2], [2, 1]) + sum_desc.set_name("sum_buffer") + + # Indices + kv = sympy.Symbol("kv") + qsub = sympy.Symbol("qsub") + dh0 = sympy.Symbol("dh0") + k0 = sympy.Symbol("k0") + s0 = sympy.Symbol("s0") + q_head = kv * g + qsub + + q_stride = q_tensor.stride() + k_stride = k_tensor.stride() + v_stride = v_tensor.stride() + # out is (B,Hq,1,Dh) but we address it as (Hq,1,Dh) + out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh) + out_stride = out_tensor.stride() + + # QK indices use k0 reduction over Dh + qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]] + kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]] + # V and output use dh0 tile offset + v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]] + out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]] + + kernel.loop_size = [tile_s, tile_e, 1] + + kernel.render_options = dict( + KERNEL_NAME=self.name, + kernel=kernel, + B=B, + Hq=Hq, + H=H, + g=g, + S=S, + Dh=Dh, + dh_tiles=dh_tiles, + BlkS=BlkS, + tile_s=tile_s, + tile_e=tile_e, + io_stype=io_stype, + acc_stype=acc_stype, + scale=self.scale, + query=query, + key=key, + value=value, + out=out, + q_tile_desc=q_tile_desc, + k_tile_desc=k_tile_desc, + v_tile_desc=v_tile_desc, + out_acc_tile_desc=out_acc_tile_desc, + out_io_tile_desc=out_io_tile_desc, + mul_tile_desc=mul_tile_desc, + score_desc=score_desc, + prob_desc=prob_desc, + max_desc=max_desc, + sum_desc=sum_desc, + qk_idx=qk_idx, + kk_idx=kk_idx, + v_idx=v_idx, + out_idx=out_idx, + input_reorder=self.input_reorder, + ) + + kernel.epilogue_info = dict( + output_node=self.output_node.name, + sram_var="out_io_buffer", + dram_var="out", + dram_idx=out_idx, + dram_tile_desc=out_io_tile_desc, + nr_rdim=0, + r_dim_size=0, + 
dim_aliasing={"kv": "kv", "qsub": "qsub", "dh0": "dh0", "s0": "s0"}, + ) + + return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options) + + +# --------------------------- +# Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce) +# --------------------------- + +DECODE_GQA_SDPA_PARTIAL_TEMPLATE = r""" +// Decode GQA SDPA partial kernel (per sequence block) +// Produces partials per (kv,qsub,dh_tile,blk): +// - first half lanes: o_j (tile_e) +// - second half lanes: [m_j, l_j, 0, 0, ...] (tile_e) +// QK/softmax is computed once per (kv,qsub,s0) over full Dh using k0 reduction. +// SV then reuses those probabilities across all dh tiles. +// H = {{ H }}, g = {{ g }}, Dh = {{ Dh }}, dh_tiles = {{ dh_tiles }}, S = {{ S }}, BlkS = {{ BlkS }}, nblk = {{ nblk }} +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[partial], names_str="query, key, value, partial", input_reorder=input_reorder)}} { + {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} + + %c0 = arith.constant 0.0 : f32 + %c_scale = arith.constant {{ scale }} : f32 + %c_neg_inf = arith.constant -1.0e+30 : f32 + + %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32> + %v0_e_io = arith.constant dense<0.0> : 
vector<{{ tile_e }}x{{ io_stype }}> + %v0_s = arith.constant dense<0.0> : vector<{{ tile_s }}xf32> + %v0_2x = arith.constant dense<0.0> : vector<2xf32> + %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> + %v_scale = vector.broadcast %c_scale : f32 to vector<{{ tile_s }}xf32> + + {{ kernel.def_local_vars(indent_size=2) }} + + affine.for %kv = 0 to {{ H }} { + affine.for %blk = 0 to {{ nblk }} step 1 { + // Reset per-block accumulators for all qsub/dh tiles. + affine.for %qsub = 0 to {{ g }} { + affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + affine.for %dht = 0 to {{ dh_tiles }} { + affine.vector_store %v0_e, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + } + } + + affine.for %s0 = ({{ BlkS }} * %blk) to ({{ BlkS }} * (%blk + 1)) step {{ tile_s }} { + // Accumulate score per qsub so K tiles can be shared across qsub. 
+ affine.for %qsub = 0 to {{ g }} { + affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> + } + + affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { + {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }} + %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> + + affine.for %qsub = 0 to {{ g }} { + {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }} + %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> + linalg.matmul + { idx_map = array } + ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>) + outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) + %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32> + %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> + %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}xf32> + affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> + } { accumulation_loop=true } + } { accumulation_loop=true } + + // Softmax once per qsub; persist probabilities in SRAM for all SV dh tiles. 
+ affine.for %qsub = 0 to {{ g }} { + %score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> + %scaled = arith.mulf %score, %v_scale : vector<{{ tile_s }}xf32> + + %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + %max_init = vector.broadcast %c_neg_inf : f32 to vector<{{ tile_s }}xf32> + %local_max_vec = arith.maximumf %scaled, %max_init : vector<{{ tile_s }}xf32> + %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32> + %max_red1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32> + %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2xf32>, vector<2xf32> + %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2xf32> + %new_max = arith.maximumf %max_red2, %old_max : vector<2xf32> + affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + + %max_diff = arith.subf %old_max, %new_max : vector<2xf32> + %max_diff_scalar = vector.extract %max_diff[0] : f32 from vector<2xf32> + %rescale_e = vector.broadcast %max_diff_scalar : f32 to vector<{{ tile_e }}xf32> + %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}xf32> + %rescale_2 = vector.broadcast %max_diff_scalar : f32 to vector<2xf32> + %exp_rescale_2 = math.exp %rescale_2 : vector<2xf32> + + %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2xf32> + + affine.for %dht = 0 to {{ dh_tiles }} { + %old_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}xf32> + affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, %dht, 0] : {{ 
out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + } + + %new_max_scalar = vector.extract %new_max[0] : f32 from vector<2xf32> + %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32> + %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32> + %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32> + %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + + %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32> + %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32> + %sum_red1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32> + %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2xf32>, vector<2xf32> + %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2xf32> + %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2xf32> + affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + } { accumulation_loop=true } + + // For each output dh tile, load V once and share it across qsub. 
+ affine.for %dht = 0 to {{ dh_tiles }} { + %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) + {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }} + %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> + + affine.for %qsub = 0 to {{ g }} { + %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> + linalg.matmul + { idx_map = array } + ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }}) + outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) + + %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32> + %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}xf32> + affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + } { accumulation_loop=true } + } { 
accumulation_loop=true } + } { accumulation_loop=true } + + // Store packed partials for all qsub/dh tiles. + affine.for %qsub = 0 to {{ g }} { + %final_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + %m_scalar = vector.extract %final_max[0] : f32 from vector<2xf32> + %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %l_scalar = vector.extract %final_sum[0] : f32 from vector<2xf32> + %ml_vec = vector.broadcast %c0 : f32 to vector<{{ tile_e }}xf32> + %ml0 = vector.insert %m_scalar, %ml_vec[0] : f32 into vector<{{ tile_e }}xf32> + %ml1 = vector.insert %l_scalar, %ml0[1] : f32 into vector<{{ tile_e }}xf32> + + affine.for %dht = 0 to {{ dh_tiles }} { + %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %packed = vector.concat %out_vec, %ml1 : vector<{{ tile_pack }}xf32> + affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> + {{ kernel.store_output(indent_size=10) }} + } + } { outer_loop=true } + } { outer_loop=true } + } { outer_loop=true } + return +} +""" + + +DECODE_GQA_SDPA_REDUCE_TEMPLATE = r""" +// Decode GQA SDPA reduce kernel: merge partials across blocks +// Input partial shape: (HgDhTiles, nblk, tile_pack) +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} { + {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} + + %c0 = arith.constant 0.0 : f32 + %c1 = arith.constant 1.0 : f32 + %c_neg_inf = arith.constant -1.0e+30 : f32 + %v0_e = arith.constant 
dense<0.0> : vector<{{ tile_e }}xf32> + %v0_2x = arith.constant dense<0.0> : vector<2xf32> + %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> + + {{ kernel.def_local_vars(indent_size=2) }} + + affine.for %gh = 0 to {{ HgDhTiles }} { + // reset merged accumulators + affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + + affine.for %blk = 0 to {{ nblk }} { + {{ kernel.def_dma_op("MVIN", "partial", partial_idx, partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8) }} + %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> + %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32> + %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> + %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> + %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32> + %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32> + + %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + %m_old = vector.extract %old_max[0] : f32 from vector<2xf32> + %m_new = arith.maximumf %m_old, %m_j : f32 + %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32> + affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + + %diff_old = arith.subf %m_old, %m_new : f32 + %diff_j = arith.subf %m_j, %m_new : f32 + %scale_old = math.exp %diff_old : f32 + %scale_j = math.exp %diff_j : f32 + %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32> + %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ 
tile_e }}xf32> + + %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32> + %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32> + %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32> + affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + + %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32> + %l_new = arith.addf (arith.mulf %l_old, %scale_old : f32), (arith.mulf %l_j, %scale_j : f32) : f32 + %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32> + affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + } { accumulation_loop=true } + + // finalize: out = o / l + %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %l = vector.extract %sum2[0] : f32 from vector<2xf32> + %inv = arith.divf %c1, %l : f32 + %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32> + %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32> + %out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}> + affine.vector_store %out_io, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + {{ kernel.store_output(indent_size=4) }} + } { outer_loop=true } + return +} +""" + + +class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.scale = scale + self.BlkS = BlkS + + def 
render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): + query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2] + partial = self.output_node + + q_tensor4 = empty_strided(query.layout.size, query.layout.stride) + k_tensor4 = empty_strided(key.layout.size, key.layout.stride) + v_tensor4 = empty_strided(value.layout.size, value.layout.stride) + B, Hq, Lq, Dh = q_tensor4.shape + _, H, S, _ = k_tensor4.shape + assert B == 1 and Lq == 1 + g = Hq // H + BlkS = min(int(self.BlkS), int(S)) + nblk = (int(S) + int(BlkS) - 1) // int(BlkS) + + io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()] + tile_s = kernel.vector_lane + tile_e = kernel.vector_lane + tile_pack = tile_e * 2 + + # Use 3D views for indices + q_tensor = q_tensor4.view(Hq, 1, Dh) + k_tensor = k_tensor4.view(H, S, Dh) + v_tensor = v_tensor4.view(H, S, Dh) + + # Flatten (kv,qsub,dh_tile) into GH = H*g*(Dh/tile_e) + dh_tiles = int(Dh) // int(tile_e) + HgDhTiles = int(H) * int(g) * int(dh_tiles) + + # tile descs + vlane_stride = 1 + q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + q_tile_desc.set_name("q_buffer") + q_tile_desc.offset = query.get_layout().offset + + k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride) + k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s]) + k_tile_desc.set_name("k_buffer") + k_tile_desc.offset = key.get_layout().offset + + v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride) + v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1]) + v_tile_desc.set_name("v_buffer") + v_tile_desc.offset = value.get_layout().offset + + mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride) + 
mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1]) + mul_tile_desc.set_name("mul_buffer") + + score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + score_desc.set_name("score_buffer") + + prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + prob_desc.set_name("prob_buffer") + + # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles. + out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) + out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) + out_acc_tile_desc.set_name("out_acc_buffer") + + max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) + max_desc.set_tile_size_stride([g, 2], [2, 1]) + max_desc.set_name("max_buffer") + + sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) + sum_desc.set_tile_size_stride([g, 2], [2, 1]) + sum_desc.set_name("sum_buffer") + + out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + out_io_tile_desc.set_name("out_io_buffer") + + partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) + partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) + partial_tile_desc.set_name("partial_buffer") + + # Indices + kv = sympy.Symbol("kv") + qsub = sympy.Symbol("qsub") + dht = sympy.Symbol("dht") + dh0 = sympy.Symbol("dh0") + k0 = sympy.Symbol("k0") + blk = sympy.Symbol("blk") + s0 = sympy.Symbol("s0") + q_head = kv * g + qsub + + q_stride = q_tensor.stride() + k_stride = k_tensor.stride() + v_stride = v_tensor.stride() + + qk_idx = [q_head * q_stride[0], 
sympy.Integer(0), k0 * q_stride[2]] + kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]] + v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]] + + # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous + p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack) + p_stride = p_tensor.stride() + # group head index: ((kv*g + qsub)*dh_tiles + dht) + gh = (kv * g + qsub) * dh_tiles + dht + partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] + + kernel.loop_size = [tile_s, tile_e, tile_pack] + + kernel.render_options = dict( + KERNEL_NAME=self.name, + kernel=kernel, + H=H, + g=g, + Dh=Dh, + S=S, + BlkS=BlkS, + nblk=nblk, + tile_s=tile_s, + tile_e=tile_e, + dh_tiles=dh_tiles, + tile_pack=tile_pack, + io_stype=io_stype, + scale=self.scale, + query=query, + key=key, + value=value, + partial=partial, + q_tile_desc=q_tile_desc, + k_tile_desc=k_tile_desc, + v_tile_desc=v_tile_desc, + mul_tile_desc=mul_tile_desc, + score_desc=score_desc, + prob_desc=prob_desc, + out_io_tile_desc=out_io_tile_desc, + out_acc_tile_desc=out_acc_tile_desc, + max_desc=max_desc, + sum_desc=sum_desc, + partial_tile_desc=partial_tile_desc, + qk_idx=qk_idx, + kk_idx=kk_idx, + v_idx=v_idx, + partial_idx=partial_idx, + input_reorder=self.input_reorder, + ) + + kernel.epilogue_info = dict( + output_node=self.output_node.name, + sram_var="partial_buffer", + dram_var="partial", + dram_idx=partial_idx, + dram_tile_desc=partial_tile_desc, + nr_rdim=0, + r_dim_size=0, + dim_aliasing={"kv": "kv", "qsub": "qsub", "dht": "dht", "dh0": "dh0", "k0": "k0", "blk": "blk", "s0": "s0"}, + ) + return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options) + + +class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate): + def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None): + super().__init__("kernel", input_nodes, layout, input_reorder) + self.BlkS = BlkS + + 
def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): + partial = self.input_nodes[0] + out = self.output_node + + tile_e = kernel.vector_lane + tile_pack = tile_e * 2 + + # Infer sizes from partial layout: (HgDhTiles, nblk, tile_pack) + HgDhTiles, nblk, _ = partial.get_size() + io_stype = mlir_common.DTYPE_TO_MLIR[out.get_dtype()] + + vlane_stride = 1 + partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) + partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) + partial_tile_desc.set_name("partial_buffer") + partial_tile_desc.offset = partial.get_layout().offset + + out_acc_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + out_acc_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + out_acc_tile_desc.set_name("out_acc_buffer") + + max_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride) + max_desc.set_tile_size_stride([1, 2], [2, 1]) + max_desc.set_name("max_buffer") + + sum_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride) + sum_desc.set_tile_size_stride([1, 2], [2, 1]) + sum_desc.set_name("sum_buffer") + + out_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + out_tile_desc.set_name("out_buffer") + + # Indexing: partial is already 3D; out is (Hq,1,Dh) but view as (Hq*Dh/tile_e, 1, tile_e) + p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride) + p_stride = p_tensor.stride() + gh = sympy.Symbol("gh") + blk = sympy.Symbol("blk") + partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] + + # out view + out_tensor4 = empty_strided(out.get_layout().size, out.get_layout().stride) + B, Hq, Lq, Dh = out_tensor4.shape + assert B == 1 and Lq == 
1 + dh_tiles = int(Dh) // int(tile_e) + out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e) + o_stride = out_tensor.stride() + out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)] + + kernel.loop_size = [tile_pack, tile_e, 1] + + kernel.render_options = dict( + KERNEL_NAME=self.name, + kernel=kernel, + HgDhTiles=HgDhTiles, + nblk=nblk, + tile_e=tile_e, + tile_pack=tile_pack, + io_stype=io_stype, + partial=partial, + out=out, + partial_tile_desc=partial_tile_desc, + out_acc_tile_desc=out_acc_tile_desc, + max_desc=max_desc, + sum_desc=sum_desc, + out_tile_desc=out_tile_desc, + partial_idx=partial_idx, + out_idx=out_idx, + input_reorder=self.input_reorder, + ) + + kernel.epilogue_info = dict( + output_node=self.output_node.name, + sram_var="out_buffer", + dram_var="out", + dram_idx=out_idx, + dram_tile_desc=out_tile_desc, + nr_rdim=0, + r_dim_size=0, + dim_aliasing={"gh": "gh", "blk": "blk"}, + ) + return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options) diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b2df1d06..53db988b 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -904,7 +904,7 @@ def hook(): def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile, subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True, - dram_stride:list=None, dram_offset=None): + dram_stride:list=None, dram_offset=None, padding: int = 0): # Todo. 
Remove legacy behavior (i.e., index_list parsing) def generate_dma_code(): """Internal method to generate DMA code directly.""" @@ -948,7 +948,7 @@ def generate_dma_code(): zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) - attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", "padding=0"] + attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", f"padding={int(padding)}"] if subtile_size: attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}") attribute = " {" + ", ".join(attribute_parts) + "}" diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py index 6ffd6f2e..ed7ae8f8 100644 --- a/tests/test_sdpa.py +++ b/tests/test_sdpa.py @@ -58,6 +58,60 @@ def test_scaled_dot_product_attention(device, backends="flash"): print("All tests passed!") +def test_scaled_dot_product_attention_gqa_single_batch(device): + """ + Focused GQA testcases for single-batch (n==1). + Shapes: + q: (B, Hq, Lq, Dh) + k: (B, H, S, Dh) + v: (B, H, S, Dh) + """ + torch.manual_seed(0) + + B = 1 + # Decode-focused: include a larger S to hit BlkS logic + seq_len_list = [128, 256, 1024] + head_dim_list = [64, 128] + # GQA ratios requested: Hq / H in {4, 5, 8, 16}. + # Keep H=1 to directly realize those ratios. 
+ gqa_ratios = [4, 5, 8, 16] + H = 1 + + for seq_len in seq_len_list: + for head_dim in head_dim_list: + for ratio in gqa_ratios: + Hq = ratio * H + + clear_caches() + # Decode shape: Lq == 1 + q = torch.rand(B, Hq, 1, head_dim, dtype=torch.float32) + k = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32) + v = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32) + + # NPU + q_npu = q.to(device=device) + k_npu = k.to(device=device) + v_npu = v.to(device=device) + opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) + out = opt_fn(q_npu, k_npu, v_npu, attn_mask=None, dropout_p=0.0, is_causal=True, enable_gqa=True) + + # CPU reference + cpu_device = torch.device("cpu") + cpu_out = F.scaled_dot_product_attention( + q.to(device=cpu_device), + k.to(device=cpu_device), + v.to(device=cpu_device), + attn_mask=None, + dropout_p=0.0, + is_causal=True, + enable_gqa=True, + ) + + name = f"SDPA-GQA(B: {B}, Hq: {Hq}, H: {H}, S: {seq_len}, head_dim: {head_dim})" + test_result(name, out, cpu_out) + + print("All GQA single-batch tests passed!") + def clear_caches(): import os from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache @@ -69,5 +123,6 @@ def clear_caches(): if __name__ == "__main__": device = torch.device('npu:0') - test_scaled_dot_product_attention(device, backends="flash") + # test_scaled_dot_product_attention(device, backends="flash") + test_scaled_dot_product_attention_gqa_single_batch(device) \ No newline at end of file From 59bd8f8ddc9ff86f35a45a347d7f5c7d5fe8bf7a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 12 Mar 2026 21:29:16 +0900 Subject: [PATCH 129/194] WIP --- PyTorchSimFrontend/mlir/mlir_lowering.py | 1 + PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 660 +++++++++++------- 2 files changed, 398 insertions(+), 263 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index ac7eb853..7b2c07bf 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py 
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -89,6 +89,7 @@ def tuned_flash_sdpa( ) partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS) partial = partial_tmpl.generate().output_node() + partial.realize() reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS) out_node = reduce_tmpl.generate().output_node() return (out_node, None, None, None, None, None, None, None, None) diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index 1cd810e8..077a8cd2 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -16,17 +16,87 @@ from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel +def _make_offset_map_with_sym(strides, sym_dim, sym_stride, offset=0): + """Like _make_offset_map but injects a block symbol ``s`` into dimension ``sym_dim``. + + The effective index for that dimension becomes ``d{sym_dim} + sym_stride * s``. + Use this to keep ``affine.for`` bounds static and encode the block contribution + directly inside the ``affine.apply`` call that computes the DRAM offset. + + Args: + strides: per-dimension DRAM strides. + sym_dim: which dimension carries the block symbol. + sym_stride: multiplier for the symbol (1 for abs-position loops like FLASH + ``%blk``; ``BlkS`` for block-index loops like PARTIAL ``%blk``). + offset: constant layout offset. + + Returns: + MLIR affine_map string with one symbol, e.g. 
+ ``affine_map<(d0, d1, d2)[s] -> (d0 * 8192 + (d1 + 128 * s) * 64 + d2)>`` + """ + n = len(strides) + terms = [] + for j, sv in enumerate(strides): + sv = int(sv) + if sv == 0: + continue + if j == sym_dim: + inner = f"d{j} + s" if sym_stride == 1 else f"d{j} + {sym_stride} * s" + terms.append(f"({inner})" if sv == 1 else f"({inner}) * {sv}") + else: + terms.append(f"d{j}" if sv == 1 else f"d{j} * {sv}") + try: + off = int(offset) + except (TypeError, ValueError): + off = 0 + if off: + terms.append(str(off)) + dim_str = ", ".join(f"d{j}" for j in range(n)) + expr = " + ".join(terms) if terms else "0" + return f"affine_map<({dim_str})[s] -> ({expr})>" + + +def _make_offset_map(strides, offset=0): + """Generate an MLIR affine_map string for a flat DRAM base-address. + + Args: + strides: list of integer per-dimension strides. + A stride of 0 means the dimension does not contribute. + offset: constant layout offset (e.g. from IRNode.get_layout().offset). + + Returns: + MLIR affine_map string, e.g. ``affine_map<(d0, d1) -> (d0 * 128 + d1)>`` + """ + n = len(strides) + terms = [] + for j, s in enumerate(strides): + s = int(s) + if s == 1: + terms.append(f"d{j}") + elif s != 0: + terms.append(f"d{j} * {s}") + try: + off = int(offset) + except (TypeError, ValueError): + off = 0 + if off: + terms.append(str(off)) + dim_str = ", ".join(f"d{j}" for j in range(n)) + expr = " + ".join(terms) if terms else "0" + return f"affine_map<({dim_str}) -> ({expr})>" + + def flash_sdpa_args( - query : TensorBox, - key : TensorBox, + query : TensorBox, + key : TensorBox, value : TensorBox) -> list: """ Arg processing for flash SDPA. - Its logic is based on: + Its logic is based on: mm_args() which is in torch._inductor.kernel.mm_common.py (142 line). """ - # Materialize input buffers for the codegen backend. + # Materialize input buffers for the codegen backend. 
query, key, value = realize_inputs(query, key, value) # query : (n, hq, l, e) @@ -43,7 +113,7 @@ def flash_sdpa_args( n = V.graph.sizevars.guard_equals(nq, nk) n = V.graph.sizevars.guard_equals(nq, nk) - + h = V.graph.sizevars.guard_equals(hk, hv) s = V.graph.sizevars.guard_equals(sk, sv) e = V.graph.sizevars.guard_equals(eq, ek) @@ -62,7 +132,7 @@ def flash_sdpa_args( raise NotImplementedError( f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})." ) - + # Minimal GQA support (single-batch only for now). # We map each query head to a KV head by grouping: hq = g * h. if hq != h: @@ -70,14 +140,14 @@ def flash_sdpa_args( raise NotImplementedError("Flash SDPA GQA is currently supported only for n == 1.") if (hq % h) != 0: raise NotImplementedError(f"Flash SDPA GQA requires hq % h == 0 (hq: {hq}, h: {h}).") - + layout = FixedLayout( query.get_device(), query.get_dtype(), [n, hq, l, ev] ) - return [n, hq, h, l, s, e, ev, layout, query, key, value] + return [n, hq, h, l, s, e, ev, layout, query, key, value] def calculate_scale(query: torch.Tensor, scale: float) -> float: """ @@ -109,7 +179,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }} - + // Output {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }} @@ -117,7 +187,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} - + // Constants %c0 = arith.constant 0.0 : {{ data_stype }} %c1 = arith.constant 1.0 : {{ data_stype }} @@ -133,33 +203,36 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %v_neg_inf_2x = arith.constant dense<-1.0e+30> 
: vector<2x{{ data_stype }}> %v_scale = vector.broadcast %c_scale : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}> - - {{ kernel.def_local_vars(indent_size=2) }} - + + {{ kernel.def_local_vars(indent_size=2) }} + affine.for %index0 = 0 to {{ b }} { affine.for %index3 = 0 to 1 step 1 { affine.for %index1 = 0 to {{ l }} step {{ tile_l }} { - {{ kernel.def_dma_op("MVIN", "query", q_idx, q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8) }} - + %q_dram_offset = affine.apply {{ q_offset_map }}(%index0, %index1, %index3) + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }} + affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}> - affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> + affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> - + %qt_buffer2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ q_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1> %ot_buffer2D = memref.reinterpret_cast %out_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ out_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1> affine.for %index2 = 0 to {{ s }} step {{ tile_s }} { - {{ kernel.def_dma_op("MVIN", "key", k_idx, k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }} - {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, 
subtile_s, subtile_e], indent_size=10) }} + %k_dram_offset = affine.apply {{ k_offset_map }}(%index0, %index2, %index3) + {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }} + %v_dram_offset = affine.apply {{ v_offset_map }}(%index0, %index2, %index3) + {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }} - affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}> + affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}> %k_buffer2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1> %vt_buffer2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1> - + // key @ query.t and scaling. - linalg.matmul + linalg.matmul { idx_map = array } ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }}) @@ -168,7 +241,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %scaled_mul_vec = arith.mulf %raw_mul_vec, %v_scale : vector<{{ tile_s }}x{{ data_stype }}> affine.vector_store %scaled_mul_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> - + // Find new max. 
%old_max = affine.vector_load %max_buffer[0,0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> @@ -182,22 +255,22 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %max_reduced_1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}> %max_shuffled = vector.shuffle %max_reduced_1, %max_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}> %max_reduced_2 = arith.maximumf %max_reduced_1, %max_shuffled : vector<2x{{ data_stype }}> - - %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}> + + %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}> affine.vector_store %new_max, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> - + // Compute rescale factors: exp(old_max - new_max) %max_diff = arith.subf %old_max, %new_max : vector<2x{{ data_stype }}> %max_diff_scalar = vector.extract %max_diff[0] : {{ data_stype }} from vector<2x{{ data_stype }}> - - %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> - %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> + + %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> + %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> %rescale_bcast_2 = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<2x{{ data_stype }}> %exp_rescale_2 = math.exp %rescale_bcast_2 : vector<2x{{ data_stype }}> - + // Rescale previous out and sum accumulators %old_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ data_stype }}> @@ -206,16 +279,16 @@ def calculate_scale(query: torch.Tensor, scale: 
float) -> float: %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ data_stype }}> - + // Shift scores and apply exp: exp(x - new_max) %scaled_scores_reload = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> %new_max_scalar = vector.extract %new_max[0] : {{ data_stype }} from vector<2x{{ data_stype }}> %new_max_bcast = vector.broadcast %new_max_scalar : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}> - + %shifted_scores = arith.subf %scaled_scores_reload, %new_max_bcast : vector<{{ tile_s }}x{{ data_stype }}> %exp_scores = math.exp %shifted_scores : vector<{{ tile_s }}x{{ data_stype }}> affine.vector_store %exp_scores, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}> - + // accumulate current sum %chunk_sum_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_sum=%v0_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) { @@ -223,19 +296,19 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}> affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}> } - + %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}> %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}> %sum_reduced_1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}> %sum_shuffled = vector.shuffle %sum_reduced_1, %sum_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}> %sum_reduced_2 = arith.addf %sum_reduced_1, %sum_shuffled : vector<2x{{ data_stype }}> - + %new_sum = arith.addf %sum_reduced_2, 
%rescaled_sum : vector<2x{{ data_stype }}> affine.vector_store %new_sum, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> - + // value.t @ mul - linalg.matmul + linalg.matmul { idx_map = array } ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }}) outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) @@ -244,20 +317,21 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: // out @ row_sum^(-1) %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> %one_2x = vector.broadcast %c1 : {{ data_stype }} to vector<2x{{ data_stype }}> - + %reciprocal_row_sum_2x = arith.divf %one_2x, %final_row_sum : vector<2x{{ data_stype }}> %reciprocal_scalar = vector.extract %reciprocal_row_sum_2x[0] : {{ data_stype }} from vector<2x{{ data_stype }}> %reciprocal_bcast_e = vector.broadcast %reciprocal_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> - + %accumulated_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> %stable_final_out = arith.mulf %accumulated_out, %reciprocal_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> affine.vector_store %stable_final_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}> - {{ kernel.store_output(indent_size=8) }} - } { accumulation_loop=true } + %out_dram_offset = affine.apply {{ out_offset_map }}(%index0, %index1, %index3) + {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=8, dram_stride=out_dram_stride, dram_offset="out_dram_offset") }} + } { accumulation_loop=true } } { outer_loop=true } } { outer_loop=true } - return + return } """ @@ -273,10 +347,10 @@ def render(self, prologue_nodes: Optional[List[IRNode]] = None, tile_info = None, 
**kwargs): - + # Except for kernel, other arguments are usually None. query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes) - + if tile_info is None: tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = self.select_tile(kernel, l, s, e, n_extra_node, 0, n_prologue_node)[0] else: @@ -299,10 +373,10 @@ def render(self, # Prepare tile descriptors for input and output tensors. # Intermediate buffers (transient data) do not require DRAM settings(dram stride and dram indices) - # as they are not synchronized with external DRAM. + # as they are not synchronized with external DRAM. # DRAM and SRAM tile shapes must match. vlane_stride = 1 - + # (n, l, s, e, ev) loop_dim = [sympy.Symbol("index0"), sympy.Symbol("index1"), sympy.Symbol("index2"), sympy.Symbol("index3")] @@ -317,11 +391,10 @@ def render(self, q_tile_desc.set_tile_size_stride(q_tile_size, q_tile_stride) q_tile_desc.set_name("q_buffer") q_tile_desc.offset = query.get_layout().offset - # DRAM settings + # DRAM settings q_stride = q_tensor.stride() - q_idx = [loop_dim[0]*q_stride[0], loop_dim[1]*q_stride[1], loop_dim[3]*q_stride[2]] # To keep index arguemnt order, we used index_list - # Since we use a weight-stationary approach in the Systolic Array (SA), + # Since we use a weight-stationary approach in the Systolic Array (SA), # the split axis of the first operand differs from a standard linear algebra matmul. # The first operand (key) must be split along the column axis. # This logic aligns with the relationship between the dot product's summation direction and the hardware's accumulation direction in the SA. 
@@ -335,7 +408,6 @@ def render(self, k_tile_desc.offset = key.get_layout().offset # DRAM settings k_stride = k_tensor.stride() - k_idx = [loop_dim[0]*k_stride[0], loop_dim[2]*k_stride[1], loop_dim[3]*k_stride[2]] # Since we compute mul = key @ query.t, we perform out.t = (value.t @ Softmax(mul).t).t, # which simplifies to (value.t @ Softmax(mul)) @@ -349,19 +421,17 @@ def render(self, v_tile_desc.offset = value.get_layout().offset # DRAM settings v_stride = v_tensor.stride() - v_idx = [loop_dim[0]*v_stride[0], loop_dim[2]*v_stride[1], loop_dim[3]*v_stride[2]] # To keep index arguemnt order, we used index_list # Output is also stored in transposed format to match the value.t @ Softmax(mul) operation. # SRAM settings vlane_split_axis = 1 - out_tile_size = [1, tile_l, tile_e] - out_tile_stride=[0, tile_e, 1] + out_tile_size = [1, tile_l, tile_e] + out_tile_stride=[0, tile_e, 1] out_tile_desc = mlir_common.MLIRMultiDimTile(out_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride) out_tile_desc.set_tile_size_stride(out_tile_size, out_tile_stride) out_tile_desc.set_name("out_buffer") # DRAM settings out_stride = out.get_layout().stride[1:] - out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]] # Intermediate buffers @@ -393,28 +463,46 @@ def render(self, # For reduction chunk_size = 16 + # DMA strides and offset affine maps (dram_stride + dram_offset style) + q_dram_stride = [int(q_stride[0]), int(q_stride[1]), int(q_stride[2])] + k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] + v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] + out_dram_stride = [int(out_stride[0]), int(out_stride[1]), int(out_stride[2])] + + q_offset_map = _make_offset_map(q_dram_stride, q_tile_desc.offset) + k_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) + v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) + out_offset_map = _make_offset_map(out_dram_stride, 0) + + # Keep 
out_idx only for epilogue_info (not in render_options) + out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]] + kernel.render_options = dict( KERNEL_NAME = self.name, kernel = kernel, - b = b, - l = l, - s = s, + b = b, + l = l, + s = s, e = e, # Input sizes (dram) - tile_l = tile_l, - tile_s = tile_s, + tile_l = tile_l, + tile_s = tile_s, tile_e = tile_e, # Tile sizes (sram) - subtile_l = subtile_l, - subtile_s = subtile_s, - subtile_e = subtile_e, # Subtile sizes (sram) + subtile_l = subtile_l, + subtile_s = subtile_s, + subtile_e = subtile_e, # Subtile sizes (sram) data_stype="f32", - query = query, + query = query, key = key, - value = value, + value = value, out = out, # Inputs and output (dram) - q_idx = q_idx, - k_idx = k_idx, - v_idx = v_idx, - out_idx = out_idx, # Strides (dram) + q_dram_stride = q_dram_stride, + k_dram_stride = k_dram_stride, + v_dram_stride = v_dram_stride, + out_dram_stride = out_dram_stride, # Per-dim DRAM strides + q_offset_map = q_offset_map, + k_offset_map = k_offset_map, + v_offset_map = v_offset_map, + out_offset_map = out_offset_map, # Affine maps for base address q_tile_desc = q_tile_desc, k_tile_desc = k_tile_desc, v_tile_desc = v_tile_desc, @@ -423,19 +511,8 @@ def render(self, max_desc = max_desc, sum_desc = sum_desc, # Intermediate buffer descriptions (sram) scale = self.scale, - chunk_size = chunk_size, - input_reorder = self.input_reorder # ETC - ) - - kernel.epilogue_info = dict( - output_node = self.output_node.name, - sram_var = "out_buffer", - dram_var = "out", - dram_idx = out_idx, - dram_tile_desc = out_tile_desc, - nr_rdim = nr_rdim, - r_dim_size = 0, - dim_aliasing = epilogue_dim_aliasing + chunk_size = chunk_size, + input_reorder = self.input_reorder # ETC ) code = self._template_from_string(template).render(**kernel.render_options) @@ -445,7 +522,7 @@ def render(self, def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): if template_buffer_node is not 
None: self.output_node = template_buffer_node - + query = self.input_nodes[0] key = self.input_nodes[1] value = self.input_nodes[2] @@ -462,7 +539,7 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes): v_tensor = v_tensor.view([-1, v_tensor.shape[-2], v_tensor.shape[-1]]) out_tensor = out_tensor.view([-1, out_tensor.shape[-2], out_tensor.shape[-1]]) - b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), k_tensor.size(2), v_tensor.size(2) + b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), k_tensor.size(2), v_tensor.size(2) n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0 n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0 @@ -549,7 +626,7 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no } } - affine.for %s0 = %blk to (%blk + {{ BlkS }}) step {{ tile_s }} { + affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { // Accumulate score per qsub so K tiles can be shared across qsub. affine.for %qsub = 0 to {{ g }} { affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> @@ -557,11 +634,14 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { // Load K slice once for all qsub. 
- {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }} + %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] + {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> affine.for %qsub = 0 to {{ g }} { - {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }} + %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) + %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0) + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }} %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> // mul = k @ q -> (tile_s x 1) in io dtype, then upcast and accumulate. 
@@ -571,9 +651,9 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}> + {% if io_stype != acc_stype %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>{% endif %} %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> - %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}x{{ acc_stype }}> + %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != acc_stype else "%raw_mul_io" }} : vector<{{ tile_s }}x{{ acc_stype }}> affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> } { accumulation_loop=true } } { accumulation_loop=true } @@ -618,8 +698,8 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}> %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}> // For SV matmul: downcast softmax output to io dtype (common in practice) - %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + {% if io_stype != acc_stype %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %} + affine.vector_store {{ "%exp_scores_io" if io_stype != acc_stype else "%exp_scores" }}, 
%prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> // sum += reduce(exp_scores) %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> @@ -635,7 +715,8 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no // 2) SV accumulation: for each output dh tile, load V once and share across qsub. affine.for %dht = 0 to {{ dh_tiles }} { %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) - {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }} + %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk] + {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }} %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> affine.for %qsub = 0 to {{ g }} { @@ -649,9 +730,9 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}> + {% if io_stype != acc_stype %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>{% endif %} %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}x{{ acc_stype 
}}> + %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != acc_stype else "%out_io_vec" }} : vector<{{ tile_e }}x{{ acc_stype }}> affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> } { accumulation_loop=true } } { accumulation_loop=true } @@ -669,9 +750,11 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}> - %final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}> - affine.vector_store %final_out_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - {{ kernel.store_output(indent_size=10) }} + {% if io_stype != acc_stype %}%final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %} + affine.vector_store {{ "%final_out_io" if io_stype != acc_stype else "%final_out_acc" }}, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) + %out_offset = affine.apply {{ out_offset_map }}(%q_head, %dh0) + {{ kernel.def_dma_op("MVOUT", "out", [], out_io_tile_desc, indent_size=10, dram_stride=out_dram_stride, dram_offset="out_offset") }} } } { outer_loop=true } } { outer_loop=true } @@ -690,7 +773,12 @@ def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=N def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, 
tile_info=None, **kwargs): # Decode-only: q is (B,Hq,1,Dh) - query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], self.output_node + # Use template_buffer_node (the actual V.graph-registered CUDATemplateBuffer with its + # real name e.g. "buf0") when available, instead of the placeholder self.output_node + # (always named "buf_out"). This ensures output_buffers["buf0"] maps correctly + # in mlir_argdefs, which looks up buffer_types by the actual DRAM buffer name. + query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], \ + template_buffer_node if template_buffer_node is not None else self.output_node # Materialize tensors for stride metadata q_tensor4 = empty_strided(query.layout.size, query.layout.stride) @@ -765,14 +853,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue sum_desc.set_tile_size_stride([g, 2], [2, 1]) sum_desc.set_name("sum_buffer") - # Indices - kv = sympy.Symbol("kv") - qsub = sympy.Symbol("qsub") - dh0 = sympy.Symbol("dh0") - k0 = sympy.Symbol("k0") - s0 = sympy.Symbol("s0") - q_head = kv * g + qsub - + # Strides from 3D tensor views q_stride = q_tensor.stride() k_stride = k_tensor.stride() v_stride = v_tensor.stride() @@ -780,11 +861,34 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh) out_stride = out_tensor.stride() - # QK indices use k0 reduction over Dh - qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]] - kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]] - # V and output use dh0 tile offset - v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]] + # DMA strides (per-dimension DRAM strides for each tile) + k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] + # Q: q_head is pre-computed in template; stride[1]=0 since Lq=1 + q_dram_stride = [int(q_stride[0]), 
0, int(q_stride[2])] + v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] + # out: q_head is pre-computed; stride[1]=0 since Lq=1 + out_dram_stride = [int(out_stride[0]), 0, int(out_stride[2])] + + # Affine maps for flat DRAM base address (used with pre-computed loop var expressions) + # K: offset(kv, s0, k0) + kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) + # Q: offset(q_head, k0) -- q_head = kv*g+qsub pre-computed in template + qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset) + # V: offset(kv, s0, dh0) + v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) + # Out: offset(q_head, dh0) -- q_head pre-computed in template + out_offset_map = _make_offset_map([int(out_stride[0]), int(out_stride[2])], 0) + # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is the absolute + # block start (steps by BlkS), so actual_s = s0_rel + 1*blk → sym_stride=1. + kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=1, offset=k_tile_desc.offset) + v_offset_map_blk = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=1, offset=v_tile_desc.offset) + + # Keep sympy-based out_idx only for epilogue_info (not in render_options) + kv = sympy.Symbol("kv") + qsub = sympy.Symbol("qsub") + dh0 = sympy.Symbol("dh0") + s0 = sympy.Symbol("s0") + q_head = kv * g + qsub out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]] kernel.loop_size = [tile_s, tile_e, 1] @@ -819,24 +923,21 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue prob_desc=prob_desc, max_desc=max_desc, sum_desc=sum_desc, - qk_idx=qk_idx, - kk_idx=kk_idx, - v_idx=v_idx, - out_idx=out_idx, + # DMA strides + k_dram_stride=k_dram_stride, + q_dram_stride=q_dram_stride, + v_dram_stride=v_dram_stride, + out_dram_stride=out_dram_stride, + # Affine offset maps + kk_offset_map=kk_offset_map, + qk_offset_map=qk_offset_map, + 
v_offset_map=v_offset_map, + out_offset_map=out_offset_map, + kk_offset_map_blk=kk_offset_map_blk, + v_offset_map_blk=v_offset_map_blk, input_reorder=self.input_reorder, ) - kernel.epilogue_info = dict( - output_node=self.output_node.name, - sram_var="out_io_buffer", - dram_var="out", - dram_idx=out_idx, - dram_tile_desc=out_io_tile_desc, - nr_rdim=0, - r_dim_size=0, - dim_aliasing={"kv": "kv", "qsub": "qsub", "dh0": "dh0", "s0": "s0"}, - ) - return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options) @@ -891,27 +992,30 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue } } - affine.for %s0 = ({{ BlkS }} * %blk) to ({{ BlkS }} * (%blk + 1)) step {{ tile_s }} { + affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { // Accumulate score per qsub so K tiles can be shared across qsub. affine.for %qsub = 0 to {{ g }} { affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> } affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { - {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }} + %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] + {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> affine.for %qsub = 0 to {{ g }} { - {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }} + %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) + %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0) + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, 
subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }} %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> linalg.matmul { idx_map = array } ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>) outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32> + {% if io_stype != "f32" %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>{% endif %} %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> - %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}xf32> + %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != "f32" else "%raw_mul_io" }} : vector<{{ tile_s }}xf32> affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> } { accumulation_loop=true } } { accumulation_loop=true } @@ -951,8 +1055,8 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32> %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32> %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32> - %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> + {% if io_stype != "f32" %}%exp_scores_io = 
arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %} + affine.vector_store {{ "%exp_scores_io" if io_stype != "f32" else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32> %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32> @@ -966,7 +1070,8 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue // For each output dh tile, load V once and share it across qsub. affine.for %dht = 0 to {{ dh_tiles }} { %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) - {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }} + %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk] + {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }} %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> affine.for %qsub = 0 to {{ g }} { @@ -980,9 +1085,9 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32> + {% if io_stype != "f32" %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32>{% endif %} %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ 
out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}xf32> + %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != "f32" else "%out_io_vec" }} : vector<{{ tile_e }}xf32> affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> } { accumulation_loop=true } } { accumulation_loop=true } @@ -1000,9 +1105,12 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue affine.for %dht = 0 to {{ dh_tiles }} { %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %packed = vector.concat %out_vec, %ml1 : vector<{{ tile_pack }}xf32> + %packed = vector.shuffle %out_vec, %ml1 [{{ range(tile_pack) | join(', ') }}] : vector<{{ tile_e }}xf32>, vector<{{ tile_e }}xf32> affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> - {{ kernel.store_output(indent_size=10) }} + %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) + %gh = affine.apply affine_map<(d0, d1) -> (d0 * {{ dh_tiles }} + d1)>(%q_head, %dht) + %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk) + {{ kernel.def_dma_op("MVOUT", "partial", [], partial_tile_desc, indent_size=10, dram_stride=partial_dram_stride, dram_offset="partial_offset") }} } } { outer_loop=true } } { outer_loop=true } @@ -1012,83 +1120,6 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue """ -DECODE_GQA_SDPA_REDUCE_TEMPLATE = r""" -// Decode GQA SDPA reduce kernel: merge partials across blocks -// Input partial shape: (HgDhTiles, nblk, tile_pack) -{{kernel.def_global_vars()}} - -func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", 
input_reorder=input_reorder)}} { - {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} - - %c0 = arith.constant 0.0 : f32 - %c1 = arith.constant 1.0 : f32 - %c_neg_inf = arith.constant -1.0e+30 : f32 - %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32> - %v0_2x = arith.constant dense<0.0> : vector<2xf32> - %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> - - {{ kernel.def_local_vars(indent_size=2) }} - - affine.for %gh = 0 to {{ HgDhTiles }} { - // reset merged accumulators - affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - - affine.for %blk = 0 to {{ nblk }} { - {{ kernel.def_dma_op("MVIN", "partial", partial_idx, partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8) }} - %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> - %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32> - %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> - %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> - %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32> - %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32> - - %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - %m_old = vector.extract %old_max[0] : f32 from vector<2xf32> - %m_new = arith.maximumf %m_old, %m_j : f32 - %m_new2 = 
vector.broadcast %m_new : f32 to vector<2xf32> - affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - - %diff_old = arith.subf %m_old, %m_new : f32 - %diff_j = arith.subf %m_j, %m_new : f32 - %scale_old = math.exp %diff_old : f32 - %scale_j = math.exp %diff_j : f32 - %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32> - %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32> - - %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32> - %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32> - %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32> - affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - - %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32> - %l_new = arith.addf (arith.mulf %l_old, %scale_old : f32), (arith.mulf %l_j, %scale_j : f32) : f32 - %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32> - affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - } { accumulation_loop=true } - - // finalize: out = o / l - %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %l = vector.extract %sum2[0] : f32 from vector<2xf32> - %inv = arith.divf %c1, %l : f32 - %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32> - %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32> - %out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}> - affine.vector_store 
%out_io, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - {{ kernel.store_output(indent_size=4) }} - } { outer_loop=true } - return -} -""" - - class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None): super().__init__("kernel", input_nodes, layout, input_reorder) @@ -1097,7 +1128,8 @@ def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=N def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2] - partial = self.output_node + # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out". + partial = template_buffer_node if template_buffer_node is not None else self.output_node q_tensor4 = empty_strided(query.layout.size, query.layout.stride) k_tensor4 = empty_strided(key.layout.size, key.layout.stride) @@ -1173,28 +1205,39 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) partial_tile_desc.set_name("partial_buffer") - # Indices - kv = sympy.Symbol("kv") - qsub = sympy.Symbol("qsub") - dht = sympy.Symbol("dht") - dh0 = sympy.Symbol("dh0") - k0 = sympy.Symbol("k0") - blk = sympy.Symbol("blk") - s0 = sympy.Symbol("s0") - q_head = kv * g + qsub - + # Strides from 3D tensor views q_stride = q_tensor.stride() k_stride = k_tensor.stride() v_stride = v_tensor.stride() - qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]] - kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]] - v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]] - # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous p_tensor = empty_strided(partial.get_layout().size, 
partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack) p_stride = p_tensor.stride() - # group head index: ((kv*g + qsub)*dh_tiles + dht) + + # DMA strides + k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] + q_dram_stride = [int(q_stride[0]), 0, int(q_stride[2])] + v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] + partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] + + # Affine offset maps + kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) + qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset) + v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) + # partial: offset(gh, blk) -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template + partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0) + # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is a block index (0..nblk-1), + # so actual_s = s0_rel + BlkS * blk → sym_stride=BlkS. 
+ kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=k_tile_desc.offset) + v_offset_map_blk = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=v_tile_desc.offset) + + # Keep sympy-based indices only for epilogue_info + kv = sympy.Symbol("kv") + qsub = sympy.Symbol("qsub") + dht = sympy.Symbol("dht") + dh0 = sympy.Symbol("dh0") + blk = sympy.Symbol("blk") + q_head = kv * g + qsub gh = (kv * g + qsub) * dh_tiles + dht partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] @@ -1230,26 +1273,110 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue max_desc=max_desc, sum_desc=sum_desc, partial_tile_desc=partial_tile_desc, - qk_idx=qk_idx, - kk_idx=kk_idx, - v_idx=v_idx, - partial_idx=partial_idx, + # DMA strides + k_dram_stride=k_dram_stride, + q_dram_stride=q_dram_stride, + v_dram_stride=v_dram_stride, + partial_dram_stride=partial_dram_stride, + # Affine offset maps + kk_offset_map=kk_offset_map, + qk_offset_map=qk_offset_map, + v_offset_map=v_offset_map, + partial_offset_map=partial_offset_map, + kk_offset_map_blk=kk_offset_map_blk, + v_offset_map_blk=v_offset_map_blk, input_reorder=self.input_reorder, ) - kernel.epilogue_info = dict( - output_node=self.output_node.name, - sram_var="partial_buffer", - dram_var="partial", - dram_idx=partial_idx, - dram_tile_desc=partial_tile_desc, - nr_rdim=0, - r_dim_size=0, - dim_aliasing={"kv": "kv", "qsub": "qsub", "dht": "dht", "dh0": "dh0", "k0": "k0", "blk": "blk", "s0": "s0"}, - ) return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options) +DECODE_GQA_SDPA_REDUCE_TEMPLATE = r""" +// Decode GQA SDPA reduce kernel: merge partials across blocks +// Input partial shape: (HgDhTiles, nblk, tile_pack) +{{kernel.def_global_vars()}} + +func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} 
{ + {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} + {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} + + %c0 = arith.constant 0.0 : f32 + %c1 = arith.constant 1.0 : f32 + %c_neg_inf = arith.constant -1.0e+30 : f32 + %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32> + %v0_2x = arith.constant dense<0.0> : vector<2xf32> + %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> + + {{ kernel.def_local_vars(indent_size=2) }} + + affine.for %gh = 0 to {{ HgDhTiles }} { + // reset merged accumulators + affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + + affine.for %blk = 0 to {{ nblk }} { + %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk) + {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }} + %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> + %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32> + %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> + %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> + %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32> + %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32> + + %old_max = affine.vector_load %max_buffer[0, 0] : {{ 
max_desc.get_mlir_shape("f32") }}, vector<2xf32> + %m_old = vector.extract %old_max[0] : f32 from vector<2xf32> + %m_new = arith.maximumf %m_old, %m_j : f32 + %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32> + affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> + + %diff_old = arith.subf %m_old, %m_new : f32 + %diff_j = arith.subf %m_j, %m_new : f32 + %diff_old_v = vector.broadcast %diff_old : f32 to vector<1xf32> + %diff_j_v = vector.broadcast %diff_j : f32 to vector<1xf32> + %scale_old_v = math.exp %diff_old_v : vector<1xf32> + %scale_j_v = math.exp %diff_j_v : vector<1xf32> + %scale_old = vector.extract %scale_old_v[0] : f32 from vector<1xf32> + %scale_j = vector.extract %scale_j_v[0] : f32 from vector<1xf32> + %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32> + %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32> + + %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32> + %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32> + %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32> + affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + + %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32> + %l_old_rs = arith.mulf %l_old, %scale_old : f32 + %l_j_rs = arith.mulf %l_j, %scale_j : f32 + %l_new = arith.addf %l_old_rs, %l_j_rs : f32 + %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32> + affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + } { accumulation_loop=true } + + // finalize: out = o / l + %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ 
sum_desc.get_mlir_shape("f32") }}, vector<2xf32> + %l = vector.extract %sum2[0] : f32 from vector<2xf32> + %inv = arith.divf %c1, %l : f32 + %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32> + %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> + %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32> + {% if io_stype != "f32" %}%out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %} + affine.vector_store {{ "%out_io" if io_stype != "f32" else "%out_f32" }}, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> + %out_offset = affine.apply {{ out_offset_map }}(%gh) + {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=4, dram_stride=out_dram_stride, dram_offset="out_offset") }} + } { outer_loop=true } + return +} +""" + + class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate): def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None): super().__init__("kernel", input_nodes, layout, input_reorder) @@ -1257,7 +1384,8 @@ def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None): def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): partial = self.input_nodes[0] - out = self.output_node + # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out". 
+ out = template_buffer_node if template_buffer_node is not None else self.output_node tile_e = kernel.vector_lane tile_pack = tile_e * 2 @@ -1288,21 +1416,33 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) out_tile_desc.set_name("out_buffer") - # Indexing: partial is already 3D; out is (Hq,1,Dh) but view as (Hq*Dh/tile_e, 1, tile_e) + # Partial tensor strides p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride) p_stride = p_tensor.stride() - gh = sympy.Symbol("gh") - blk = sympy.Symbol("blk") - partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] - # out view + # Out view: (Hq*dh_tiles, 1, tile_e) out_tensor4 = empty_strided(out.get_layout().size, out.get_layout().stride) B, Hq, Lq, Dh = out_tensor4.shape assert B == 1 and Lq == 1 dh_tiles = int(Dh) // int(tile_e) out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e) o_stride = out_tensor.stride() - out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)] + + # DMA strides + partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] + out_dram_stride = [int(o_stride[0]), 0, 0] + + # Affine offset maps + # partial: offset(gh, blk) + partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], partial_tile_desc.offset) + # out: offset(gh) -- single dimension + out_offset_map = _make_offset_map([int(o_stride[0])], 0) + + # Keep sympy-based indices for epilogue_info + gh = sympy.Symbol("gh") + blk = sympy.Symbol("blk") + partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] + out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)] kernel.loop_size = [tile_pack, tile_e, 1] @@ -1321,19 +1461,13 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue max_desc=max_desc, sum_desc=sum_desc, out_tile_desc=out_tile_desc, - partial_idx=partial_idx, - out_idx=out_idx, + # DMA strides + 
partial_dram_stride=partial_dram_stride, + out_dram_stride=out_dram_stride, + # Affine offset maps + partial_offset_map=partial_offset_map, + out_offset_map=out_offset_map, input_reorder=self.input_reorder, ) - kernel.epilogue_info = dict( - output_node=self.output_node.name, - sram_var="out_buffer", - dram_var="out", - dram_idx=out_idx, - dram_tile_desc=out_tile_desc, - nr_rdim=0, - r_dim_size=0, - dim_aliasing={"gh": "gh", "blk": "blk"}, - ) return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options) From bfc2b22b334599fe8ddd959adb2e17ac1f576474 Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Fri, 13 Mar 2026 19:37:08 +0900 Subject: [PATCH 130/194] [Frontend/template] SPDA implementation debug --- PyTorchSimFrontend/extension_codecache.py | 2 - PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 592 ++---------------- 2 files changed, 48 insertions(+), 546 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index b1c457d3..d3ac7259 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -37,7 +37,6 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256): f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \ -test-loop-padding \ - -dma-fine-grained='systolic-array-size={vectorlane_size}' \ -global-idx='vlen={vlen}' \ -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \ -test-memref-to-gemmini="vectorlane={vectorlane_size}" \ @@ -87,7 +86,6 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \ -test-loop-padding='timing_mode=1' \ - -dma-fine-grained='systolic-array-size={vectorlane_size}' \ -global-idx='vlen={vlen}' \ -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \ -test-tile-operation-graph='vectorlane={vectorlane_size} 
tls_mode={extension_config.CONFIG_TLS_MODE}' \ diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index 077a8cd2..adcc7801 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -563,384 +563,6 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no return tile_candidates -# --------------------------- -# Decode-only GQA SDPA (Lq == 1) -# --------------------------- - -DECODE_GQA_SDPA_TEMPLATE = r""" -// Decode GQA SDPA kernel (Lq == 1) -// B = {{ B }} -// Hq = {{ Hq }} -// H = {{ H }} -// g = {{ g }} -// S = {{ S }} -// Dh = {{ Dh }} -// BlkS = {{ BlkS }} -// tile_s = {{ tile_s }} -// tile_e = {{ tile_e }} -// dh_tiles = {{ dh_tiles }} -{{kernel.def_global_vars()}} - -func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} { - // IO buffers follow input dtype (fp16/bf16/f32) - {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }} - // Softmax output used for SV matmul (io dtype) - {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }} - // Accumulator in fp32 (stable) - {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} - // Temp output in io dtype for SV matmul result - {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }} - // Softmax running stats in fp32 - {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} - - %c0 = arith.constant 0.0 : {{ acc_stype }} - %c1 = arith.constant 1.0 : {{ acc_stype }} - %c_scale = arith.constant 
{{ scale }} : {{ acc_stype }} - %c_neg_inf = arith.constant -1.0e+30 : {{ acc_stype }} - - %v0_e_acc = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ acc_stype }}> - %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}> - %v0_2x = arith.constant dense<0.0> : vector<2x{{ acc_stype }}> - %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ acc_stype }}> - %v0_s_acc = arith.constant dense<0.0> : vector<{{ tile_s }}x{{ acc_stype }}> - - %v_scale = vector.broadcast %c_scale : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> - - {{ kernel.def_local_vars(indent_size=2) }} - - // kv_head parallelism is the natural unit for GQA reuse - affine.for %kv = 0 to {{ H }} { - // Process S in blocks (BlkS). Sequential inside a core. - affine.for %blk = 0 to {{ S }} step {{ BlkS }} { - // Initialize per-qsub accumulators for this (kv, blk) - affine.for %qsub = 0 to {{ g }} { - affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - affine.for %dht = 0 to {{ dh_tiles }} { - affine.vector_store %v0_e_acc, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - } - } - - affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { - // Accumulate score per qsub so K tiles can be shared across qsub. - affine.for %qsub = 0 to {{ g }} { - affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> - } - - affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { - // Load K slice once for all qsub. 
- %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] - {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} - %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> - - affine.for %qsub = 0 to {{ g }} { - %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) - %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0) - {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }} - %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> - - // mul = k @ q -> (tile_s x 1) in io dtype, then upcast and accumulate. 
- linalg.matmul - { idx_map = array } - ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>) - outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - - %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - {% if io_stype != acc_stype %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>{% endif %} - %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> - %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != acc_stype else "%raw_mul_io" }} : vector<{{ tile_s }}x{{ acc_stype }}> - affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> - } { accumulation_loop=true } - } { accumulation_loop=true } - - affine.for %qsub = 0 to {{ g }} { - %score_acc = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}> - // scale after full Dh reduction - %scaled_mul_vec = arith.mulf %score_acc, %v_scale : vector<{{ tile_s }}x{{ acc_stype }}> - - // Online softmax update (max/sum/out) identical to FLASH_SDPA_TEMPLATE but specialized to Lq==1. 
- %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - // Reduce max over tile_s - %max_init = vector.broadcast %c_neg_inf : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> - %local_max_vec = arith.maximumf %scaled_mul_vec, %max_init : vector<{{ tile_s }}x{{ acc_stype }}> - %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> - %max_red1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}> - %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}> - %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2x{{ acc_stype }}> - %new_max = arith.maximumf %max_red2, %old_max : vector<2x{{ acc_stype }}> - affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - - // rescale = exp(old_max - new_max) - %max_diff = arith.subf %old_max, %new_max : vector<2x{{ acc_stype }}> - %max_diff_scalar = vector.extract %max_diff[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> - %rescale_e = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}> - %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}x{{ acc_stype }}> - %rescale_2 = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<2x{{ acc_stype }}> - %exp_rescale_2 = math.exp %rescale_2 : vector<2x{{ acc_stype }}> - - // out *= rescale - %old_out = affine.vector_load %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ acc_stype }}> - affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - 
- // sum *= rescale - %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ acc_stype }}> - - // exp(score - new_max) - %new_max_scalar = vector.extract %new_max[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> - %new_max_bcast = vector.broadcast %new_max_scalar : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}> - %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}> - %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}> - // For SV matmul: downcast softmax output to io dtype (common in practice) - {% if io_stype != acc_stype %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %} - affine.vector_store {{ "%exp_scores_io" if io_stype != acc_stype else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - - // sum += reduce(exp_scores) - %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> - %zero_2x = vector.broadcast %c0 : {{ acc_stype }} to vector<2x{{ acc_stype }}> - %sum_red1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}> - %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}> - %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2x{{ acc_stype }}> - %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2x{{ acc_stype }}> - affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - - } { accumulation_loop=true } - - // 2) SV accumulation: for each output dh tile, load V once and share across qsub. 
- affine.for %dht = 0 to {{ dh_tiles }} { - %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) - %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk] - {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }} - %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> - - affine.for %qsub = 0 to {{ g }} { - %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> - linalg.matmul - { idx_map = array } - ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) - - %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - {% if io_stype != acc_stype %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>{% endif %} - %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != 
acc_stype else "%out_io_vec" }} : vector<{{ tile_e }}x{{ acc_stype }}> - affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - } { accumulation_loop=true } - } { accumulation_loop=true } - } { accumulation_loop=true } - - // finalize per-qsub for this (kv, blk) and store out for all dh tiles - affine.for %qsub = 0 to {{ g }} { - %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}> - %one_2x = vector.broadcast %c1 : {{ acc_stype }} to vector<2x{{ acc_stype }}> - %inv_sum_2x = arith.divf %one_2x, %final_sum : vector<2x{{ acc_stype }}> - %inv_sum = vector.extract %inv_sum_2x[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}> - %inv_bcast = vector.broadcast %inv_sum : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}> - - affine.for %dht = 0 to {{ dh_tiles }} { - %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) - %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}> - %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}> - {% if io_stype != acc_stype %}%final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %} - affine.vector_store {{ "%final_out_io" if io_stype != acc_stype else "%final_out_acc" }}, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) - %out_offset = affine.apply {{ out_offset_map }}(%q_head, %dh0) - {{ kernel.def_dma_op("MVOUT", "out", [], out_io_tile_desc, indent_size=10, dram_stride=out_dram_stride, dram_offset="out_offset") }} - } - } { outer_loop=true } - } { outer_loop=true } - } { outer_loop=true } - - return -} 
-""" - - -class MLIRDecodeGQASDPATemplate(MLIRTemplate): - def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.scale = scale - self.BlkS = BlkS - - def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): - # Decode-only: q is (B,Hq,1,Dh) - # Use template_buffer_node (the actual V.graph-registered CUDATemplateBuffer with its - # real name e.g. "buf0") when available, instead of the placeholder self.output_node - # (always named "buf_out"). This ensures output_buffers["buf0"] maps correctly - # in mlir_argdefs, which looks up buffer_types by the actual DRAM buffer name. - query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], \ - template_buffer_node if template_buffer_node is not None else self.output_node - - # Materialize tensors for stride metadata - q_tensor4 = empty_strided(query.layout.size, query.layout.stride) - k_tensor4 = empty_strided(key.layout.size, key.layout.stride) - v_tensor4 = empty_strided(value.layout.size, value.layout.stride) - - B, Hq, Lq, Dh = q_tensor4.shape - Bk, H, S, Dhk = k_tensor4.shape - assert B == 1, "Decode GQA template currently supports B==1" - assert Lq == 1, "Decode GQA template requires Lq==1" - assert Dh == Dhk - g = Hq // H - BlkS = min(int(self.BlkS), int(S)) - - # Use 3D views to match the existing SDPA indexing scheme - # q: (Hq, 1, Dh), k/v: (H, S, Dh), out: (Hq, 1, Dh) - q_tensor = q_tensor4.view(Hq, 1, Dh) - k_tensor = k_tensor4.view(H, S, Dh) - v_tensor = v_tensor4.view(H, S, Dh) - - tile_s = kernel.vector_lane - tile_e = kernel.vector_lane - dh_tiles = int(Dh) // int(tile_e) - - io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()] - acc_stype = "f32" - - # SRAM tiles: q(1x1xtile_e), k/v(1xtile_sxtile_e), mul(tile_sx1) in io dtype. - # out_acc in f32; out_io temp in io dtype. 
- vlane_stride = 1 - q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - q_tile_desc.set_name("q_buffer") - q_tile_desc.offset = query.get_layout().offset - - k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride) - k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s]) - k_tile_desc.set_name("k_buffer") - k_tile_desc.offset = key.get_layout().offset - - v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride) - v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1]) - v_tile_desc.set_name("v_buffer") - v_tile_desc.offset = value.get_layout().offset - - mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride) - mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1]) - mul_tile_desc.set_name("mul_buffer") - - score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - score_desc.set_name("score_buffer") - - prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - prob_desc.set_name("prob_buffer") - - # Per-qsub accumulators so KV tiles can be shared across qsub - out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) - out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) - out_acc_tile_desc.set_name("out_acc_buffer") - - out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - out_io_tile_desc.set_name("out_io_buffer") - - max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) 
- max_desc.set_tile_size_stride([g, 2], [2, 1]) - max_desc.set_name("max_buffer") - - sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) - sum_desc.set_tile_size_stride([g, 2], [2, 1]) - sum_desc.set_name("sum_buffer") - - # Strides from 3D tensor views - q_stride = q_tensor.stride() - k_stride = k_tensor.stride() - v_stride = v_tensor.stride() - # out is (B,Hq,1,Dh) but we address it as (Hq,1,Dh) - out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh) - out_stride = out_tensor.stride() - - # DMA strides (per-dimension DRAM strides for each tile) - k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] - # Q: q_head is pre-computed in template; stride[1]=0 since Lq=1 - q_dram_stride = [int(q_stride[0]), 0, int(q_stride[2])] - v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] - # out: q_head is pre-computed; stride[1]=0 since Lq=1 - out_dram_stride = [int(out_stride[0]), 0, int(out_stride[2])] - - # Affine maps for flat DRAM base address (used with pre-computed loop var expressions) - # K: offset(kv, s0, k0) - kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) - # Q: offset(q_head, k0) -- q_head = kv*g+qsub pre-computed in template - qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset) - # V: offset(kv, s0, dh0) - v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) - # Out: offset(q_head, dh0) -- q_head pre-computed in template - out_offset_map = _make_offset_map([int(out_stride[0]), int(out_stride[2])], 0) - # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is the absolute - # block start (steps by BlkS), so actual_s = s0_rel + 1*blk → sym_stride=1. 
- kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=1, offset=k_tile_desc.offset) - v_offset_map_blk = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=1, offset=v_tile_desc.offset) - - # Keep sympy-based out_idx only for epilogue_info (not in render_options) - kv = sympy.Symbol("kv") - qsub = sympy.Symbol("qsub") - dh0 = sympy.Symbol("dh0") - s0 = sympy.Symbol("s0") - q_head = kv * g + qsub - out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]] - - kernel.loop_size = [tile_s, tile_e, 1] - - kernel.render_options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - B=B, - Hq=Hq, - H=H, - g=g, - S=S, - Dh=Dh, - dh_tiles=dh_tiles, - BlkS=BlkS, - tile_s=tile_s, - tile_e=tile_e, - io_stype=io_stype, - acc_stype=acc_stype, - scale=self.scale, - query=query, - key=key, - value=value, - out=out, - q_tile_desc=q_tile_desc, - k_tile_desc=k_tile_desc, - v_tile_desc=v_tile_desc, - out_acc_tile_desc=out_acc_tile_desc, - out_io_tile_desc=out_io_tile_desc, - mul_tile_desc=mul_tile_desc, - score_desc=score_desc, - prob_desc=prob_desc, - max_desc=max_desc, - sum_desc=sum_desc, - # DMA strides - k_dram_stride=k_dram_stride, - q_dram_stride=q_dram_stride, - v_dram_stride=v_dram_stride, - out_dram_stride=out_dram_stride, - # Affine offset maps - kk_offset_map=kk_offset_map, - qk_offset_map=qk_offset_map, - v_offset_map=v_offset_map, - out_offset_map=out_offset_map, - kk_offset_map_blk=kk_offset_map_blk, - v_offset_map_blk=v_offset_map_blk, - input_reorder=self.input_reorder, - ) - - return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options) - - # --------------------------- # Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce) # --------------------------- @@ -960,13 +582,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} {{ kernel.def_sram_buffer("value", v_tile_desc, 
indent_size=2) }} {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} + %c0 = arith.constant 0.0 : f32 %c_scale = arith.constant {{ scale }} : f32 @@ -984,135 +600,21 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue affine.for %kv = 0 to {{ H }} { affine.for %blk = 0 to {{ nblk }} step 1 { // Reset per-block accumulators for all qsub/dh tiles. - affine.for %qsub = 0 to {{ g }} { - affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - affine.for %dht = 0 to {{ dh_tiles }} { - affine.vector_store %v0_e, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - } - } - + %qk_offset = affine.apply {{ qk_offset_map }}(%kv) + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[Dh, 1, g_size], indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }} + %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { - // Accumulate score per qsub so K tiles can be shared across qsub. 
- affine.for %qsub = 0 to {{ g }} { - affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> - } - affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} - %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> - - affine.for %qsub = 0 to {{ g }} { - %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) - %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0) - {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }} - %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> - linalg.matmul - { idx_map = array } - ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>) - outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - {% if io_stype != "f32" %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>{% endif %} - %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> - %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != "f32" else "%raw_mul_io" }} : vector<{{ tile_s }}xf32> - affine.vector_store %new_score, 
%score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> - } { accumulation_loop=true } - } { accumulation_loop=true } - - // Softmax once per qsub; persist probabilities in SRAM for all SV dh tiles. - affine.for %qsub = 0 to {{ g }} { - %score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32> - %scaled = arith.mulf %score, %v_scale : vector<{{ tile_s }}xf32> - - %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - %max_init = vector.broadcast %c_neg_inf : f32 to vector<{{ tile_s }}xf32> - %local_max_vec = arith.maximumf %scaled, %max_init : vector<{{ tile_s }}xf32> - %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32> - %max_red1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32> - %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2xf32>, vector<2xf32> - %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2xf32> - %new_max = arith.maximumf %max_red2, %old_max : vector<2xf32> - affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - - %max_diff = arith.subf %old_max, %new_max : vector<2xf32> - %max_diff_scalar = vector.extract %max_diff[0] : f32 from vector<2xf32> - %rescale_e = vector.broadcast %max_diff_scalar : f32 to vector<{{ tile_e }}xf32> - %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}xf32> - %rescale_2 = vector.broadcast %max_diff_scalar : f32 to vector<2xf32> - %exp_rescale_2 = math.exp %rescale_2 : vector<2xf32> - - %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2xf32> - - affine.for %dht = 0 to {{ dh_tiles }} { - %old_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ 
out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}xf32> - affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - } - - %new_max_scalar = vector.extract %new_max[0] : f32 from vector<2xf32> - %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32> - %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32> - %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32> - {% if io_stype != "f32" %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %} - affine.vector_store {{ "%exp_scores_io" if io_stype != "f32" else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - - %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32> - %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32> - %sum_red1 = vector.multi_reduction , %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32> - %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2xf32>, vector<2xf32> - %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2xf32> - %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2xf32> - affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - } { accumulation_loop=true } + %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> + %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ 
io_stype }}, 1> + linalg.matmul + ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>) + outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - // For each output dh tile, load V once and share it across qsub. - affine.for %dht = 0 to {{ dh_tiles }} { - %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht) - %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk] - {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }} - %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1> - - affine.for %qsub = 0 to {{ g }} { - %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}> - affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1> - linalg.matmul - { idx_map = array } - ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>) - - %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - {% if io_stype != "f32" %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype 
}}> to vector<{{ tile_e }}xf32>{% endif %} - %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != "f32" else "%out_io_vec" }} : vector<{{ tile_e }}xf32> - affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - } { accumulation_loop=true } } { accumulation_loop=true } } { accumulation_loop=true } - - // Store packed partials for all qsub/dh tiles. - affine.for %qsub = 0 to {{ g }} { - %final_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - %m_scalar = vector.extract %final_max[0] : f32 from vector<2xf32> - %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %l_scalar = vector.extract %final_sum[0] : f32 from vector<2xf32> - %ml_vec = vector.broadcast %c0 : f32 to vector<{{ tile_e }}xf32> - %ml0 = vector.insert %m_scalar, %ml_vec[0] : f32 into vector<{{ tile_e }}xf32> - %ml1 = vector.insert %l_scalar, %ml0[1] : f32 into vector<{{ tile_e }}xf32> - - affine.for %dht = 0 to {{ dh_tiles }} { - %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %packed = vector.shuffle %out_vec, %ml1 [{{ range(tile_pack) | join(', ') }}] : vector<{{ tile_e }}xf32>, vector<{{ tile_e }}xf32> - affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> - %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub) - %gh = affine.apply affine_map<(d0, d1) -> (d0 * {{ dh_tiles }} + d1)>(%q_head, %dht) - %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk) - {{ kernel.def_dma_op("MVOUT", "partial", [], partial_tile_desc, indent_size=10, 
dram_stride=partial_dram_stride, dram_offset="partial_offset") }} - } - } { outer_loop=true } } { outer_loop=true } } { outer_loop=true } return @@ -1138,6 +640,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue _, H, S, _ = k_tensor4.shape assert B == 1 and Lq == 1 g = Hq // H + g_size = g BlkS = min(int(self.BlkS), int(S)) nblk = (int(S) + int(BlkS) - 1) // int(BlkS) @@ -1157,53 +660,53 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue # tile descs vlane_stride = 1 - q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + q_tile_desc = mlir_common.MLIRMultiDimTile([Dh, 1, g_size], kernel.vector_lane, 2, vlane_stride) + q_tile_desc.set_tile_size_stride([Dh, 1, g_size], [g_size, 1, 1]) q_tile_desc.set_name("q_buffer") q_tile_desc.offset = query.get_layout().offset k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride) - k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s]) + k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, 1, tile_s]) k_tile_desc.set_name("k_buffer") k_tile_desc.offset = key.get_layout().offset v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride) - v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1]) + v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, tile_e, 1]) v_tile_desc.set_name("v_buffer") v_tile_desc.offset = value.get_layout().offset - mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride) - mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1]) + mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, g_size], kernel.vector_lane, 1, vlane_stride) + mul_tile_desc.set_tile_size_stride([tile_s, g_size], [1, tile_s]) mul_tile_desc.set_name("mul_buffer") - score_desc = 
mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - score_desc.set_name("score_buffer") + # score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + # score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + # score_desc.set_name("score_buffer") - prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - prob_desc.set_name("prob_buffer") + # prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) + # prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) + # prob_desc.set_name("prob_buffer") - # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles. - out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) - out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) - out_acc_tile_desc.set_name("out_acc_buffer") + # # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles. 
+ # out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) + # out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) + # out_acc_tile_desc.set_name("out_acc_buffer") - max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) - max_desc.set_tile_size_stride([g, 2], [2, 1]) - max_desc.set_name("max_buffer") + # max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) + # max_desc.set_tile_size_stride([g, 2], [2, 1]) + # max_desc.set_name("max_buffer") - sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) - sum_desc.set_tile_size_stride([g, 2], [2, 1]) - sum_desc.set_name("sum_buffer") + # sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) + # sum_desc.set_tile_size_stride([g, 2], [2, 1]) + # sum_desc.set_name("sum_buffer") - out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - out_io_tile_desc.set_name("out_io_buffer") + # out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) + # out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) + # out_io_tile_desc.set_name("out_io_buffer") - partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) - partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) - partial_tile_desc.set_name("partial_buffer") + # partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) + # partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) + # partial_tile_desc.set_name("partial_buffer") # Strides from 3D tensor views q_stride = q_tensor.stride() @@ -1216,13 +719,13 @@ def render(self, kernel: MLIRTemplateKernel, 
template_buffer_node=None, epilogue # DMA strides k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] - q_dram_stride = [int(q_stride[0]), 0, int(q_stride[2])] + q_dram_stride = [int(q_stride[2]), 0, int(q_stride[1])] v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] # Affine offset maps kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) - qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset) + qk_offset_map = _make_offset_map([int(g) * int(q_stride[2])], q_tile_desc.offset) v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) # partial: offset(gh, blk) -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0) @@ -1254,6 +757,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue nblk=nblk, tile_s=tile_s, tile_e=tile_e, + g_size=g_size, dh_tiles=dh_tiles, tile_pack=tile_pack, io_stype=io_stype, @@ -1266,13 +770,13 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue k_tile_desc=k_tile_desc, v_tile_desc=v_tile_desc, mul_tile_desc=mul_tile_desc, - score_desc=score_desc, - prob_desc=prob_desc, - out_io_tile_desc=out_io_tile_desc, - out_acc_tile_desc=out_acc_tile_desc, - max_desc=max_desc, - sum_desc=sum_desc, - partial_tile_desc=partial_tile_desc, + # score_desc=score_desc, + # prob_desc=prob_desc, + # out_io_tile_desc=out_io_tile_desc, + # out_acc_tile_desc=out_acc_tile_desc, + # max_desc=max_desc, + # sum_desc=sum_desc, + # partial_tile_desc=partial_tile_desc, # DMA strides k_dram_stride=k_dram_stride, q_dram_stride=q_dram_stride, From ce9330670c60bb4debf795c4771b8d80057e92e5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 13 Mar 2026 21:30:07 +0900 Subject: [PATCH 131/194] [Template/SPDA] Remove subtile size temporarily --- 
PyTorchSimFrontend/extension_codecache.py | 2 ++ PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 18 ++++++------------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index d3ac7259..b1c457d3 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -37,6 +37,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256): f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \ -test-loop-padding \ + -dma-fine-grained='systolic-array-size={vectorlane_size}' \ -global-idx='vlen={vlen}' \ -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \ -test-memref-to-gemmini="vectorlane={vectorlane_size}" \ @@ -86,6 +87,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si f""" {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \ -test-loop-padding='timing_mode=1' \ + -dma-fine-grained='systolic-array-size={vectorlane_size}' \ -global-idx='vlen={vlen}' \ -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \ -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \ diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index adcc7801..b1569be6 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -169,9 +169,6 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: // tile_l = {{ tile_l }} // tile_s = {{ tile_s }} // tile_e = {{ tile_e }} -// subtile_l = {{ subtile_l }} -// subtile_s = {{ subtile_s }} -// subtile_e = {{ subtile_e }} {{kernel.def_global_vars()}} func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} { @@ -210,7 +207,7 @@ def calculate_scale(query: 
torch.Tensor, scale: float) -> float: affine.for %index3 = 0 to 1 step 1 { affine.for %index1 = 0 to {{ l }} step {{ tile_l }} { %q_dram_offset = affine.apply {{ q_offset_map }}(%index0, %index1, %index3) - {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }} + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }} affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}> affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> @@ -221,9 +218,9 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: affine.for %index2 = 0 to {{ s }} step {{ tile_s }} { %k_dram_offset = affine.apply {{ k_offset_map }}(%index0, %index2, %index3) - {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }} + {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }} %v_dram_offset = affine.apply {{ v_offset_map }}(%index0, %index2, %index3) - {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }} + {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }} affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}> @@ -487,9 +484,6 @@ def render(self, tile_l = tile_l, tile_s = tile_s, tile_e = tile_e, # Tile sizes (sram) - subtile_l = 
subtile_l, - subtile_s = subtile_s, - subtile_e = subtile_e, # Subtile sizes (sram) data_stype="f32", query = query, key = key, @@ -601,12 +595,12 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no affine.for %blk = 0 to {{ nblk }} step 1 { // Reset per-block accumulators for all qsub/dh tiles. %qk_offset = affine.apply {{ qk_offset_map }}(%kv) - {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[Dh, 1, g_size], indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }} + {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }} %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] - {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} + {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1> linalg.matmul @@ -824,7 +818,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue affine.for %blk = 0 to {{ nblk }} { %partial_offset = affine.apply {{ 
partial_offset_map }}(%gh, %blk) - {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }} + {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }} %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32> %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> From f2717e1cd117b5229f769ebf3a7040185c984891 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 13 Mar 2026 22:45:00 +0900 Subject: [PATCH 132/194] [Template/SPDA] minor fix --- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index b1569be6..be6e7124 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -713,7 +713,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue # DMA strides k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] - q_dram_stride = [int(q_stride[2]), 0, int(q_stride[1])] + q_dram_stride = [int(q_stride[2]), 0, int(q_stride[0])] v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] From be23638400926454d8be17742eff4b6fc358b750 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 16 Mar 2026 20:43:21 +0900 Subject: [PATCH 133/194] [Cleanup] Unflag debug option --- PyTorchSimFrontend/extension_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 
fe8cc380..1b7ccf8d 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -130,7 +130,7 @@ def load_plan_from_module(module_path): CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0)) -CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=1)) +CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0)) def setup_logger(name=None, level=None): From e925ae45cad8cebca98e42de5c1cfb8c01cd35bf Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 16 Mar 2026 22:02:07 +0900 Subject: [PATCH 134/194] [CI] Add deepseek test case --- .github/workflows/pytorchsim_test.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index eaaa7e50..36a62b68 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -726,6 +726,27 @@ jobs: -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py + test_deepseek: + name: Run test_deepseek + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_deepseek_v3_base.py + run: | + echo "Running test_deepseek_v3_base.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py + test_accuracy: name: Run test_accuracy runs-on: self-hosted From db859911ed73b21db65031f84dc47dc4555dcc3f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 17 Mar 2026 16:24:45 +0900 Subject: [PATCH 135/194] [Template/SPDA] Cleanup test case + Add an 
activate option --- PyTorchSimDevice/csrc/aten/native/Extra.cpp | 34 +- .../torch_openreg/openreg/__init__.py | 5 + PyTorchSimFrontend/mlir/mlir_lowering.py | 60 +-- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 423 +----------------- tests/test_sdpa.py | 241 +++++----- 5 files changed, 181 insertions(+), 582 deletions(-) diff --git a/PyTorchSimDevice/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp index aaf28e1a..eb76f5d7 100644 --- a/PyTorchSimDevice/csrc/aten/native/Extra.cpp +++ b/PyTorchSimDevice/csrc/aten/native/Extra.cpp @@ -20,8 +20,38 @@ int64_t _fused_sdp_choice( std::optional scale, bool enable_gqa) { - auto backend = sdp::SDPBackend::overrideable; - return static_cast(backend); + sdp::sdp_params params{query, key, value, attn_mask, dropout_p, is_causal, enable_gqa}; + + // Reject inputs that are fundamentally unsupported (e.g. wrong rank) + if (!sdp::check_tensor_shapes(params, /*debug=*/false)) { + return static_cast(sdp::SDPBackend::error); + } + + // q: (B, Hq, L, E) k/v: (B, H, S, E) + const int64_t Hq = query.size(-3); + const int64_t H = key.size(-3); + const int64_t L = query.size(-2); // query sequence length + const int64_t S = key.size(-2); // key/value sequence length + + // Conditions required by the MLIR FlashSDPA kernel: + // Prefill only : L == S (decode has L == 1, not supported) + // Non-GQA : Hq == H (equal query and KV heads) + // No dropout : template has no dropout implementation + // Dense tensors : no nested tensor support + const bool can_use_mlir_flash = + (L == S) && + (Hq == H) && !enable_gqa && + sdp::check_for_dropout(params, /*debug=*/false) && + sdp::check_nested_tensor(params, /*debug=*/false); + + const bool ctx_flash = at::globalContext().userEnabledFlashSDP(); + const bool ctx_math = at::globalContext().userEnabledMathSDP(); + + if (ctx_flash && can_use_mlir_flash) { + return static_cast(sdp::SDPBackend::overrideable); + } + + return static_cast(sdp::SDPBackend::math); } void 
quantize_tensor_per_tensor_affine_stub( diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py index f674ec06..592011aa 100644 --- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py @@ -73,6 +73,11 @@ def _lazy_init(): register_interface_for_device(custom_device(), ExtensionDeviceInterface) _initialized = True + # Set default SDPA backend to math-only for this device. + torch._C._set_sdp_use_flash(False) + torch._C._set_sdp_use_overrideable(False) + torch._C._set_sdp_use_math(True) + # Create default streams for all devices num_devices = device_count() for device_idx in range(num_devices): diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index 7b2c07bf..b717089f 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -20,8 +20,6 @@ from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate from PyTorchSimFrontend.mlir.mlir_sdpa_template import ( MLIRFlashSDPATemplate, - MLIRDecodeGQASDPAPartialTemplate, - MLIRDecodeGQASDPAReduceTemplate, flash_sdpa_args, calculate_scale, ) @@ -51,56 +49,27 @@ def tuned_bmm(mat1, mat2, *, layout=None): def tuned_flash_sdpa( - query : TensorBox, - key : TensorBox, - value : TensorBox, + query : TensorBox, + key : TensorBox, + value : TensorBox, attn_bias : Optional[TensorBox] = None, - dropout_p : float = 0.0, - is_causal : bool = False, + dropout_p : float = 0.0, + is_causal : bool = False, return_debug_mask : bool = False, - scale : Optional[float] = None) -> tuple: - - + scale : Optional[float] = None, + enable_gqa : bool = False) -> tuple: + # _fused_sdp_choice in C++ already guarantees: + # L == S (prefill), Hq == H (non-GQA), dropout_p == 0.0 + # before routing here via SDPBackend::overrideable. 
+ # Non-matching shapes fall back to SDPBackend::math in C++ and decompose + # into primitive ops (matmul/softmax) before reaching this lowering. scale = calculate_scale(query, scale) N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value) - - # Decode-only GQA fast path: q is (B,Hq,1,Dh), B==1, Hq!=H, Hq%H==0. - # Always use the 2-kernel decode path: - # 1) block partials over (kv head, sequence block) - # 2) reduce/merge across blocks - # This keeps KV shared across qsub, avoids dh0-outer duplication, and - # stores compact partials instead of full score/prob tensors in DRAM. - if L == 1 and Hq != H and N == 1 and (Hq % H) == 0: - g = Hq // H - vector_lane = extension_config.vpu_num_lanes - tile_e = vector_lane - dh_tiles = E // tile_e - decode_gqa_block_size = 512 - BlkS = decode_gqa_block_size if S >= decode_gqa_block_size else int(S) - # Padding-based tail handling: allow S not divisible by BlkS. - nblk = (S + BlkS - 1) // BlkS - HgDhTiles = H * g * dh_tiles - tile_pack = tile_e * 2 - - partial_layout = ir.FixedLayout( - query.get_device(), - torch.float32, - [HgDhTiles, nblk, tile_pack], - ) - partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS) - partial = partial_tmpl.generate().output_node() - partial.realize() - reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS) - out_node = reduce_tmpl.generate().output_node() - return (out_node, None, None, None, None, None, None, None, None) - mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale) - - # _scaled_dot_product_flash_attention has to return a tuple which has 9 values - # since its backward(_scaled_dot_product_flash_attention_backward) needs that values. 
- # (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask) return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None) + + def conv_layout( x: TensorBox, weight: TensorBox, @@ -345,5 +314,4 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool): if extension_config.CONFIG_USE_TIMING_POOLING: lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template - lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()}) diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index be6e7124..37db4956 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -125,14 +125,6 @@ def flash_sdpa_args( "Flash SDPA currently requires matching head dimensions between query and value (e == ev)." ) - # Support head dimensions larger than vector lanes by tiling e/ev. - # For now, require multiples of vector lanes (covers 64/128 with vlanes=16). - vector_lane = extension_config.vpu_num_lanes - if (e % vector_lane) != 0: - raise NotImplementedError( - f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})." - ) - # Minimal GQA support (single-batch only for now). # We map each query head to a KV head by grouping: hq = g * h. 
if hq != h: @@ -309,7 +301,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: { idx_map = array } ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }}) outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) - } + } {inner_loop=true} // out @ row_sum^(-1) %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> @@ -556,416 +548,3 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no return tile_candidates - -# --------------------------- -# Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce) -# --------------------------- - -DECODE_GQA_SDPA_PARTIAL_TEMPLATE = r""" -// Decode GQA SDPA partial kernel (per sequence block) -// Produces partials per (kv,qsub,dh_tile,blk): -// - first half lanes: o_j (tile_e) -// - second half lanes: [m_j, l_j, 0, 0, ...] (tile_e) -// QK/softmax is computed once per (kv,qsub,s0) over full Dh using k0 reduction. -// SV then reuses those probabilities across all dh tiles. 
-// H = {{ H }}, g = {{ g }}, Dh = {{ Dh }}, dh_tiles = {{ dh_tiles }}, S = {{ S }}, BlkS = {{ BlkS }}, nblk = {{ nblk }} -{{kernel.def_global_vars()}} - -func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[partial], names_str="query, key, value, partial", input_reorder=input_reorder)}} { - {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }} - - - %c0 = arith.constant 0.0 : f32 - %c_scale = arith.constant {{ scale }} : f32 - %c_neg_inf = arith.constant -1.0e+30 : f32 - - %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32> - %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}> - %v0_s = arith.constant dense<0.0> : vector<{{ tile_s }}xf32> - %v0_2x = arith.constant dense<0.0> : vector<2xf32> - %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> - %v_scale = vector.broadcast %c_scale : f32 to vector<{{ tile_s }}xf32> - - {{ kernel.def_local_vars(indent_size=2) }} - - affine.for %kv = 0 to {{ H }} { - affine.for %blk = 0 to {{ nblk }} step 1 { - // Reset per-block accumulators for all qsub/dh tiles. 
- %qk_offset = affine.apply {{ qk_offset_map }}(%kv) - {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }} - %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> - affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} { - affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} { - %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk] - {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }} - %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1> - %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1> - linalg.matmul - ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>) - outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }}) - - } { accumulation_loop=true } - } { accumulation_loop=true } - } { outer_loop=true } - } { outer_loop=true } - return -} -""" - - -class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate): - def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.scale = scale - self.BlkS = BlkS - - def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): - query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2] - # Use 
the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out". - partial = template_buffer_node if template_buffer_node is not None else self.output_node - - q_tensor4 = empty_strided(query.layout.size, query.layout.stride) - k_tensor4 = empty_strided(key.layout.size, key.layout.stride) - v_tensor4 = empty_strided(value.layout.size, value.layout.stride) - B, Hq, Lq, Dh = q_tensor4.shape - _, H, S, _ = k_tensor4.shape - assert B == 1 and Lq == 1 - g = Hq // H - g_size = g - BlkS = min(int(self.BlkS), int(S)) - nblk = (int(S) + int(BlkS) - 1) // int(BlkS) - - io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()] - tile_s = kernel.vector_lane - tile_e = kernel.vector_lane - tile_pack = tile_e * 2 - - # Use 3D views for indices - q_tensor = q_tensor4.view(Hq, 1, Dh) - k_tensor = k_tensor4.view(H, S, Dh) - v_tensor = v_tensor4.view(H, S, Dh) - - # Flatten (kv,qsub,dh_tile) into GH = H*g*(Dh/tile_e) - dh_tiles = int(Dh) // int(tile_e) - HgDhTiles = int(H) * int(g) * int(dh_tiles) - - # tile descs - vlane_stride = 1 - q_tile_desc = mlir_common.MLIRMultiDimTile([Dh, 1, g_size], kernel.vector_lane, 2, vlane_stride) - q_tile_desc.set_tile_size_stride([Dh, 1, g_size], [g_size, 1, 1]) - q_tile_desc.set_name("q_buffer") - q_tile_desc.offset = query.get_layout().offset - - k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride) - k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, 1, tile_s]) - k_tile_desc.set_name("k_buffer") - k_tile_desc.offset = key.get_layout().offset - - v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride) - v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, tile_e, 1]) - v_tile_desc.set_name("v_buffer") - v_tile_desc.offset = value.get_layout().offset - - mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, g_size], kernel.vector_lane, 1, vlane_stride) - mul_tile_desc.set_tile_size_stride([tile_s, g_size], [1, tile_s]) - 
mul_tile_desc.set_name("mul_buffer") - - # score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - # score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - # score_desc.set_name("score_buffer") - - # prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride) - # prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1]) - # prob_desc.set_name("prob_buffer") - - # # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles. - # out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride) - # out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1]) - # out_acc_tile_desc.set_name("out_acc_buffer") - - # max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) - # max_desc.set_tile_size_stride([g, 2], [2, 1]) - # max_desc.set_name("max_buffer") - - # sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride) - # sum_desc.set_tile_size_stride([g, 2], [2, 1]) - # sum_desc.set_name("sum_buffer") - - # out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - # out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - # out_io_tile_desc.set_name("out_io_buffer") - - # partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) - # partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) - # partial_tile_desc.set_name("partial_buffer") - - # Strides from 3D tensor views - q_stride = q_tensor.stride() - k_stride = k_tensor.stride() - v_stride = v_tensor.stride() - - # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous - p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack) - p_stride = p_tensor.stride() - - # DMA strides - k_dram_stride = 
[int(k_stride[0]), int(k_stride[1]), int(k_stride[2])] - q_dram_stride = [int(q_stride[2]), 0, int(q_stride[0])] - v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])] - partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] - - # Affine offset maps - kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset) - qk_offset_map = _make_offset_map([int(g) * int(q_stride[2])], q_tile_desc.offset) - v_offset_map = _make_offset_map(v_dram_stride, v_tile_desc.offset) - # partial: offset(gh, blk) -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template - partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0) - # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is a block index (0..nblk-1), - # so actual_s = s0_rel + BlkS * blk → sym_stride=BlkS. - kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=k_tile_desc.offset) - v_offset_map_blk = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=v_tile_desc.offset) - - # Keep sympy-based indices only for epilogue_info - kv = sympy.Symbol("kv") - qsub = sympy.Symbol("qsub") - dht = sympy.Symbol("dht") - dh0 = sympy.Symbol("dh0") - blk = sympy.Symbol("blk") - q_head = kv * g + qsub - gh = (kv * g + qsub) * dh_tiles + dht - partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] - - kernel.loop_size = [tile_s, tile_e, tile_pack] - - kernel.render_options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - H=H, - g=g, - Dh=Dh, - S=S, - BlkS=BlkS, - nblk=nblk, - tile_s=tile_s, - tile_e=tile_e, - g_size=g_size, - dh_tiles=dh_tiles, - tile_pack=tile_pack, - io_stype=io_stype, - scale=self.scale, - query=query, - key=key, - value=value, - partial=partial, - q_tile_desc=q_tile_desc, - k_tile_desc=k_tile_desc, - v_tile_desc=v_tile_desc, - mul_tile_desc=mul_tile_desc, - # score_desc=score_desc, - # prob_desc=prob_desc, - # out_io_tile_desc=out_io_tile_desc, - # 
out_acc_tile_desc=out_acc_tile_desc, - # max_desc=max_desc, - # sum_desc=sum_desc, - # partial_tile_desc=partial_tile_desc, - # DMA strides - k_dram_stride=k_dram_stride, - q_dram_stride=q_dram_stride, - v_dram_stride=v_dram_stride, - partial_dram_stride=partial_dram_stride, - # Affine offset maps - kk_offset_map=kk_offset_map, - qk_offset_map=qk_offset_map, - v_offset_map=v_offset_map, - partial_offset_map=partial_offset_map, - kk_offset_map_blk=kk_offset_map_blk, - v_offset_map_blk=v_offset_map_blk, - input_reorder=self.input_reorder, - ) - - return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options) - - -DECODE_GQA_SDPA_REDUCE_TEMPLATE = r""" -// Decode GQA SDPA reduce kernel: merge partials across blocks -// Input partial shape: (HgDhTiles, nblk, tile_pack) -{{kernel.def_global_vars()}} - -func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} { - {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }} - {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }} - - %c0 = arith.constant 0.0 : f32 - %c1 = arith.constant 1.0 : f32 - %c_neg_inf = arith.constant -1.0e+30 : f32 - %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32> - %v0_2x = arith.constant dense<0.0> : vector<2xf32> - %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32> - - {{ kernel.def_local_vars(indent_size=2) }} - - affine.for %gh = 0 to {{ HgDhTiles }} { - // reset merged accumulators - affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - 
affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - - affine.for %blk = 0 to {{ nblk }} { - %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk) - {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }} - %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32> - %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32> - %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> - %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32> - %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32> - %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32> - - %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - %m_old = vector.extract %old_max[0] : f32 from vector<2xf32> - %m_new = arith.maximumf %m_old, %m_j : f32 - %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32> - affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32> - - %diff_old = arith.subf %m_old, %m_new : f32 - %diff_j = arith.subf %m_j, %m_new : f32 - %diff_old_v = vector.broadcast %diff_old : f32 to vector<1xf32> - %diff_j_v = vector.broadcast %diff_j : f32 to vector<1xf32> - %scale_old_v = math.exp %diff_old_v : vector<1xf32> - %scale_j_v = math.exp %diff_j_v : vector<1xf32> - %scale_old = vector.extract %scale_old_v[0] : f32 from vector<1xf32> - %scale_j = vector.extract %scale_j_v[0] : f32 from vector<1xf32> - %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32> - %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32> - - %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ 
tile_e }}xf32> - %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32> - %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32> - %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32> - affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - - %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32> - %l_old_rs = arith.mulf %l_old, %scale_old : f32 - %l_j_rs = arith.mulf %l_j, %scale_j : f32 - %l_new = arith.addf %l_old_rs, %l_j_rs : f32 - %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32> - affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - } { accumulation_loop=true } - - // finalize: out = o / l - %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32> - %l = vector.extract %sum2[0] : f32 from vector<2xf32> - %inv = arith.divf %c1, %l : f32 - %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32> - %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32> - %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32> - {% if io_stype != "f32" %}%out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %} - affine.vector_store {{ "%out_io" if io_stype != "f32" else "%out_f32" }}, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}> - %out_offset = affine.apply {{ out_offset_map }}(%gh) - {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=4, dram_stride=out_dram_stride, dram_offset="out_offset") }} - } { outer_loop=true } - return -} -""" - - -class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate): - def __init__(self, input_nodes, layout, BlkS: int = 1024, 
input_reorder=None): - super().__init__("kernel", input_nodes, layout, input_reorder) - self.BlkS = BlkS - - def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs): - partial = self.input_nodes[0] - # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out". - out = template_buffer_node if template_buffer_node is not None else self.output_node - - tile_e = kernel.vector_lane - tile_pack = tile_e * 2 - - # Infer sizes from partial layout: (HgDhTiles, nblk, tile_pack) - HgDhTiles, nblk, _ = partial.get_size() - io_stype = mlir_common.DTYPE_TO_MLIR[out.get_dtype()] - - vlane_stride = 1 - partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride) - partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1]) - partial_tile_desc.set_name("partial_buffer") - partial_tile_desc.offset = partial.get_layout().offset - - out_acc_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - out_acc_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - out_acc_tile_desc.set_name("out_acc_buffer") - - max_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride) - max_desc.set_tile_size_stride([1, 2], [2, 1]) - max_desc.set_name("max_buffer") - - sum_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride) - sum_desc.set_tile_size_stride([1, 2], [2, 1]) - sum_desc.set_name("sum_buffer") - - out_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride) - out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1]) - out_tile_desc.set_name("out_buffer") - - # Partial tensor strides - p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride) - p_stride = p_tensor.stride() - - # Out view: (Hq*dh_tiles, 1, tile_e) - out_tensor4 = 
empty_strided(out.get_layout().size, out.get_layout().stride) - B, Hq, Lq, Dh = out_tensor4.shape - assert B == 1 and Lq == 1 - dh_tiles = int(Dh) // int(tile_e) - out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e) - o_stride = out_tensor.stride() - - # DMA strides - partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1] - out_dram_stride = [int(o_stride[0]), 0, 0] - - # Affine offset maps - # partial: offset(gh, blk) - partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], partial_tile_desc.offset) - # out: offset(gh) -- single dimension - out_offset_map = _make_offset_map([int(o_stride[0])], 0) - - # Keep sympy-based indices for epilogue_info - gh = sympy.Symbol("gh") - blk = sympy.Symbol("blk") - partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)] - out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)] - - kernel.loop_size = [tile_pack, tile_e, 1] - - kernel.render_options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - HgDhTiles=HgDhTiles, - nblk=nblk, - tile_e=tile_e, - tile_pack=tile_pack, - io_stype=io_stype, - partial=partial, - out=out, - partial_tile_desc=partial_tile_desc, - out_acc_tile_desc=out_acc_tile_desc, - max_desc=max_desc, - sum_desc=sum_desc, - out_tile_desc=out_tile_desc, - # DMA strides - partial_dram_stride=partial_dram_stride, - out_dram_stride=out_dram_stride, - # Affine offset maps - partial_offset_map=partial_offset_map, - out_offset_map=out_offset_map, - input_reorder=self.input_reorder, - ) - - return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options) diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py index ed7ae8f8..c4825731 100644 --- a/tests/test_sdpa.py +++ b/tests/test_sdpa.py @@ -1,128 +1,145 @@ import sys -import math +import os import torch -import inspect -from typing import List +import torch._dynamo import torch.nn.functional as F -from torch.nn.attention import SDPBackend, sdpa_kernel -from 
torch.fx.passes.graph_drawer import FxGraphDrawer -from torch._inductor.decomposition import decompositions -def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): - message = f"|{name} Test Passed|" +base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") +sys.path.append(base_dir) + +device = torch.device("npu:0") + +# --------------------------------------------------------------------------- +# Default sweep configs - edit here to change what gets tested +# --------------------------------------------------------------------------- +SDPA_DEFAULTS = dict( + n_batch_list = [1, 4, 8, 16], + n_head_list = [4, 6, 8, 12], + n_token_list = [128, 256, 512, 1024], + head_dim_list = [32, 64, 128], + is_causal = False, +) + +GQA_DEFAULTS = dict( + batch_list = [1], + num_kv_heads = 1, + gqa_ratios = [4, 5, 8, 16], # Hq = ratio * num_kv_heads + seq_len_list = [128, 256, 1024], + head_dim_list = [64, 128], + query_len = 1, # decode shape: Lq == 1 + is_causal = True, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +def clear_caches(): + from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache + from torch._inductor.codecache import FxGraphCache + AOTAutogradCache.clear() + torch._dynamo.reset() + os.environ["TORCHINDUCTOR_CACHE"] = "0" + FxGraphCache.clear() + + +def assert_close(name, out, cpu_out, rtol=1e-4, atol=1e-4): + msg = f"|{name} Test Passed|" if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): - print("-" * len(message)) - print(message) - print("-" * len(message)) - pass + print("-" * len(msg)) + print(msg) + print("-" * len(msg)) else: - print("custom out: ", out.cpu()) - print("cpu out: ", cpu_out) + print(f"[FAIL] {name}") + print(" device out:", out.cpu()) + print(" cpu out:", cpu_out) exit(1) -def test_scaled_dot_product_attention(device, backends="flash"): + +def _run_sdpa(device, 
q, k, v, **kwargs): + """Compile and run SDPA on device; return result on device.""" + opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) + return opt_fn(q.to(device), k.to(device), v.to(device), **kwargs) + + +def _cpu_sdpa(q, k, v, **kwargs): + """Run reference SDPA on CPU.""" + return F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), **kwargs) + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- +def test_sdpa( + device, + n_batch_list = SDPA_DEFAULTS["n_batch_list"], + n_head_list = SDPA_DEFAULTS["n_head_list"], + n_token_list = SDPA_DEFAULTS["n_token_list"], + head_dim_list = SDPA_DEFAULTS["head_dim_list"], + is_causal = SDPA_DEFAULTS["is_causal"], +): torch.manual_seed(0) - n_batch_list = [1, 4, 8, 16] - n_head_list = [1, 4, 8, 12] - n_token_list = [128, 256, 512, 1024] - head_dim_list = [32, 64, 128] - - for n_batch in n_batch_list: - for n_head in n_head_list: - for n_token in n_token_list: - for head_dim in head_dim_list: - # Inputs + sdpa_kwargs = dict(attn_mask=None, dropout_p=0.0, is_causal=is_causal) + + for B in n_batch_list: + for H in n_head_list: + for S in n_token_list: + for D in head_dim_list: clear_caches() - query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) - key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) - value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32) - - # With NPU - query = query.to(device=device) - key = key.to(device=device) - value = value.to(device=device) - - opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) - out = opt_fn(query, key, value) - out = out.to(device) - - # With CPU - cpu_device = torch.device('cpu') - query = query.to(device=cpu_device) - key = key.to(device=cpu_device) - value = value.to(device=cpu_device) - cpu_out = F.scaled_dot_product_attention(query, key, value, 
attn_mask=None, dropout_p=0.0, is_causal=False) - - name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})" - test_result(name, out, cpu_out) - - print("All tests passed!") - -def test_scaled_dot_product_attention_gqa_single_batch(device): + q = torch.rand(B, H, S, D, dtype=torch.float32) + k = torch.rand(B, H, S, D, dtype=torch.float32) + v = torch.rand(B, H, S, D, dtype=torch.float32) + + out = _run_sdpa(device, q, k, v, **sdpa_kwargs) + cpu_out = _cpu_sdpa(q, k, v, **sdpa_kwargs) + + assert_close(f"SDPA(B:{B}, H:{H}, S:{S}, D:{D})", out, cpu_out) + + print("All SDPA tests passed!") + + +def test_gqa( + device, + batch_list = GQA_DEFAULTS["batch_list"], + num_kv_heads = GQA_DEFAULTS["num_kv_heads"], + gqa_ratios = GQA_DEFAULTS["gqa_ratios"], + seq_len_list = GQA_DEFAULTS["seq_len_list"], + head_dim_list= GQA_DEFAULTS["head_dim_list"], + query_len = GQA_DEFAULTS["query_len"], + is_causal = GQA_DEFAULTS["is_causal"], +): """ - Focused GQA testcases for single-batch (n==1). - Shapes: - q: (B, Hq, Lq, Dh) - k: (B, H, S, Dh) - v: (B, H, S, Dh) + GQA sweep: q shape (B, Hq, Lq, D), kv shape (B, H, S, D). + Hq = ratio * num_kv_heads for each ratio in gqa_ratios. """ torch.manual_seed(0) + sdpa_kwargs = dict(attn_mask=None, dropout_p=0.0, is_causal=is_causal, enable_gqa=True) - B = 1 - # Decode-focused: include a larger S to hit BlkS logic - seq_len_list = [128, 256, 1024] - head_dim_list = [64, 128] - # GQA ratios requested: Hq / H in {4, 5, 8, 16}. - # Keep H=1 to directly realize those ratios. 
- gqa_ratios = [4, 5, 8, 16] - H = 1 - - for seq_len in seq_len_list: - for head_dim in head_dim_list: - for ratio in gqa_ratios: - Hq = ratio * H - - clear_caches() - # Decode shape: Lq == 1 - q = torch.rand(B, Hq, 1, head_dim, dtype=torch.float32) - k = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32) - v = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32) - - # NPU - q_npu = q.to(device=device) - k_npu = k.to(device=device) - v_npu = v.to(device=device) - opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention) - out = opt_fn(q_npu, k_npu, v_npu, attn_mask=None, dropout_p=0.0, is_causal=True, enable_gqa=True) - - # CPU reference - cpu_device = torch.device("cpu") - cpu_out = F.scaled_dot_product_attention( - q.to(device=cpu_device), - k.to(device=cpu_device), - v.to(device=cpu_device), - attn_mask=None, - dropout_p=0.0, - is_causal=True, - enable_gqa=True, - ) - - name = f"SDPA-GQA(B: {B}, Hq: {Hq}, H: {H}, S: {seq_len}, head_dim: {head_dim})" - test_result(name, out, cpu_out) - - print("All GQA single-batch tests passed!") + for B in batch_list: + for S in seq_len_list: + for D in head_dim_list: + for ratio in gqa_ratios: + Hq = ratio * num_kv_heads + clear_caches() + q = torch.rand(B, Hq, query_len, D, dtype=torch.float32) + k = torch.rand(B, num_kv_heads, S, D, dtype=torch.float32) + v = torch.rand(B, num_kv_heads, S, D, dtype=torch.float32) -def clear_caches(): - import os - from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache - from torch._inductor.codecache import FxGraphCache - AOTAutogradCache.clear() - torch._dynamo.reset() - os.environ["TORCHINDUCTOR_CACHE"] = "0" - FxGraphCache.clear() + out = _run_sdpa(device, q, k, v, **sdpa_kwargs) + cpu_out = _cpu_sdpa(q, k, v, **sdpa_kwargs) + + assert_close( + f"GQA(B:{B}, Hq:{Hq}, H:{num_kv_heads}, S:{S}, D:{D})", + out, cpu_out, + ) + + print("All GQA tests passed!") + + +if __name__ == "__main__": + with 
torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.FLASH_ATTENTION]): + test_sdpa(device) + #test_gqa(device) -if __name__ == "__main__": - device = torch.device('npu:0') - # test_scaled_dot_product_attention(device, backends="flash") - test_scaled_dot_product_attention_gqa_single_batch(device) - \ No newline at end of file + # Example: quick single-config run + # test_gqa(device, batch_list=[1], gqa_ratios=[5], seq_len_list=[32], head_dim_list=[128]) From dd71c70766a06149a975615585f51536d1ea2904 Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Tue, 17 Mar 2026 02:31:49 +0000 Subject: [PATCH 136/194] [Frontend] Handle RecompileSignal in MLIRKernel code generation --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 38125e31..672c35f7 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -964,7 +964,10 @@ def make_choices(self, nodes, kernel_name): # Try initial tile size self.reset(None) - src_code, meta_code = super().codegen_nodes(nodes, kernel_name) + try: + src_code, meta_code = super().codegen_nodes(nodes, kernel_name) + except mlir_common.RecompileSignal: + continue current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) search_space.add(current_tile_sz) @@ -986,14 +989,12 @@ def make_choices(self, nodes, kernel_name): # Try increase tile size for this axis try: self.kernel_group.tile_desc.scale_tile_dim(axis, prev_ranges[axis], 2) - except extension_codecache.TileSizeError as e: - # Failed to find proper tile size + self.reset(None) + src_code, meta_code = super().codegen_nodes(nodes, kernel_name) + except (extension_codecache.TileSizeError, mlir_common.RecompileSignal): candidate_axes.remove(axis) self.reset(None) continue - - self.reset(None) - src_code, meta_code = 
super().codegen_nodes(nodes, kernel_name) current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size()) # FIXME. How to intergrate this constraint to tile system? From c5f085ece4e9523ca1e97ee165c6cb976df5427c Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Tue, 17 Mar 2026 02:39:03 +0000 Subject: [PATCH 137/194] [Frontend] Enhance vector size handling for low-precision paths in MLIR kernels --- PyTorchSimFrontend/mlir/mlir_common.py | 73 +++++++++++++++++++++--- PyTorchSimFrontend/mlir/mlir_template.py | 4 +- 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 9f5dc6ab..32805261 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -103,14 +103,17 @@ def get_dtype_nbytes(dtype): MLIR_INF = { "inf" : { + "f16" : 0x7C00, "f32" : 0x7F800000, "f64" : 0x7FF0000000000000 }, "-inf" : { + "f16" : 0xFC00, "f32" : 0xFF800000, "f64" : 0xFFF0000000000000 }, "nan" : { + "f16" : 0x7C00, "f32" : 0x7FC00000, "f64" : 0x7FF8000000000000 } @@ -260,17 +263,23 @@ def get_tile_stride_per_lane(self, tile_size: list[int], tile_stride: list[int]) return tile_stride def get_compute_vec_size(self, tile_size: list[int], reduction_numel: int, nr_rdim: int) -> int: - if self.forced_vec_size is not None: - return self.forced_vec_size - per_lane = self.get_numel_per_lane(tile_size) stride = self.vlane_stride if nr_rdim: val = per_lane // max(reduction_numel, 1) + result = val for mult in [8, 4, 2]: if per_lane >= val * mult: - return val * mult - return val + result = val * mult + break + if self.forced_vec_size is not None: + # Cap while keeping result divisible by val (= reduction_size). + # This preserves the assert(vec_len % reduction_size == 0) invariant. 
+ capped = (min(result, self.forced_vec_size) // max(val, 1)) * max(val, 1) + result = max(capped, val) + return result + if self.forced_vec_size is not None: + return self.forced_vec_size for mult in [8, 4, 2]: if (per_lane // stride) >= mult: return stride * mult @@ -787,10 +796,24 @@ def codegen_nodes(self, nodes, kernel_name): # Set node range info vars, reduction_vars = self.set_ranges(group, reduction_group) tile_desc = self.compute_tile_size(nodes, vars, reduction_vars) + _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs() + safe_vec_size = self.get_safe_vec_size(tile_desc.get_compute_vec_size()) + # For pointwise (non-reduction) kernels, cap the MLIR vector size so that + # f16->f32 widening stays within LMUL<=4 (step and forced_vec_size must match). + # Reduction kernels are left unchanged: their accumulator/multi_reduction + # structure assumes compute_vec_size == step, so we must not split them here. + tile_desc.vmap.forced_vec_size = safe_vec_size + compute_vec = tile_desc.get_compute_vec_size() + # RVV requires vector lengths that produce integer power-of-2 LMUL values. + # Non-power-of-2 element counts (e.g. 24) cause LLVM WidenVectorResult crashes. + # Raise BEFORE the try/except so this propagates to make_choices (not retried). 
+ if compute_vec > 1 and (compute_vec & (compute_vec - 1)) != 0: + raise RecompileSignal( + f"Non-power-of-2 compute_vec_size {compute_vec}: tile rejected (RVV requires power-of-2 LMUL)" + ) self.compute_body_loop.size = tile_desc.get_numel_per_lane() - self.compute_body_loop.step = tile_desc.get_compute_vec_size() + self.compute_body_loop.step = compute_vec try: - _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs() with self as kernel: for node in nodes: node.run(vars, reduction_vars) @@ -1035,6 +1058,42 @@ def __exit__(self, exc_type, exc_val, exc_tb): self._nested_context_depth -= 1 if self._nested_context_depth == 0: super().__exit__(exc_type, exc_val, exc_tb) + + def get_safe_vec_size(self, default_vec_size: int = 64) -> int: + """ + Cap forced vector size for low-precision paths so widening ops + (e.g., f16/bf16 -> f32) do not exceed RVV LMUL limits. + + Widening is legal up to source LMUL<=4 (destination LMUL<=8). + Using RVV relation LMUL = (SEW * VL) / VLEN, the safe source VL is: + VL <= 4 * VLEN / SEW + """ + + if not hasattr(self, "buffer_types") or not self.buffer_types: + return default_vec_size + + lowp_bits = [] + for info in self.buffer_types.values(): + dtype = info[0] if info else None + if dtype in DTYPE_LOWP_FP: + mlir_dtype = DTYPE_TO_MLIR[dtype] + lowp_bits.append(MLIR_TO_BIT[mlir_dtype]) + + if not lowp_bits: + return default_vec_size + + min_lowp_bits = min(lowp_bits) + # Constraint: Vector element count must be compatible across all types. + # VLEN=256: f16 (LMUL=2) and f32 (LMUL=4) both yield 32 elements. + # Note: Gem5 version restricts widening ops to LMUL < 8 for destination registers. + # Max LMUL set to 2 to ensure compatibility/safety. 
+ + widen_safe_cap = self.vlen * 2 // min_lowp_bits + if widen_safe_cap <= 0: + return default_vec_size + + vec_size = min(default_vec_size, widen_safe_cap) + return vec_size @dataclasses.dataclass class LoopLevel: diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 53db988b..851f070f 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -1255,7 +1255,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): numel_per_lane = tile_desc.get_numel_per_lane() r_tile_size = tile_desc.get_tile_size()[-1] nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size - tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality... + tile_desc.vmap.forced_vec_size = self.get_safe_vec_size(nr_outer_loop * 32) # Why? Emprically selected, other option failed to functionality... self.reduction_fusion = True self.r_tile_size = tile_desc.get_tile_size()[-1] @@ -1266,7 +1266,7 @@ def set_tile_size(self, template_fusion_info, prologue=False): self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop) else: - tile_desc.vmap.forced_vec_size = 64 + tile_desc.vmap.forced_vec_size = self.get_safe_vec_size(64) if prologue: self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane() From fdd5b5459c41892b4d1a738b5baa3e21cd945b31 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 19 Mar 2026 02:01:08 +0900 Subject: [PATCH 138/194] [Refactor] move to TOGSimulator-based scheduler API --- experiments/BERT.py | 77 ++++++++++++++++------------------------ experiments/attention.py | 70 +++++++++++++----------------------- experiments/conv.py | 76 +++++++++++++++------------------------ experiments/gemm.py | 61 +++++++++++-------------------- experiments/layernorm.py | 59 +++++++++++------------------- 
experiments/resnet18.py | 57 ++++++++++------------------- experiments/resnet50.py | 57 ++++++++++------------------- experiments/softmax.py | 58 +++++++++++------------------- tests/Fusion/__init__.py | 0 tests/__init__.py | 0 10 files changed, 182 insertions(+), 333 deletions(-) create mode 100644 tests/Fusion/__init__.py create mode 100644 tests/__init__.py diff --git a/experiments/BERT.py b/experiments/BERT.py index fd671833..b938f4e6 100644 --- a/experiments/BERT.py +++ b/experiments/BERT.py @@ -1,57 +1,42 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime -def run_BERT(size, input_seq, config): - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - # from tests.test_transformer import EncoderBlock - from tests.Fusion.test_transformer_fusion import EncoderBlock - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048} - embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048} - heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942 - cpu_query = torch.randn(input_seq, hidden_dim[size]) - encoder_block = EncoderBlock(embedding_size[size], heads[size]).eval() - - query = cpu_query.clone().to(device=device) - opt_fn = torch.compile(dynamic=False)(encoder_block.to(device=device)) +import torch +from Simulator.simulator import TOGSimulator - SchedulerDNNModel.register_model(f"BERT-{size}", opt_fn) - request = Request(f"BERT-{size}", [query], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') +os.environ['TOGSIM_CONFIG'] = 
config - # Run scheduler - while not scheduler.is_finished(): - with torch.no_grad(): - scheduler.schedule() +# Try Fusion EncoderBlock first, fall back to standard test_transformer +try: + from tests.Fusion.test_transformer_fusion import EncoderBlock +except ImportError: + from tests.test_transformer import EncoderBlock - print(f"BERT-{size} Simulation Done") +HIDDEN_DIM = {'base': 768, 'large': 1024, 'xlarge': 2048} +EMBEDDING_SIZE = {'base': 768, 'large': 1024, 'xlarge': 2048} +HEADS = {'base': 12, 'large': 16, 'xlarge': 32} if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name - sys.path.append(base_dir) args = argparse.ArgumentParser() - args.add_argument('--size', type=str, default='base') - args.add_argument('--dump_path', type=str, default='results') + args.add_argument('--size', type=str, default='base', choices=['base', 'large', 'xlarge']) args.add_argument('--input_size', type=int, default=512) args = args.parse_args() - size = args.size - input_seq = args.input_size - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - - run_BERT(size, input_seq, config) + + hidden_dim = HIDDEN_DIM[args.size] + embedding_size = EMBEDDING_SIZE[args.size] + heads = HEADS[args.size] + + device = torch.device("npu:0") + model = EncoderBlock(embedding_size, heads).eval().to(device=device) + model_input = 
torch.randn(args.input_size, hidden_dim).to(device=device) + opt_fn = torch.compile(dynamic=False)(model) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"BERT-{args.size} Simulation Done") diff --git a/experiments/attention.py b/experiments/attention.py index 211433f1..b56ed537 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -1,56 +1,36 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys +import math import argparse -import datetime +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) -def run_attention(size, config): - def attention(query, key, value): - import math - d_k = query.size(-1) - scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k) - p_attn = scores.softmax(dim=-2) - return torch.matmul(value.transpose(-1, -2), p_attn) - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - query = torch.randn(size).to(device=device) - key = torch.randn(size).to(device=device) - value = torch.randn(size).to(device=device) - opt_fn = torch.compile(dynamic=False)(attention) - - SchedulerDNNModel.register_model("attention", opt_fn) - request = Request("attention", [query, key, value], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) +import torch +from Simulator.simulator import TOGSimulator - # Run scheduler - while not scheduler.is_finished(): - with torch.no_grad(): - scheduler.schedule() +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') +os.environ['TOGSIM_CONFIG'] = config - print(f"Attention {str(size)} Simulation Done") +def attention(query, key, value): + d_k 
= query.size(-1) + scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k) + p_attn = scores.softmax(dim=-2) + return torch.matmul(value.transpose(-1, -2), p_attn) if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--size', nargs='+', type=int, default=[12, 512, 64], help='Tensor Shape') - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - size = args.size - size_str = "x".join([str(i) for i in size]) - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"attention_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] + size = tuple(args.size) + + device = torch.device("npu:0") + query = torch.randn(*size).to(device=device) + key = torch.randn(*size).to(device=device) + value = torch.randn(*size).to(device=device) + opt_fn = torch.compile(dynamic=False)(attention) - run_attention(size, config) + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, query, key, value, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"Attention {size} Simulation Done") diff --git a/experiments/conv.py b/experiments/conv.py index 61f7ad80..98391fae 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -1,57 +1,39 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime +base_path 
= os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) + +import torch +from Simulator.simulator import TOGSimulator + +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') +os.environ['TOGSIM_CONFIG'] = config -def run_conv2d(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding, config): - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - def custom_conv2d(a, b, bias): - i_c = a.shape[1] - o_c = b.shape[0] - conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False) +def conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding): + def _conv(a, b, bias): + conv2d = torch.nn.Conv2d(i_c, o_c, kernel_size, stride=stride, padding=padding, dilation=1, bias=False) conv2d.weight = torch.nn.Parameter(b) - # conv2d.bias = torch.nn.Parameter(bias) return conv2d(a) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() + return _conv + +if __name__ == "__main__": + args = argparse.ArgumentParser() + args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], + help='B H W I_C O_C K S P') + args = args.parse_args() + batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding = args.size + + device = torch.device("npu:0") conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device) conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device) conv_bias = torch.randn(o_c).to(device=device) - opt_fn = torch.compile(dynamic=False)(custom_conv2d) - - SchedulerDNNModel.register_model("CONV", opt_fn) - request = Request("CONV", [conv_input, conv_kernel, conv_bias], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) - - # Run scheduler - while 
not scheduler.is_finished(): - with torch.no_grad(): - scheduler.schedule() - print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} (B_H_W_I_C_O_C_K_S_P) Simulation Done") + custom_conv = conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding) + opt_fn = torch.compile(dynamic=False)(custom_conv) -if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) - args = argparse.ArgumentParser() - args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P') - args.add_argument('--dump_path', type=str, default='results') - args = args.parse_args() - size = args.size - size_str = "_".join([str(i) for i in size]) - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - - run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config) \ No newline at end of file + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, conv_input, conv_kernel, conv_bias, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} Simulation Done") diff --git a/experiments/gemm.py b/experiments/gemm.py index 0e1a15e4..d256e931 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -1,51 +1,32 @@ -import torch -import 
torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime - -def run_matmul(input_size, hidden_size, output_size, config): - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - def custom_matmul(a, b): - return torch.matmul(a, b) - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - torch.manual_seed(0) - input = torch.randn(input_size, hidden_size).to(device=device) - weight = torch.randn(hidden_size, output_size).to(device=device) - opt_fn = torch.compile(dynamic=False)(custom_matmul) +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - SchedulerDNNModel.register_model("GEMM", opt_fn) - request = Request("GEMM", [input, weight], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) +import torch +from Simulator.simulator import TOGSimulator - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') +os.environ['TOGSIM_CONFIG'] = config - print(f"GEMM {input_size}x{hidden_size}x{output_size} (MxKxN) Simulation Done") +def matmul_fn(a, b): + return torch.matmul(a, b) if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N') - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - size = args.size - size_str = 
"x".join([str(i) for i in size]) - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] + M, K, N = args.size[0], args.size[1], args.size[2] - run_matmul(size[0], size[1], size[2], config) + device = torch.device("npu:0") + torch.manual_seed(0) + input_a = torch.randn(M, K).to(device=device) + input_b = torch.randn(K, N).to(device=device) + opt_fn = torch.compile(dynamic=False)(matmul_fn) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, input_a, input_b, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"GEMM {M}x{K}x{N} (MxKxN) Simulation Done") diff --git a/experiments/layernorm.py b/experiments/layernorm.py index a6b16986..a9170c6b 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -1,48 +1,29 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime - -def run_layernorm(size, config): - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - input = torch.randn(size).to(device=device) - opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device)) +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - SchedulerDNNModel.register_model("LayerNorm", opt_fn) - request = Request("LayerNorm", [input], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) - - # Run scheduler - while not scheduler.is_finished(): - 
scheduler.schedule() +import torch +from Simulator.simulator import TOGSimulator - print(f"LayerNorm {str(size)} Simulation Done") +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') +os.environ['TOGSIM_CONFIG'] = config if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--size', nargs='+', type=int, default=[512, 768], help='Tensor Shape') - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - size = args.size - size_str = "x".join([str(i) for i in size]) - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0" - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - - run_layernorm(size, config) + size = tuple(args.size) + normalized_shape = size[-1] + + device = torch.device("npu:0") + model = torch.nn.LayerNorm(normalized_shape).to(device=device) + opt_fn = torch.compile(dynamic=False)(model) + model_input = torch.randn(*size).to(device=device) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"LayerNorm {size} Simulation Done") diff --git a/experiments/resnet18.py b/experiments/resnet18.py index c7763d86..38fb80fe 100644 --- a/experiments/resnet18.py +++ 
b/experiments/resnet18.py @@ -1,49 +1,28 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime -def run_resnet(batch, config): - from torchvision.models import resnet18 - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - model = resnet18().eval() - input = torch.randn(batch, 3, 224, 224).to(device=device) - opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last)) +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - SchedulerDNNModel.register_model("resnet18", opt_fn) - request = Request("resnet18", [input], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) +import torch +from torchvision.models import resnet18 +from Simulator.simulator import TOGSimulator - # Run scheduler - while not scheduler.is_finished(): - with torch.no_grad(): - scheduler.schedule() - - print("ResNet18 Simulation Done") +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') +os.environ['TOGSIM_CONFIG'] = config if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--batch', type=int, default=1) - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - batch = args.batch - result_path = os.path.join(base_dir, args.dump_path, config_prefix, 
f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - run_resnet(batch, config) + device = torch.device("npu:0") + model = resnet18().eval().to(device=device, memory_format=torch.channels_last) + opt_fn = torch.compile(dynamic=False)(model) + model_input = torch.randn(args.batch, 3, 224, 224).to(device=device) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) + torch.npu.synchronize() + print("ResNet18 Simulation Done") diff --git a/experiments/resnet50.py b/experiments/resnet50.py index 4e611541..5b134c13 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -1,49 +1,28 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys import argparse -import datetime -def run_resnet(batch, config): - from torchvision.models import resnet50 - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - model = resnet50().eval() - input = torch.randn(batch, 3, 224, 224).to(device=device) - opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last)) +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - SchedulerDNNModel.register_model("resnet50", opt_fn) - request = Request("resnet50", [input], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) +import torch +from torchvision.models import resnet50 +from Simulator.simulator import TOGSimulator - # Run 
scheduler - while not scheduler.is_finished(): - with torch.no_grad(): - scheduler.schedule() - - print("ResNet50 Simulation Done") +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') +os.environ['TOGSIM_CONFIG'] = config if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--batch', type=int, default=1) - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - batch = args.batch - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1" - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] = "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - run_resnet(batch, config) + device = torch.device("npu:0") + model = resnet50().eval().to(device=device, memory_format=torch.channels_last) + opt_fn = torch.compile(dynamic=False)(model) + model_input = torch.randn(args.batch, 3, 224, 224).to(device=device) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) + torch.npu.synchronize() + print("ResNet50 Simulation Done") diff --git a/experiments/softmax.py b/experiments/softmax.py index d30559f7..b86febe0 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -1,47 +1,29 @@ -import torch -import torch._dynamo -import torch.utils.cpp_extension - +import os +import sys 
import argparse -import datetime - -def run_softmax(size, config, dim=1): - from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request - scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) - device = scheduler.execution_engine.module.custom_device() - input = torch.randn(size).to(device=device) - opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device)) +base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +sys.path.insert(0, base_path) - SchedulerDNNModel.register_model("Softmax", opt_fn) - request = Request("Softmax", [input], [], request_queue_idx=0) - scheduler.add_request(request, request_time=0) - - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() +import torch +from Simulator.simulator import TOGSimulator - print(f"Softmax {str(size)} Simulation Done") +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') +os.environ['TOGSIM_CONFIG'] = config if __name__ == "__main__": - import os - import sys - base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml') - config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path - sys.path.append(base_dir) args = argparse.ArgumentParser() args.add_argument('--size', nargs='+', type=int, default=[512, 512], help='Tensor Shape') - args.add_argument('--dump_path', type=str, default='results') args = args.parse_args() - size = args.size - size_str = "x".join([str(i) for i in size]) - result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"Softmax_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}") - # setting environment variables - os.environ['TORCHSIM_LOG_PATH'] = result_path - # only timing simulation - os.environ['TORCHSIM_VALIDATION_MODE'] 
= "0" - if 'pytorchsim_functional_mode' in os.environ: - del os.environ['pytorchsim_functional_mode'] - - run_softmax(size, config) + size = tuple(args.size) + dim = 1 + + device = torch.device("npu:0") + model = torch.nn.Softmax(dim=dim).to(device=device) + opt_fn = torch.compile(dynamic=False)(model) + model_input = torch.randn(*size).to(device=device) + + with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) + torch.npu.synchronize() + print(f"Softmax {size} Simulation Done") diff --git a/tests/Fusion/__init__.py b/tests/Fusion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b From 3847f9b28053ef0c02b65f0b92d5babd8f01211d Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 19 Mar 2026 14:28:36 +0900 Subject: [PATCH 139/194] [CI] Add missing package + Add test cases --- .github/workflows/docker-image-2-8.yml | 10 +++++++++- Dockerfile.base | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml index f1e915d6..52464dff 100644 --- a/.github/workflows/docker-image-2-8.yml +++ b/.github/workflows/docker-image-2-8.yml @@ -52,10 +52,18 @@ jobs: echo "Image did not become available in GHCR within expected time." 
exit 1 - test-pytorchsim-wrapper: + test-pytorchsim-wrapper1: needs: build-and-test uses: ./.github/workflows/pytorchsim_test.yml with: image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} vector_lane: 128 spad_size: 128 + + test-pytorchsim-wrapper2: + needs: build-and-test + uses: ./.github/workflows/pytorchsim_test.yml + with: + image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} + vector_lane: 32 + spad_size: 32 diff --git a/Dockerfile.base b/Dockerfile.base index e8504bcf..05444d41 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -34,7 +34,7 @@ RUN apt -y update && \ python3-dev python-is-python3 libboost-all-dev \ libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \ - pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/* + pip install onnx matplotlib scikit-learn pydot tabulate flash_attn && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/* # Download RISC-V tool chain RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ From 1d7a3a919d7310a03427b2e1b38dcb9067e4f317 Mon Sep 17 00:00:00 2001 From: student-Jungmin Date: Sun, 22 Mar 2026 15:02:24 +0000 Subject: [PATCH 140/194] [FIX] Fix zero systolic array utilization during SDPA execution in TOGSim --- PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py index 37db4956..a3ae6192 100644 --- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py +++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py @@ -238,7 +238,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %chunk_val = 
affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}> %local_max = arith.maximumf %chunk_val, %iter_max : vector<{{ chunk_size }}x{{ data_stype }}> affine.yield %local_max : vector<{{ chunk_size }}x{{ data_stype }}> - } + } { accumulation_loop=true } %max_cast = vector.shape_cast %chunk_max_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}> %max_reduced_1 = vector.multi_reduction , %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}> @@ -284,7 +284,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %chunk_exp = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}> %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}> affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}> - } + } { accumulation_loop=true } %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}> %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}> @@ -301,7 +301,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: { idx_map = array } ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }}) outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>) - } {inner_loop=true} + } { accumulation_loop=true } // out @ row_sum^(-1) %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> @@ -317,7 +317,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float: %out_dram_offset = affine.apply {{ out_offset_map }}(%index0, %index1, %index3) {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, 
indent_size=8, dram_stride=out_dram_stride, dram_offset="out_dram_offset") }} - } { accumulation_loop=true } + } { outer_loop=true } } { outer_loop=true } } { outer_loop=true } return From 10f592388013bf6b4b0dde0970f1291fe89da569 Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Tue, 17 Mar 2026 04:18:42 +0000 Subject: [PATCH 141/194] [Frontend/Fix] Enforce vector length constraints and resolve ext() widening errors Updated the frontend to strictly validate vector element counts, preventing invalid LMUL=8 configurations in Gem5. Fixed a mismatch in the ext() operation's type-checking logic. --- PyTorchSimFrontend/mlir/mlir_common.py | 4 ++-- PyTorchSimFrontend/mlir/mlir_ops.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 32805261..23c02066 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -1086,9 +1086,9 @@ def get_safe_vec_size(self, default_vec_size: int = 64) -> int: # Constraint: Vector element count must be compatible across all types. # VLEN=256: f16 (LMUL=2) and f32 (LMUL=4) both yield 32 elements. # Note: Gem5 version restricts widening ops to LMUL < 8 for destination registers. - # Max LMUL set to 2 to ensure compatibility/safety. + # Max LMUL set to 1 to ensure compatibility/safety. 
- widen_safe_cap = self.vlen * 2 // min_lowp_bits + widen_safe_cap = self.vlen // min_lowp_bits if widen_safe_cap <= 0: return default_vec_size diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 76a0e273..218f60a9 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -1041,7 +1041,7 @@ def ext(operand, dtype, *args, **kwargs): op_type = V.kernel.var_info[operand] shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}" target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}" - if op_type[0] == "f": + if dtype[0] == "f": opcode = f'arith.extf' else: opcode = f'arith.extui' From a32f9e04d74bb9035b11facf8aae6c7661d41a6f Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Tue, 17 Mar 2026 04:20:31 +0000 Subject: [PATCH 142/194] [Frontend] Add optimized GQA decode implementation with tile-based softmax Note: Known compilation errors persist when using smaller tile sizes; investigation into the tile-stride logic is ongoing. --- tests/test_gqa_decode.py | 216 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tests/test_gqa_decode.py diff --git a/tests/test_gqa_decode.py b/tests/test_gqa_decode.py new file mode 100644 index 00000000..3605d638 --- /dev/null +++ b/tests/test_gqa_decode.py @@ -0,0 +1,216 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import os +import sys +import math +import argparse +from Simulator.simulator import TOGSimulator +from Scheduler.scheduler import PyTorchSimRunner +device = PyTorchSimRunner.setup_device().custom_device() +# ───────────────────────────────────────────────────────────────────────────── +# Optimized: Flash-Decode style — tile S upfront, batch in B dimension +# ───────────────────────────────────────────────────────────────────────────── + +class GQADecodeOptimized(nn.Module): + """Flash-Decode style GQA decode for multi-core NPU. 
+ + Splits the KV-cache sequence into n_tiles chunks and folds them into the + BMM batch dimension (B_total = H_kv × n_tiles). Both the QK and SV + matrix multiplications are issued as a *single* batched BMM with a short + inner-K loop, so the NPU scheduler can distribute all B_total tiles across + available cores simultaneously. + + Improvement over GQABaseline + ───────────────────────────── + Baseline QK : B=H_kv=1, M=G, N=S(large), K=D → 640 N-tile iters on 1 batch + Optimized QK: B=H_kv*n_tiles, M=G, N=T(small), K=D → n_tiles batch slots for cores + + Baseline SV : B=H_kv=1, M=G, N=D, K=S → K-loop=640, only 8 outer tiles + Optimized SV: B=H_kv*n_tiles, M=G, N=D, K=T → K-loop=T/TILE_K, n_tiles outer tiles + + Memory layout improvements + ────────────────────────── + • K/V tiles are generated with a single contiguous view+reshape (no mid-loop transpose). + • Avoids materializing the full score tensor [H_kv, G, S] in DRAM before tiling. + • Softmax intermediates are kept in smaller [B_total, G, T] buffers. + + Input conventions + ───────────────── + q : [H_kv, G, D] – one decode-step query token per KV head + k : [H_kv, S, D] – KV-cache keys (NOT pre-transposed) + v : [H_kv, S, D] – KV-cache values + + tile_size selection + ─────────────────── + Ideal: tile_size = round_up(S * H_kv / num_cores, vpu_num_lanes) + so that B_total ≈ num_cores. Must also satisfy the SPAD budget: + (G*T + T*D + G*D) * bytes ≤ spad_per_core (for sub-tile occupancy) + Default 512 works for (G=5, D=128, fp16, 16-lane × 8 KB/lane SPAD). + """ + + def __init__(self, tile_size: int = 512): + super().__init__() + self.tile_size = tile_size + + def forward( + self, + q: torch.Tensor, # [H_kv, G, D] + k: torch.Tensor, # [H_kv, S, D] + v: torch.Tensor, # [H_kv, S, D] + scale: float, + ) -> torch.Tensor: + H_kv, G, D = q.shape + _, S, _ = k.shape + T = self.tile_size + n_tiles = (S + T - 1) // T + pad_len = n_tiles * T - S + B_total = H_kv * n_tiles + + # ── 1. 
Pad S → multiple of T ─────────────────────────────────────── + if pad_len > 0: + k = F.pad(k, (0, 0, 0, pad_len)) # [H_kv, S', D] + v = F.pad(v, (0, 0, 0, pad_len)) # [H_kv, S', D] + + # ── 2. Tile K, V → [B_total, T, D] (contiguous, no copy) ───────── + # k is [H_kv, S', D]; view splits S' → n_tiles×T along dim-1 + k_tiles = k.view(H_kv, n_tiles, T, D).reshape(B_total, T, D) + v_tiles = v.view(H_kv, n_tiles, T, D).reshape(B_total, T, D) + + # ── 3. Expand Q → [B_total, G, D] ───────────────────────────────── + # expand: zero-copy view; reshape: contiguous copy (small: B_total*G*D elems) + q_exp = q.unsqueeze(1).expand(H_kv, n_tiles, G, D).reshape(B_total, G, D) + + # ── 4. Batched QK BMM ────────────────────────────────────────────── + # [B_total, G, D] × [B_total, D, T] → [B_total, G, T] + # NPU mapping: B=B_total, M=G, N=T, K=D + # → outer tiles = B_total × M_tiles × N_tiles (all parallelizable) + # → inner K-loop = D/TILE_K (short, D=128) + k_t = k_tiles.transpose(1, 2) # [B_total, D, T] + scores = torch.bmm(q_exp, k_t) * scale # [B_total, G, T] + + # ── 5. Tile-local softmax (fp32 accumulation) ────────────────────── + # All ops are elementwise on [B_total, G, T] → torch.compile fuses them + scores_f32 = scores.float() + local_max = scores_f32.amax(dim=-1, keepdim=True) # [B_total, G, 1] + local_exp = (scores_f32 - local_max).exp() # [B_total, G, T] + local_sum = local_exp.sum(dim=-1, keepdim=True) # [B_total, G, 1] + + # ── 6. Batched SV BMM ────────────────────────────────────────────── + # [B_total, G, T] × [B_total, T, D] → [B_total, G, D] + # NPU mapping: B=B_total, M=G, N=D, K=T + # → outer tiles = B_total × M_tiles × N_tiles (parallelizable) + # → inner K-loop = T/TILE_K (controlled, T≪S) + sv = torch.bmm(local_exp.to(q.dtype), v_tiles) # [B_total, G, D] + + # ── 7. 
Online-softmax global reduction (elementwise, fused) ──────── + local_max = local_max.view(H_kv, n_tiles, G, 1) + local_sum = local_sum.view(H_kv, n_tiles, G, 1) + sv = sv.view(H_kv, n_tiles, G, D) + + global_max = local_max.amax(dim=1, keepdim=True) # [H_kv, 1, G, 1] + rescale = (local_max - global_max).exp() # [H_kv, n_tiles, G, 1] + corrected_sv = (sv * rescale).sum(dim=1) # [H_kv, G, D] + corrected_sum = (local_sum * rescale).sum(dim=1) # [H_kv, G, 1] + + return (corrected_sv / corrected_sum.clamp_min(1e-12)).to(q.dtype) + + +# ───────────────────────────────────────────────────────────────────────────── +# Test +# ───────────────────────────────────────────────────────────────────────────── + +MODEL_CONFIGS = { + "LLAMA4_TP8": { + "HEAD_DIM": 128, + "NUM_HEADS": 5, # = 40 total / TP8 + "NUM_KV_HEADS": 1, # = 8 total / TP8 + }, + "QWEN3-235B_TP4": { + "HEAD_DIM": 128, + "NUM_HEADS": 16, + "NUM_KV_HEADS": 1, + }, + "GPT-OSS_TP1": { + "HEAD_DIM": 64, + "NUM_HEADS": 64, + "NUM_KV_HEADS": 8, + }, + "GPT-OSS_TP2": { + "HEAD_DIM": 64, + "NUM_HEADS": 32, + "NUM_KV_HEADS": 4, + }, + "GPT-OSS_TP4": { + "HEAD_DIM": 64, + "NUM_HEADS": 16, + "NUM_KV_HEADS": 2, + }, + "GPT-OSS_TP8": { + "HEAD_DIM": 64, + "NUM_HEADS": 8, + "NUM_KV_HEADS": 1, + }, +} + + +def _make_inputs(cfg, seq_len, dtype): + H_kv = cfg["NUM_KV_HEADS"] + G = cfg["NUM_HEADS"] // cfg["NUM_KV_HEADS"] + D = cfg["HEAD_DIM"] + scale = 1.0 / math.sqrt(D) + + q = torch.randn(H_kv, G, D, dtype=dtype) + k = torch.randn(H_kv, seq_len, D, dtype=dtype) # NOT pre-transposed + v = torch.randn(H_kv, seq_len, D, dtype=dtype) + return q, k, v, scale + + +def test_gqa_decode_optimized(model, device, seq_len: int = 10240, tile_size: int = 512): + + cfg = MODEL_CONFIGS[model] if model is not None else MODEL_CONFIGS["LLAMA4_TP8"] + dtype = torch.float16 + + model = GQADecodeOptimized(tile_size=tile_size).eval() + + # ── NPU run ──────────────────────────────────────────────────────────── + q, k, v, scale = _make_inputs(cfg, 
seq_len, dtype) + model_dev = model.to(device) + compiled = torch.compile(model_dev, dynamic=False) + + q_dev, k_dev, v_dev = q.to(device), k.to(device), v.to(device) + with torch.no_grad(): + with TOGSimulator(): + out_dev = compiled(q_dev, k_dev, v_dev, scale=scale) + + # ── CPU reference ────────────────────────────────────────────────────── + with torch.no_grad(): + out_cpu = model.cpu()(q, k, v, scale=scale) + + max_diff = (out_dev.cpu() - out_cpu).abs().max().item() + + with torch.no_grad():#CPU reference + out_library = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, enable_gqa=True) + + max_diff_library = (out_library.cpu() - out_cpu).abs().max().item() + + print(f"[GQADecodeOptimized] seq_len={seq_len}, tile_size={tile_size}") + print(f" max |npu - cpu| = {max_diff:.6f}") + print(f" npu out max = {out_dev.cpu().abs().max().item():.6f}") + print(f" cpu out max = {out_cpu.abs().max().item():.6f}") + print(f" library out max = {out_library.abs().max().item():.6f}") + print(" PASS" if max_diff < 0.05 else " FAIL (diff too large)") + + + + +if __name__ == "__main__": + argparser = argparse.ArgumentParser(description="Test GQA Attention Implementations") + argparser.add_argument("--model", type=str, default="LLAMA4_TP8", choices=MODEL_CONFIGS.keys(), help="Model configuration to test") + argparser.add_argument("--context_length", type=int, default=10240, help="Sequence length (context length) for the attention test") + argparser.add_argument("--tile_size", type=int, default=4096, help="Tile size for the optimized attention implementation") + args = argparser.parse_args() + model = args.model + base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") + sys.path.append(base_dir) + test_gqa_decode_optimized(model=model, device=device, seq_len=args.context_length, tile_size=args.tile_size) From 9e20d955720eecad26b649c747a9c43f0e2d4f53 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 16:41:18 
+0900 Subject: [PATCH 143/194] [PyTorchSim/Frontend] Use kernel specific filelock to avoid race --- PyTorchSimFrontend/extension_codecache.py | 18 +++++++++++------- .../mlir/mlir_codegen_backend.py | 8 ++++++-- PyTorchSimFrontend/mlir/mlir_template.py | 16 ++++++++++------ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index b1c457d3..6463dbac 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -4,7 +4,7 @@ import subprocess import torch -from torch._inductor.codecache import get_lock_dir, get_hash, write +from torch._inductor.codecache import get_hash, write from torch._inductor.async_compile import AsyncCompile from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen @@ -22,6 +22,11 @@ def hash_prefix(hash_value): def get_write_path(src_code): return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip()))) + +def get_lock_path(write_path): + """Return lock file path for the given write_path (per-source_code lock).""" + return os.path.join(write_path, ".compile.lock") + def dump_metadata(args, arg_attributes, path): meta_path = os.path.join(path, "meta.txt") if os.path.isfile(meta_path): @@ -161,8 +166,8 @@ def load(cls, source_code, gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size) from filelock import FileLock - lock_dir = get_lock_dir() - lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + os.makedirs(write_path, exist_ok=True) + lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT) if spad_info is not None: link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}" @@ -212,7 +217,7 @@ def load(cls, source_code, gem5_translate_cmd = shlex.split(gem5_cmds[1]) gem5_llc_cmd = 
shlex.split(gem5_cmds[2]) - lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT) with lock: try: result = subprocess.check_output(gem5_sample_cmd) @@ -278,11 +283,10 @@ def run_kernel_simulation(*args, **kwargs): # Wait for compilation key = future.result() from filelock import FileLock - lock_dir = get_lock_dir() - lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) + lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT) with lock: # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) # Dump arguments and meta data dump_metadata(args, arg_attributes, result_path) runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 672c35f7..17a60b44 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1091,6 +1091,8 @@ def codegen_nodes(self, nodes, kernel_name): return src_code, meta_code def _prepare_simulator_headers(self, src_code): + from filelock import FileLock + write_path = extension_codecache.get_write_path(src_code) os.makedirs(write_path, exist_ok=True) @@ -1101,8 +1103,10 @@ def _prepare_simulator_headers(self, src_code): spad_section_end_symbol = ( f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" ) - write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol) - write_atomic(gem5_write_path, self.gem5_header.getvalue()) + lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT) + with lock: + 
write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol) + write_atomic(gem5_write_path, self.gem5_header.getvalue()) def get_arg_info(self, name): arg_info = dict() diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 851f070f..b126d3af 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -613,18 +613,22 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, return src_code, meta_code def _prepare_simulator_headers(self, src_code): + from filelock import FileLock + spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n" spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));" write_path = extension_codecache.get_write_path(src_code) - if not os.path.exists(write_path): - os.makedirs(write_path, exist_ok=True) + os.makedirs(write_path, exist_ok=True) spike_write_path = os.path.join(write_path, "global_var.h") gem5_write_path = os.path.join(write_path, "gem5_global_var.h") - if not os.path.exists(spike_write_path): - write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) - if not os.path.exists(gem5_write_path): - write_atomic(gem5_write_path, self.gem5_header.getvalue()) + + lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT) + with lock: + if not os.path.exists(spike_write_path): + write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol) + if not os.path.exists(gem5_write_path): + write_atomic(gem5_write_path, self.gem5_header.getvalue()) def codegen_prologue_body(self): body = IndentedBuffer() From 070c43a6ae7b194a15119b3abcece4a4ee40a539 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 17:00:04 +0900 Subject: [PATCH 144/194] [Fix] replace 
outdated config name --- README.md | 2 +- experiments/artifact/cycle_validation/run_cycle.sh | 2 +- .../artifact/speedup/scripts/run_speed_ils_bert.sh | 2 +- .../artifact/speedup/scripts/run_speed_ils_conv.sh | 2 +- .../artifact/speedup/scripts/run_speed_ils_matmul.sh | 2 +- .../artifact/speedup/scripts/run_speed_ils_resnet.sh | 2 +- scripts/CompilerOpt_experiment/DMAopt.sh | 2 +- scripts/sparsity_experiment/run.sh | 12 ++++++------ tests/Yolov5/test_yolov5.py | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f55995c9..c6280498 100644 --- a/README.md +++ b/README.md @@ -414,7 +414,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing ``` You can set TOGSim config path as below. ```bash -export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +export TOGSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml ``` ## Future Works Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon. 
diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index 9cfd1e98..ebf0b11f 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 467949af..35d744bf 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -33,7 +33,7 @@ for i in "${config[@]}"; do output=$(bash -c " export TORCHSIM_TLS_MODE=0; export TORCHSIM_VALIDATION_MODE=0; - export TORCHSIM_CONFIG=$config_path; + export TOGSIM_CONFIG=$config_path; export AUTOTUNE=0; printenv; python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index fb681c74..f85b4c40 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -34,7 +34,7 @@ for i in "${config[@]}"; do output=$(bash -c " export TORCHSIM_TLS_MODE=0; export TORCHSIM_VALIDATION_MODE=0; - export TORCHSIM_CONFIG=$config_path; + export TOGSIM_CONFIG=$config_path; export AUTOTUNE=0; printenv; python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index dc0fdd20..b38848d0 100755 --- 
a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -32,7 +32,7 @@ for i in "${config[@]}"; do output=$(bash -c " export TORCHSIM_TLS_MODE=0; export TORCHSIM_VALIDATION_MODE=1; - export TORCHSIM_CONFIG=$config_path; + export TOGSIM_CONFIG=$config_path; export AUTOTUNE=0; printenv; python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index 2346ab3c..689e6913 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -40,7 +40,7 @@ for i in "${config[@]}"; do output=$(bash -c " export TORCHSIM_TLS_MODE=0; export TORCHSIM_VALIDATION_MODE=0; - export TORCHSIM_CONFIG=$config_path; + export TOGSIM_CONFIG=$config_path; export AUTOTUNE=0; printenv; python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh index 9e494d9b..9f3a9df2 100644 --- a/scripts/CompilerOpt_experiment/DMAopt.sh +++ b/scripts/CompilerOpt_experiment/DMAopt.sh @@ -1,5 +1,5 @@ #!/bin/bash -export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml" +export TOGSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml" # None FG DMA export TORCHSIM_SUBTILE=0 diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh index da9b73cc..7996b5ab 100755 --- a/scripts/sparsity_experiment/run.sh +++ b/scripts/sparsity_experiment/run.sh @@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8 export TORCHSIM_FORCE_TIME_N=8 OUTPUT_DIR="12GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml" +export 
TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml" +export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml" +export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="12GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml" +export 
TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="24GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml" +export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 @@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.6 > ${OUTPUT python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.8 > ${OUTPUT_DIR}/0.8 OUTPUT_DIR="48GB_2core" -export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml" +export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml" python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.0 > ${OUTPUT_DIR}/0.0 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.2 > ${OUTPUT_DIR}/0.2 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity 0.4 > ${OUTPUT_DIR}/0.4 diff --git a/tests/Yolov5/test_yolov5.py b/tests/Yolov5/test_yolov5.py index 1262dfb9..d98828bd 100644 --- a/tests/Yolov5/test_yolov5.py +++ b/tests/Yolov5/test_yolov5.py @@ -241,7 +241,7 @@ def concat_fn(x1, x2, x3): base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") config = os.environ.get( - 
"TORCHSIM_CONFIG", + "TOGSIM_CONFIG", default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml", ) args = argparse.ArgumentParser() From 9fc08116a6dac34a1b2ebd4346401fe3df5c8cdb Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 17:06:39 +0900 Subject: [PATCH 145/194] [Experiment] use timing mode for validation script --- ...28x128_c1_simple_noc_tpuv3_timing_only.yml | 30 +++++++++++++++++++ experiments/BERT.py | 4 +-- .../artifact/cycle_validation/run_cycle.sh | 22 +++++++------- experiments/attention.py | 2 +- experiments/conv.py | 2 +- experiments/gemm.py | 2 +- experiments/layernorm.py | 2 +- experiments/resnet18.py | 2 +- experiments/resnet50.py | 2 +- experiments/softmax.py | 2 +- 10 files changed, 50 insertions(+), 20 deletions(-) create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml new file mode 100644 index 00000000..f8ac0a54 --- /dev/null +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml @@ -0,0 +1,30 @@ +num_cores: 1 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 16 +dram_req_size_byte: 32 +dram_num_burst_length: 2 +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 0 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: autotune +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/experiments/BERT.py b/experiments/BERT.py index b938f4e6..12e3cb33 100644 --- 
a/experiments/BERT.py +++ b/experiments/BERT.py @@ -8,7 +8,7 @@ import torch from Simulator.simulator import TOGSimulator -config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml') +config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml') os.environ['TOGSIM_CONFIG'] = config # Try Fusion EncoderBlock first, fall back to standard test_transformer @@ -36,7 +36,7 @@ model_input = torch.randn(args.input_size, hidden_dim).to(device=device) opt_fn = torch.compile(dynamic=False)(model) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"BERT-{args.size} Simulation Done") diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index ebf0b11f..7406f356 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR @@ -33,16 +33,6 @@ for sz in \ python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log done -# Attention -for sz in "12 512 64" "16 512 64" "32 512 64"; do - name="attention_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running Attention size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log -done - # LayerNorm for sz in "512 768" "2048 768" "8192 768"; do name="layernorm_${sz// /x}" @@ -63,6 +53,16 @@ for sz in "512 512" "2048 
2048" "8192 8192"; do python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log done +# Attention +for sz in "12 512 64" "16 512 64" "32 512 64"; do + name="attention_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running Attention size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log +done + # ResNet for model in "resnet18" "resnet50"; do echo "" diff --git a/experiments/attention.py b/experiments/attention.py index b56ed537..db0f45bb 100644 --- a/experiments/attention.py +++ b/experiments/attention.py @@ -30,7 +30,7 @@ def attention(query, key, value): value = torch.randn(*size).to(device=device) opt_fn = torch.compile(dynamic=False)(attention) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, query, key, value, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"Attention {size} Simulation Done") diff --git a/experiments/conv.py b/experiments/conv.py index 98391fae..65e52635 100644 --- a/experiments/conv.py +++ b/experiments/conv.py @@ -33,7 +33,7 @@ def _conv(a, b, bias): custom_conv = conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding) opt_fn = torch.compile(dynamic=False)(custom_conv) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, conv_input, conv_kernel, conv_bias, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} Simulation Done") diff --git a/experiments/gemm.py b/experiments/gemm.py index d256e931..dbbba3ea 100644 --- a/experiments/gemm.py +++ b/experiments/gemm.py @@ -26,7 +26,7 @@ def matmul_fn(a, b): input_b = torch.randn(K, N).to(device=device) opt_fn = torch.compile(dynamic=False)(matmul_fn) - with 
TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, input_a, input_b, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"GEMM {M}x{K}x{N} (MxKxN) Simulation Done") diff --git a/experiments/layernorm.py b/experiments/layernorm.py index a9170c6b..375f98e9 100644 --- a/experiments/layernorm.py +++ b/experiments/layernorm.py @@ -23,7 +23,7 @@ opt_fn = torch.compile(dynamic=False)(model) model_input = torch.randn(*size).to(device=device) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"LayerNorm {size} Simulation Done") diff --git a/experiments/resnet18.py b/experiments/resnet18.py index 38fb80fe..ffec9a50 100644 --- a/experiments/resnet18.py +++ b/experiments/resnet18.py @@ -22,7 +22,7 @@ opt_fn = torch.compile(dynamic=False)(model) model_input = torch.randn(args.batch, 3, 224, 224).to(device=device) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) torch.npu.synchronize() print("ResNet18 Simulation Done") diff --git a/experiments/resnet50.py b/experiments/resnet50.py index 5b134c13..d886c159 100644 --- a/experiments/resnet50.py +++ b/experiments/resnet50.py @@ -22,7 +22,7 @@ opt_fn = torch.compile(dynamic=False)(model) model_input = torch.randn(args.batch, 3, 224, 224).to(device=device) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) torch.npu.synchronize() print("ResNet50 Simulation Done") diff --git a/experiments/softmax.py b/experiments/softmax.py index b86febe0..05024121 100644 --- a/experiments/softmax.py +++ b/experiments/softmax.py @@ -23,7 +23,7 @@ opt_fn = 
torch.compile(dynamic=False)(model) model_input = torch.randn(*size).to(device=device) - with TOGSimulator(config_path=config): + with TOGSimulator(config_path=config), torch.no_grad(): torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0) torch.npu.synchronize() print(f"Softmax {size} Simulation Done") From cf56c596b05457ecd1a3093574a87722e677862e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 20:36:06 +0900 Subject: [PATCH 146/194] [CI] Run validation script only for vector_lane==128 --- .github/workflows/pytorchsim_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 36a62b68..3a383137 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -750,6 +750,7 @@ jobs: test_accuracy: name: Run test_accuracy runs-on: self-hosted + if: inputs.vector_lane == 128 steps: - name: Log in to GitHub Container Registry uses: docker/login-action@v3 From 8d22583c1b15eb04c83875f39a0f7bdd140ec967 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 20:44:54 +0900 Subject: [PATCH 147/194] [TOGSim] Add error handling of idle stat couting --- TOGSim/src/Core.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc index 30858193..1f831661 100644 --- a/TOGSim/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -62,9 +62,9 @@ void Core::vu_cycle() { if (!_vu_compute_pipeline.empty()) { _stat_vu_compute_cycle++; if(_vu_compute_pipeline.front()->finish_cycle <= _core_cycle) { - int bubble = _vu_compute_pipeline.front()->bubble_cycle; + cycle_type bubble = _vu_compute_pipeline.front()->bubble_cycle; _stat_vu_compute_idle_cycle += bubble; - _stat_vu_compute_cycle -= bubble; + _stat_vu_compute_cycle = (bubble < _stat_vu_compute_cycle) ? 
(_stat_vu_compute_cycle - bubble) : 0; finish_instruction(_vu_compute_pipeline.front()); _vu_compute_pipeline.pop(); } else { @@ -83,9 +83,10 @@ void Core::sa_cycle() { while (retry) { if (!_sa_compute_pipeline.at(i).empty()) { if(_sa_compute_pipeline.at(i).front()->finish_cycle <= _core_cycle) { - int bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle; + cycle_type bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle; _stat_sa_compute_idle_cycle.at(i) += bubble; - _stat_sa_compute_cycle.at(i) -= bubble; + cycle_type& stat = _stat_sa_compute_cycle.at(i); + stat = (bubble < stat) ? (stat - bubble) : 0; finish_instruction(_sa_compute_pipeline.at(i).front()); _sa_compute_pipeline.at(i).pop(); } else { From 0b60ddde6369fa028037c50e281ac36a2ba5e6c4 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 23 Mar 2026 16:17:18 +0900 Subject: [PATCH 148/194] [TOGSim] Update DRAM Bw stat with exact number --- TOGSim/extern/ramulator2 | 2 +- TOGSim/src/Dram.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2 index 748cd709..49556128 160000 --- a/TOGSim/extern/ramulator2 +++ b/TOGSim/extern/ramulator2 @@ -1 +1 @@ -Subproject commit 748cd7099778d7196326aeb6384da92efb0c34c9 +Subproject commit 495561282d99f2ef2652618710e98c4a287025da diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc index 089c582e..656e57f8 100644 --- a/TOGSim/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -54,7 +54,8 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle) _mem.resize(_n_ch); for (int ch = 0; ch < _n_ch; ch++) { _mem[ch] = std::make_unique( - ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl); + ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl, + _req_size, config.dram_freq_mhz); } _tx_log2 = log2(_req_size); _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2; From 6bc1204b802b518afc8216318e94812529b549a7 Mon Sep 17 
00:00:00 2001 From: Wonhyuk Yang Date: Tue, 24 Mar 2026 15:22:14 +0900 Subject: [PATCH 149/194] [Experiment] Fix ils script to use updated config --- TOGSim/src/main.cc | 8 +++ ...lic_ws_128x128_c2_simple_noc_tpuv3_ils.yml | 33 ++++++++++ experiments/artifact/speedup/run_speedup.sh | 61 +++++++++---------- .../speedup/scripts/run_speed_ils_bert.sh | 13 +--- .../speedup/scripts/run_speed_ils_conv.sh | 13 +--- .../speedup/scripts/run_speed_ils_matmul.sh | 13 +--- .../speedup/scripts/run_speed_ils_resnet.sh | 13 +--- 7 files changed, 83 insertions(+), 71 deletions(-) create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index cda8f986..57e0e696 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -115,6 +115,14 @@ int main(int argc, char** argv) { // Check if help was requested cmd_parser.print_help_message_if_required(); + // Dump full command for copy-paste re-run + std::ostringstream cmd_oss; + for (int i = 0; i < argc; ++i) { + if (i > 0) cmd_oss << " "; + cmd_oss << argv[i]; + } + spdlog::info("[TOGSim] Run command: {}", cmd_oss.str()); + std::string level = "info"; cmd_parser.set_if_defined("log_level", &level); if (level == "trace") diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml new file mode 100644 index 00000000..ce2d932d --- /dev/null +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml @@ -0,0 +1,33 @@ +# ILS (Instruction-Level Simulation) 전용 config +# - pytorchsim_functional_mode: 0 (timing only, no validation) +# - codegen_mapping_strategy: heuristic (no autotune) +num_cores: 2 +core_freq_mhz: 940 +core_stats_print_period_cycles: 10000 +num_systolic_array_per_core: 2 + +vpu_num_lanes: 128 +vpu_spad_size_kb_per_lane: 128 +vpu_vector_length_bits: 256 + +dram_type: ramulator2 +dram_freq_mhz: 940 +dram_channels: 32 +dram_req_size_byte: 32 +dram_num_burst_length: 2 
+dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml + +icnt_type: simple +icnt_latency_cycles: 10 +icnt_freq_mhz: 940 +icnt_injection_ports_per_core: 16 + +pytorchsim_functional_mode: 0 +pytorchsim_timing_mode: 1 + +codegen_mapping_strategy: heuristic +codegen_external_mapping_file: '' +codegen_autotune_max_retry: 10 +codegen_autotune_template_topk: 4 +codegen_compiler_optimization: all diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh index e84ab1a9..cb5ee511 100755 --- a/experiments/artifact/speedup/run_speedup.sh +++ b/experiments/artifact/speedup/run_speedup.sh @@ -1,7 +1,11 @@ #!/bin/bash +set -e + LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs CONFIG_DIR="$TORCHSIM_DIR/configs" -SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator" +EXTRACT_TRACE="$TORCHSIM_DIR/experiments/artifact/speedup/scripts/extract_trace_from_log.py" +TRACE_CACHE_DIR="$TORCHSIM_DIR/experiments/artifact/speedup/trace_cache" +mkdir -p "$TRACE_CACHE_DIR" configs=( "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" @@ -25,9 +29,11 @@ output_dir="$TORCHSIM_DIR/experiments/artifact/speedup/results" mkdir -p "$output_dir" echo "[*] Scanning log files in: $LOG_DIR" +echo "[*] Extracting [TOGSim] Run command and trace from logs" echo "" for log_file in "$LOG_DIR"/*.log; do + [[ -f "$log_file" ]] || continue filename=$(basename "$log_file") workload="${filename%.log}" @@ -36,45 +42,38 @@ for log_file in "$LOG_DIR"/*.log; do fi echo "==> Workload: $workload" - declare -a ONNX_ATTR_PAIRS=() + # === Extract [TOGSim] Run command from log === + base_cmd=$(grep "\[TOGSim\] Run command:" "$log_file" 2>/dev/null | sed 's/.*\[TOGSim\] Run command: //' | head -1) + if [[ -z "$base_cmd" ]]; then + echo " Skipping: no [TOGSim] Run command found in $log_file" + continue + fi - # === Grep launch line === - while IFS= read -r line; do - if [[ "$line" == launch* ]]; then - read -r _ onnx_path attr_path _ 
<<< "$line" - ONNX_ATTR_PAIRS+=("$onnx_path|$attr_path") - fi - done < "$log_file" + # === Get trace file (replace FIFO in command; stored trace or generate from log) === + trace_file=$(python3 "$EXTRACT_TRACE" "$log_file" "$TRACE_CACHE_DIR/${workload}.trace" 2>/dev/null) || true + if [[ -z "$trace_file" || ! -f "$trace_file" ]]; then + echo " Skipping: could not extract trace from $log_file" + continue + fi # Normal configs for config in "${configs[@]}"; do - output_file="$output_dir/${workload}_${config}.txt" - echo "Running with config=$config" - echo "===== config=$config | model=$workload =====" >> "$output_file" + output_file="$output_dir/${workload}_${config}.txt" + echo "===== config=$config | model=$workload =====" > "$output_file" sum_all_iters=0.0 iter_count=0 - # === Run 5 iterations === for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=$workload config=$config" - cmd="" - for pair in "${ONNX_ATTR_PAIRS[@]}"; do - IFS="|" read -r onnx_path attr_path <<< "$pair" - cmd+=" $SIMULATOR_BIN --config $CONFIG_DIR/$config --models_list $onnx_path --attributes_list $attr_path;" - done - - output=$(bash -c "$cmd") - sim_times=$(echo "$output" | grep "Simulation time:" | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/') - - if [[ -n "$sim_times" ]]; then - sum_per_iter=0.0 - while IFS= read -r sim_time; do - echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file" - sum_per_iter=$(awk -v a="$sum_per_iter" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}') - done <<< "$sim_times" - - echo "Iteration $iter: total_simulation_time = $sum_per_iter" >> "$output_file" - sum_all_iters=$(awk -v a="$sum_all_iters" -v b="$sum_per_iter" 'BEGIN {printf "%.6f", a + b}') + # Build command: replace --config and --models_list in base_cmd with our config and trace + cmd=$(echo "$base_cmd" | sed -E "s|--config [^ ]+|--config $CONFIG_DIR/$config|" | sed -E "s|--models_list [^ ]+|--models_list $trace_file|") + echo "$cmd" + output=$(bash -c "$cmd" 
2>&1) || true + sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/') + + if [[ -n "$sim_time" ]]; then + echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file" + sum_all_iters=$(awk -v a="$sum_all_iters" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}') iter_count=$((iter_count + 1)) else echo "Iteration $iter: No simulation time found." >> "$output_file" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh index 35d744bf..642fec34 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh @@ -2,10 +2,7 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.yml" - "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" - #"systolic_ws_128x128_c2_booksim_tpuv3.yml" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SIZE_LIST=( @@ -31,15 +28,11 @@ for i in "${config[@]}"; do for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" output=$(bash -c " - export TORCHSIM_TLS_MODE=0; - export TORCHSIM_VALIDATION_MODE=0; export TOGSIM_CONFIG=$config_path; - export AUTOTUNE=0; - printenv; - python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh + cd $TORCHSIM_DIR && python3 $workload 2>&1 ") - sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/') + sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/') if [[ -n "$sim_time" ]]; then echo "Iteration $iter: Simulation time = $sim_time" diff --git 
a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh index f85b4c40..f5602668 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh @@ -2,10 +2,7 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.yml" - "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" - #"systolic_ws_128x128_c2_booksim_tpuv3.yml" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SHAPE_LIST=( @@ -32,15 +29,11 @@ for i in "${config[@]}"; do for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" output=$(bash -c " - export TORCHSIM_TLS_MODE=0; - export TORCHSIM_VALIDATION_MODE=0; export TOGSIM_CONFIG=$config_path; - export AUTOTUNE=0; - printenv; - python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh + cd $TORCHSIM_DIR && python3 $workload 2>&1 ") - sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/') + sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/') if [[ -n "$sim_time" ]]; then echo "Iteration $iter: Simulation time = $sim_time" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh index b38848d0..bc912aa6 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh @@ -2,10 +2,7 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.yml" - "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" - #"systolic_ws_128x128_c2_booksim_tpuv3.yml" - # 
"systolic_ws_128x128_c2_simple_noc_tpuv4.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SHAPE_LIST=( @@ -30,15 +27,11 @@ for i in "${config[@]}"; do for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" output=$(bash -c " - export TORCHSIM_TLS_MODE=0; - export TORCHSIM_VALIDATION_MODE=1; export TOGSIM_CONFIG=$config_path; - export AUTOTUNE=0; - printenv; - python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh + cd $TORCHSIM_DIR && python3 $workload 2>&1 ") - sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/') + sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/') if [[ -n "$sim_time" ]]; then echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file" diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh index 689e6913..b1a43cb5 100755 --- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh +++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh @@ -2,10 +2,7 @@ base_dir=$TORCHSIM_DIR/experiments/artifact/speedup config=( - # "systolic_ws_8x8_c1_simple_noc.yml" - "systolic_ws_128x128_c2_simple_noc_tpuv3.yml" - #"systolic_ws_128x128_c2_booksim_tpuv3.yml" - # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml" + "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml" ) TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S") SIZE_LIST=( @@ -38,15 +35,11 @@ for i in "${config[@]}"; do for iter in {1..5}; do echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config" output=$(bash -c " - export TORCHSIM_TLS_MODE=0; - export TORCHSIM_VALIDATION_MODE=0; export TOGSIM_CONFIG=$config_path; - export AUTOTUNE=0; - printenv; - python3 $workload 2> /dev/null | 
$TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh + cd $TORCHSIM_DIR && python3 $workload 2>&1 ") - sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/') + sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/') if [[ -n "$sim_time" ]]; then echo "Iteration $iter: Simulation time = $sim_time" From 336fdf375ac60b066d41a4906df8fc554944e1b9 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 24 Mar 2026 15:26:59 +0900 Subject: [PATCH 150/194] [CI] Remove dump folder mount for test --- .github/workflows/pytorchsim_test.yml | 89 +-------------------------- 1 file changed, 2 insertions(+), 87 deletions(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 3a383137..2a9d60a1 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -31,8 +31,6 @@ jobs: run: | echo "Running test_add.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_add.py @@ -52,8 +50,6 @@ jobs: run: | echo "Running test_transcendental.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transcendental.py @@ -73,8 +69,6 @@ jobs: run: | echo "Running test_activation.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_activation.py @@ 
-94,8 +88,6 @@ jobs: run: | echo "Running test_batchnorm.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_batchnorm.py @@ -115,8 +107,6 @@ jobs: run: | echo "Running test_bmm.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_bmm.py @@ -136,8 +126,6 @@ jobs: run: | echo "Running test_cnn.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cnn.py @@ -157,8 +145,6 @@ jobs: run: | echo "Running test_conv2d.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py @@ -178,8 +164,6 @@ jobs: run: | echo "Running test_cat.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cat.py @@ -199,8 +183,6 @@ jobs: run: | echo "Running test_matmul.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_matmul.py @@ -220,8 +202,6 @@ jobs: run: | echo "Running 
test_reduce.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_reduce.py @@ -241,8 +221,6 @@ jobs: run: | echo "Running test_softmax.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_softmax.py @@ -262,8 +240,6 @@ jobs: run: | echo "Running test_transpose2D.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose2D.py @@ -283,8 +259,6 @@ jobs: run: | echo "Running test_view3D_2D.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_view3D_2D.py @@ -304,8 +278,6 @@ jobs: run: | echo "Running test_layernorm.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_layernorm.py @@ -325,8 +297,6 @@ jobs: run: | echo "Running test_mlp.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_mlp.py @@ -346,8 +316,6 @@ jobs: run: | echo "Running test_resnet.py" docker 
run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py @@ -356,8 +324,6 @@ jobs: run: | echo "Running test_resnet.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50 @@ -377,8 +343,6 @@ jobs: run: | echo "Running test_transformer.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transformer.py @@ -398,8 +362,6 @@ jobs: run: | echo "Running test_transpose3D.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose3D.py @@ -419,8 +381,6 @@ jobs: run: | echo "Running test_sparsity.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sparsity.py @@ -440,8 +400,6 @@ jobs: run: | echo "Running test_pool.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_pool.py @@ -461,8 +419,6 @@ jobs: run: | echo "Running 
test_single_perceptron.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_single_perceptron.py @@ -482,8 +438,6 @@ jobs: run: | echo "Running test_addmm_residual.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py @@ -492,8 +446,6 @@ jobs: run: | echo "Running test_matmul_activation.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py @@ -502,8 +454,6 @@ jobs: run: | echo "Running test_matmul_scalar.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py @@ -512,8 +462,6 @@ jobs: run: | echo "Running test_matmul_reduction.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py @@ -522,8 +470,6 @@ jobs: run: | echo "Running test_bmm_reduction.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} 
python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py @@ -532,8 +478,6 @@ jobs: run: | echo "Running test_prologue_fusion.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py @@ -542,8 +486,6 @@ jobs: run: | echo "Running test_transformer_fusion.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py @@ -552,8 +494,6 @@ jobs: run: | echo "Running test_conv_fusion.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py @@ -573,8 +513,6 @@ jobs: run: | echo "Running test_moe.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/MoE/test_moe.py @@ -594,8 +532,6 @@ jobs: run: | echo "Running test_mistral.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py @@ -615,8 +551,6 @@ jobs: run: | echo "Running test_vit.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e 
vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_vit.py @@ -636,8 +570,6 @@ jobs: run: | echo "Running test_diffusion.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Diffusion/test_diffusion.py @@ -657,8 +589,6 @@ jobs: run: | echo "Running test_indirect.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_indirect_access.py @@ -678,8 +608,6 @@ jobs: run: | echo "Running test_scheduler.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py @@ -699,8 +627,6 @@ jobs: run: | echo "Running test_llama.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py @@ -720,8 +646,6 @@ jobs: run: | echo "Running test_yolov5.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py @@ -741,8 +665,6 @@ jobs: run: | echo "Running test_deepseek_v3_base.py" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane 
}}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py @@ -759,25 +681,18 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Prepare volume directory - run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} - - name: Run run_cycle.sh run: | echo "Running run_cycle.sh" docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ -e vpu_num_lanes="${{ inputs.vector_lane }}" \ -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} bash -c \ - "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \ - cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out" - ls /tmp/torchsim-ci/${GITHUB_SHA} + "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh >/dev/null 2>&1 && cat PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out" > summary_cycle.out - name: Upload Accuracy Report Artifact uses: actions/upload-artifact@v4 with: name: accuracy-report - path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out + path: summary_cycle.out if-no-files-found: error From 8838bfe361cdadf5a9516d15e737d9443522a84e Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 24 Mar 2026 22:09:45 +0900 Subject: [PATCH 151/194] [Decompse] Add naive group convolution decomposition + test --- .../mlir/mlir_codegen_backend.py | 8 +- PyTorchSimFrontend/mlir/mlir_decomposition.py | 127 +++++++++++++++++- tests/test_group_conv.py | 79 +++++++++++ 3 files changed, 212 insertions(+), 2 deletions(-) create mode 100644 tests/test_group_conv.py diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 17a60b44..3ecf3b53 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -285,7 +285,13 @@ def 
__init__(self, kernel_group, reason=None): self.gem5_header = IndentedBuffer() self.header.writeline("#include ") self.header.writeline("#include ") - self.header.writeline("void* __wrap_malloc(size_t size) { size = (size + 511UL) & ~511UL; return sbrk(size); }") # Align to 512 bytes + self.header.writeline("#include ") + self.header.writeline("void* __wrap_malloc(size_t size) {") # Align to 512 bytes + self.header.writeline(" size_t aligned = (size + 511UL) & ~511UL;") + self.header.writeline(" void *p = sbrk(aligned);") + #self.header.writeline(' fprintf(stderr, "[SPIKE][__wrap_malloc] addr=%p size=%zu (req=%zu)\\n", p, aligned, size);') + self.header.writeline(" return p;") + self.header.writeline("}") self.header.writeline("void __wrap_free(void *ptr) { return; }") self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad") diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py index 284d25d7..122c2677 100644 --- a/PyTorchSimFrontend/mlir/mlir_decomposition.py +++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py @@ -1,9 +1,134 @@ import math +import operator +from typing import Optional, Sequence, Tuple, Union + import torch import torch.nn.functional as F from torch._inductor.decomposition import register_decomposition -aten = torch.ops.aten +aten = torch.ops.aten # only for @register_decomposition target + + +def _pair_2d(seq: Sequence[int]) -> Tuple[int, int]: + if len(seq) == 1: + v = int(seq[0]) + return v, v + return int(seq[0]), int(seq[1]) + + +def _group_conv_cin1_cout1( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + stride: Tuple[int, ...], + padding: Tuple[int, ...], + dilation: Tuple[int, ...], + groups: int, +) -> torch.Tensor: + """ + Grouped conv with ``Cin//groups == 1`` and ``Cout//groups == 1`` (input ``[N,G,H,W]``, weight ``[G,1,Kh,Kw]``). + + 1. 
Symmetric spatial padding on the input. + 2. For each kernel position ``(kh, kw)``, gather the output grid from the padded tensor and + multiply by ``weight[:, 0, kh, kw]`` (broadcast over ``N``), then sum over ``(kh, kw)``. + + Note + ---- + This is not a performance-optimized kernel: it is explicit gather–multiply–accumulate over + kernel elements. For competitive performance, add a dedicated template (or fused) kernel + instead of relying on this decomposition. + """ + n, c_in, _, _ = input.shape + # PyTorch layout: ``[Cout, Cin/groups, Kh, Kw]`` i.e. ``[G, 1, Kh, Kw]`` here. + c_out, cin_pg, kh, kw = weight.shape + g = groups + assert c_in == g and c_out == g and cin_pg == 1, (c_in, c_out, cin_pg, g) + + sh, sw = _pair_2d(stride) + ph, pw = _pair_2d(padding) + d_h, d_w = _pair_2d(dilation) + + # (left, right, top, bottom) for last two dims + x_pad = F.pad(input, (pw, pw, ph, ph)) + _, _, hp, wp = x_pad.shape + + h_out = (hp - d_h * (kh - 1) - 1) // sh + 1 + w_out = (wp - d_w * (kw - 1) - 1) // sw + 1 + + out = torch.zeros(n, g, h_out, w_out, dtype=input.dtype, device=input.device) + for ki in range(kh): + rows = torch.arange(h_out, device=input.device, dtype=torch.long) * sh + ki * d_h + for kj in range(kw): + cols = torch.arange(w_out, device=input.device, dtype=torch.long) * sw + kj * d_w + sub = x_pad[:, :, rows[:, None], cols[None, :]] + wgk = weight[:, 0, ki, kj].reshape(1, g, 1, 1) + out = out + sub * wgk + + if bias is not None: + out = out + bias.reshape(1, g, 1, 1) + return out + + +@register_decomposition(aten.convolution.default) +def decompose_group_convolution( + input: torch.Tensor, + weight: torch.Tensor, + bias: Union[torch.Tensor, None], + stride: Sequence[int], + padding: Sequence[int], + dilation: Sequence[int], + transposed: bool, + output_padding: Sequence[int], + groups: Union[int, torch.SymInt], +): + """ + Lower grouped ``aten.convolution`` only when each group has a single input and output + channel (``Cin//groups == Cout//groups == 
1``), via ``_group_conv_cin1_cout1``. + + Note + ---- + The lowered path is not a performance-optimized kernel; it exists for correctness and + lowering experiments. For speed, implement a separate template (fused) kernel for group + convolution. + + Non-static ``groups`` (cannot ``int()``) falls back: returns ``NotImplemented`` so the + default ``aten.convolution`` is used. ``groups==1`` also returns ``NotImplemented``. + """ + try: + gcount = operator.index(groups) + except (TypeError, ValueError): + return NotImplemented + # groups==1: do not decompose; Inductor keeps the default aten.convolution (plain conv). + if gcount == 1: + return NotImplemented + + cin = input.shape[1] + cout = weight.shape[0] + cin_pg = cin // gcount + cout_pg = cout // gcount + supported = ( + not transposed + and cin % gcount == 0 + and cout % gcount == 0 + and cin_pg == 1 + and cout_pg == 1 + and weight.shape[1] == 1 + ) + if not supported: + raise NotImplementedError( + "PyTorchSim aten.convolution decomposition supports grouped conv only when " + "Cin//groups == 1 and Cout//groups == 1 (i.e. per-group Cin and Cout are 1). " + "For general group convolution, use the default kernel or a dedicated template kernel." 
+ ) + return _group_conv_cin1_cout1( + input, + weight, + bias, + tuple(stride), + tuple(padding), + tuple(dilation), + gcount, + ) @register_decomposition(aten._native_multi_head_attention.default) def decompose_native_multi_head_attention( diff --git a/tests/test_group_conv.py b/tests/test_group_conv.py new file mode 100644 index 00000000..4f97cff6 --- /dev/null +++ b/tests/test_group_conv.py @@ -0,0 +1,79 @@ +import torch +import torch._dynamo +from Simulator.simulator import TOGSimulator + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + +def test_group_convolution( + device, + groups=2, + stride=1, + padding=1, + batch_size=2, + c_per_group=8, + out_per_group=12, + spatial=16, + kernel_size=3, + seed=0, +): + """``torch.compile`` on NPU vs CPU reference — same structure as ``test_matmul`` / ``test_conv2d``.""" + + def custom_group_conv(a, weight, bias): + return torch.convolution( + a, + weight, + bias, + (stride, stride), + (padding, padding), + (1, 1), + False, + (0, 0), + groups, + ) + + torch.manual_seed(seed) + c_in = c_per_group * groups + c_out = out_per_group * groups + k = kernel_size + x = torch.randn(batch_size, c_in, spatial, spatial) + wgt = torch.randn(c_out, c_in // groups, k, k) + b = torch.randn(c_out) + + x1 = x.to(device=device, memory_format=torch.channels_last) + w1 = wgt.to(device=device, memory_format=torch.channels_last) + b1 = b.to(device=device) + x2 = x.to("cpu", memory_format=torch.channels_last) + w2 = wgt.to("cpu", memory_format=torch.channels_last) + b2 = b.to("cpu") + + opt_fn = torch.compile(dynamic=False)(custom_group_conv) + res = opt_fn(x1, w1, b1) + y = 
custom_group_conv(x2, w2, b2) + label = f"Group Conv Forward (groups={groups}, stride={stride}, pad={padding})" + test_result(label, res, y, rtol=1e-3, atol=1e-3) + print("Max diff > ", torch.max(torch.abs(res.cpu() - y))) + + +if __name__ == "__main__": + device = torch.device("npu:0") + with torch.no_grad(): + #test_group_convolution(device, batch_size=1, groups=2, stride=1, padding=1, seed=0) + #test_group_convolution(device, batch_size=1, groups=4, stride=1, padding=1, seed=1) + #test_group_convolution(device, batch_size=1, groups=2, stride=2, padding=1, seed=2) + test_group_convolution(device, batch_size=1, groups=240, stride=2, padding=1, seed=2, c_per_group=1, out_per_group=1, spatial=40) + + #test_group_convolution(device, batch_size=1, groups=240, stride=2, padding=1, seed=2, c_per_group=1, out_per_group=1) + print("test_group_conv_decomposition: all passed") From 9b0ab3babd9c00d006016b841da92daac82a0e55 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 25 Mar 2026 14:22:18 +0900 Subject: [PATCH 152/194] [Frontend] Fix attribute passing to TOGSIM --- PyTorchSimFrontend/extension_codecache.py | 9 ++++--- Scheduler/scheduler.py | 2 +- Simulator/simulator.py | 32 +++++++++++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 6463dbac..ac711650 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -302,15 +302,16 @@ def run_kernel_simulation(*args, **kwargs): # Prepare arguments for launch kernel onnx_path = os.path.join(result_path, "tile_graph.onnx") - attribute_path = os.path.join(runtime_path, "attribute") + attribute_dir = os.path.join(runtime_path, "attribute") + kernel_attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_dir, args) TOGSim = torch.npu.get_tog_simulator() if not autotune and TOGSim is not None: - attribute_path = 
TOGSim.create_attribute_file(attribute_path, args) - torch.npu.launch_kernel(onnx_path, attribute_path) + torch.npu.launch_kernel(onnx_path, kernel_attribute_path) result = None # No result for non-autotune mode else: - result_path = TOGSimulator.run_standalone(onnx_path, attribute_path, autotune_mode=autotune) + result_path = TOGSimulator.run_standalone( + onnx_path, kernel_attribute_path, autotune_mode=autotune) result = TOGSimulator.get_result_from_file(result_path) return result return run_kernel_simulation diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 77e218ea..732f2841 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -231,7 +231,7 @@ def prepare_launch_kernel(self, kernel, inputs): onnx_path = os.path.join(result_path, "tile_graph.onnx") attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs) + attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_path, inputs) return onnx_path, attribute_path def launch_kernel(self, current_cycle, partion_idx=0): diff --git a/Simulator/simulator.py b/Simulator/simulator.py index f24835ba..a02d8fc9 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -427,28 +427,44 @@ def sram_dealloc(cls, buf_name, addr_range): if buf_name in cls.ALLOC_POOL: del cls.ALLOC_POOL[buf_name] - def create_attribute_file(self, attribute_path, inputs, **kwargs): + @staticmethod + def write_kernel_attribute_file(attribute_dir, inputs, alloc_pool=None): + """ + Write kernel attribute YAML (address_info + sram_alloc) under attribute_dir. + + Does not require a TOGSimulator instance. alloc_pool defaults to class ALLOC_POOL. + + Args: + attribute_dir: Directory to hold numbered attribute files (created if needed) + inputs: Kernel input tensors (data_ptr used for address_info) + alloc_pool: Optional dict like ALLOC_POOL; defaults to TOGSimulator.ALLOC_POOL + + Returns: + Path to the written YAML file. 
+ """ + if alloc_pool is None: + alloc_pool = TOGSimulator.ALLOC_POOL address_info = {} sram_buffer = {} yaml_content = {} - os.makedirs(attribute_path, exist_ok=True) - index = str(len(os.listdir(attribute_path))) - attribute_path = os.path.join(attribute_path, index) + os.makedirs(attribute_dir, exist_ok=True) + index = str(len(os.listdir(attribute_dir))) + attribute_file = os.path.join(attribute_dir, index) for idx, tensor in enumerate(inputs): address_info[f"arg{idx}"] = tensor.data_ptr() yaml_content["address_info"] = address_info - for buf_name, range in self.ALLOC_POOL.items(): + for buf_name, range in alloc_pool.items(): sram_buffer[buf_name] = range yaml_content["sram_alloc"] = sram_buffer - with open(attribute_path, "w") as f: + with open(attribute_file, "w") as f: yaml.dump(yaml_content, f, default_flow_style=False) f.flush() - os.fsync(f.fileno()) # There could be a race condition. - return attribute_path + os.fsync(f.fileno()) + return attribute_file def load_yaml(self, config_path): config_path = Path(config_path) From 5cbe9d1fbd6432550e73ca71d5afa0b4939c3543 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 25 Mar 2026 17:11:48 +0900 Subject: [PATCH 153/194] [Frontend] Fix loop_size argument passing --- PyTorchSimFrontend/mlir/mlir_autotune.py | 2 +- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index caf4d6da..b8f5eaf9 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -83,7 +83,7 @@ def cached_run_fn(*args, **kwargs): # Run a candidate code run_method = custom_async_compile.mlir( self.source_code, vectorlane_size=self.extra_args["vector_lane"], - loop_size=None, spad_info=self.extra_args["spad_info"], + loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"], vlen=self.extra_args["vlen"], 
arg_attributes=self.extra_args["arg_attributes"], origins=self.extra_args["origins"], silent_mode=True, autotune=self.extra_args['autotune']) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 3ecf3b53..8bfdc57f 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -319,6 +319,7 @@ def __init__(self, kernel_group, reason=None): self.reduce_iterator = {} self.spad_buffer_dict = dict() self.base_vector_initialized = False + self.loop_size = None def reset(self, reason): save = self.exit_stack, self._nested_context_depth @@ -1072,6 +1073,7 @@ def run_bench(self, nodes, kernel_name, src_code): "vlen" : self.vlen, "arg_attributes" : arg_attributes, "autotune" : True, + "loop_size" : self.loop_size, "origins" : {str(i) for node in nodes for i in node.node.origins}, }, source_code=src_code, From f03f72731b5d06f2b49e422de1b059d8b1235a2a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 25 Mar 2026 17:12:08 +0900 Subject: [PATCH 154/194] [Script] Add utility option --- .../artifact/cycle_validation/run_cycle.sh | 194 ++++++++++++------ 1 file changed, 131 insertions(+), 63 deletions(-) diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh index 7406f356..e49538d0 100755 --- a/experiments/artifact/cycle_validation/run_cycle.sh +++ b/experiments/artifact/cycle_validation/run_cycle.sh @@ -1,85 +1,153 @@ #!/bin/bash set -e +usage() { + cat <<'EOF' +Usage: run_cycle.sh [--only SECTION[,SECTION...]] + + Run cycle validation benchmarks. Default: all sections + summary. 
+ + SECTION (comma-separated for --only): + matmul GEMM sizes + conv Conv2d sizes + layernorm LayerNorm sizes + softmax Softmax sizes + attention Attention sizes + resnet resnet18, resnet50 + bert BERT base/large/xlarge + summary summary_cycle.py (reads logs under experiments/artifact/logs) + +Examples: + ./run_cycle.sh + ./run_cycle.sh --only matmul + ./run_cycle.sh --only matmul,conv,summary +EOF +} + +ONLY="" +while [[ $# -gt 0 ]]; do + case "$1" in + --only) + ONLY="${2:-}" + if [[ -z "$ONLY" ]]; then echo "error: --only needs a value"; exit 1; fi + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "error: unknown argument: $1" >&2 + usage >&2 + exit 1 + ;; + esac +done + +# If ONLY is set, run section NAME only when ",$NAME," appears in ",$ONLY," +should_run() { + local name=$1 + if [[ -z "$ONLY" ]]; then + return 0 + fi + [[ ",${ONLY}," == *",${name},"* ]] +} + export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs mkdir -p $LOG_DIR # Matmul -for sz in "256 256 256" "512 512 512" "1024 1024 1024" "2048 2048 2048"; do - name="gemm_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running Matmul size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log -done +if should_run matmul; then + for sz in "256 256 256" "512 512 512" "1024 1024 1024" "2048 2048 2048"; do + name="gemm_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running Matmul size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log + done +fi # Conv -for sz in \ - "1 56 56 64 64 3 1 1" \ - "1 28 28 128 128 3 1 1" \ - "1 14 14 256 256 3 1 1" \ - "1 7 7 512 512 3 1 1" \ - "64 56 56 64 64 3 1 1" \ - "64 28 28 128 128 3 1 1" \ 
- "64 14 14 256 256 3 1 1" \ - "64 7 7 512 512 3 1 1"; do - name="conv_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running Conv size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log -done +if should_run conv; then + for sz in \ + "1 56 56 64 64 3 1 1" \ + "1 28 28 128 128 3 1 1" \ + "1 14 14 256 256 3 1 1" \ + "1 7 7 512 512 3 1 1" \ + "64 56 56 64 64 3 1 1" \ + "64 28 28 128 128 3 1 1" \ + "64 14 14 256 256 3 1 1" \ + "64 7 7 512 512 3 1 1"; do + name="conv_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running Conv size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log + done +fi # LayerNorm -for sz in "512 768" "2048 768" "8192 768"; do - name="layernorm_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running LayerNorm size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log -done +if should_run layernorm; then + for sz in "512 768" "2048 768" "8192 768"; do + name="layernorm_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running LayerNorm size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log + done +fi # Softmax -for sz in "512 512" "2048 2048" "8192 8192"; do - name="softmax_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running Softmax size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log -done +if should_run softmax; then + 
for sz in "512 512" "2048 2048" "8192 8192"; do + name="softmax_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running Softmax size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log + done +fi # Attention -for sz in "12 512 64" "16 512 64" "32 512 64"; do - name="attention_${sz// /x}" - echo "" - echo "===================================================" - echo "[*] Running Attention size=$sz" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log -done +if should_run attention; then + for sz in "12 512 64" "16 512 64" "32 512 64"; do + name="attention_${sz// /x}" + echo "" + echo "===================================================" + echo "[*] Running Attention size=$sz" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log + done +fi # ResNet -for model in "resnet18" "resnet50"; do - echo "" - echo "===================================================" - echo "[*] Running $model" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log -done +if should_run resnet; then + for model in "resnet18" "resnet50"; do + echo "" + echo "===================================================" + echo "[*] Running $model" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log + done +fi # BERT -for model in "base" "large" "xlarge"; do - echo "" - echo "===================================================" - echo "[*] Running BERT size=$model" - echo "===================================================" - python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log 
-done +if should_run bert; then + for model in "base" "large" "xlarge"; do + echo "" + echo "===================================================" + echo "[*] Running BERT size=$model" + echo "===================================================" + python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log + done +fi # Cycle Summary -python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out" \ No newline at end of file +if should_run summary; then + python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out" +fi From 1ae39bfb4926b8a6b42500b997154329dfc56051 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 26 Mar 2026 12:40:55 +0900 Subject: [PATCH 155/194] [Cleanup] #219 cleanup the deprecated scheduler module --- README.md | 126 +++-- Scheduler/scheduler.py | 533 +------------------ Simulator/simulator.py | 21 +- tests/MLP/test_mlp.py | 4 +- tests/MoE/test_moe.py | 4 +- tests/test_compile_overhead.py | 45 -- tests/test_gqa_decode.py | 3 +- tests/test_hetro.py | 76 ++- tests/test_scheduler.py | 4 +- tests/test_scheduler_batching.py | 41 -- tests/test_sort.py | 5 +- tests/test_sparse_core.py | 5 +- tests/test_spmm_scheduler.py | 66 --- tutorial/session1/CompilerOptimization.ipynb | 3 +- tutorial/session1/ExecutionMode.ipynb | 3 +- tutorial/session1/Inference.ipynb | 3 +- tutorial/session1/LogAnalysis.ipynb | 3 +- tutorial/session1/Mapping.ipynb | 3 +- tutorial/session1/Training.ipynb | 3 +- tutorial/session2/Hands_on.ipynb | 4 +- 20 files changed, 139 insertions(+), 816 deletions(-) delete mode 100644 tests/test_compile_overhead.py delete mode 100644 tests/test_scheduler_batching.py delete mode 100644 tests/test_spmm_scheduler.py diff --git a/README.md b/README.md index c6280498..03041355 100644 --- a/README.md +++ b/README.md @@ -106,9 +106,8 
@@ You can run your own PyTorch model on PyTorchSim by setting up a custom NPU devi This method also applies when you want to simulate models beyond the provided examples. ```python import torch -from Scheduler.scheduler import PyTorchSimRunner -# Declare a custom NPU device -device = PyTorchSimRunner.setup_device().custom_device() + +device = torch.device("npu:0") # Declare you own model (e.g. resnet18 from torchvision) from torchvision.models import resnet18 @@ -215,76 +214,95 @@ opt_step() `tests/test_mlp.py` provides an example of MLP training. ## Multi-tenancy -Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`. -```bash -python tests/test_scheduler.py -``` -Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`. -In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.yml`). The compiled PyTorch models are then registered with a unique model id. -```python3 -import os -import sys +While the **`with TOGSimulator(config_path=...)`** block is active, **`TOGSIM_CONFIG`** is set to that YAML so **compilation and TOGSim use the same** hardware description. + +### 1. One TOGSim session, one continuous log + +If you want **one** log where kernels are simulated **in sequence** as a single run, wrap the code you already use to execute the compiled model with **`with TOGSimulator(config_path=...)`**. No other API is required; every forward inside the block shares that session. 
+ +```python import torch -from torchvision.models import resnet18 -base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') -config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml' +from Simulator.simulator import TOGSimulator -sys.path.append(base_path) -from tests.test_transformer import EncoderBlock -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator -scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config) +# ... build model, torch.compile, tensors on npu:0 as usual ... -# Register compiled model -target_model0 = resnet18().eval() -target_model1 = EncoderBlock(768, 12).eval() -opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last)) -opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) -SchedulerDNNModel.register_model("model0", opt_model0) -SchedulerDNNModel.register_model("model1", opt_model1) +with TOGSimulator(config_path=config): + y = compiled_model(x) ``` -The config file(`.yml`) specifies two key items: -- `num_partition`: The total number of independent request queues to create. -- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core. -For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`: +### 2. Multi-tenancy and explicit scheduling (`launch_model`) + +For **multi-tenant** or **interleaved** execution, you usually need to attach a **timestamp** and a **`stream_index`** to each launch so the simulator can order work correctly. Use **`torch.npu.launch_model(compiled_model, *inputs, stream_index=..., timestamp=...)`** for that; plain `compiled_model(x)` does not carry those parameters. 
+ +**`stream_index`** is the **request-queue / partition index** in the TOGSim config: it must match the **values** in the **`partition`** map (each queue index is mapped to a **core**). For example, `stream_index=0` goes to the queue bound to `core_0`, `stream_index=1` to the queue for `core_1`, and so on. + +**`timestamp`** is in **nanoseconds** (simulation time for ordering launches). Use `0` when you do not need explicit times beyond submission order. + +```python +with TOGSimulator(config_path=config): + torch.npu.launch_model(opt_model1, x1, stream_index=0, timestamp=0) + torch.npu.launch_model(opt_model2, x2, stream_index=1, timestamp=0) + torch.npu.synchronize() + torch.npu.launch_model(opt_model1, x1, stream_index=0, timestamp=0) + torch.npu.launch_model(opt_model2, x2, stream_index=1, timestamp=0) +``` + +Here **`synchronize()`** acts as a barrier: it does not return until every **`launch_model`** issued **above** it has finished in the simulator. The later pair of `launch_model` calls therefore runs only after those earlier models have fully completed—so the sync is the point in the timeline where **all preceding launches are done**. + +```bash +python tests/test_scheduler.py +``` + +Use a TOGSim config(`.yml`) that defines **partitions** when mapping queues to cores, for example: + +- **`num_partition`**: Number of independent request queues (valid **`stream_index`** values are `0 … num_partition-1`). +- **`partition`**: Maps each **core** name to a **queue index**; that index is the same **`stream_index`** you pass to **`launch_model`**. + ``` "num_partition" : 2, "partition": { - "core_0":0, - "core_1":1 + "core_0": 0, + "core_1": 1 } ``` -Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times. -Each `Request` is created with its model name, data, and a request_queue_idx to specify its target queue, then added via `scheduler.add_request`. 
-As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`. -```python3 -# Load Generation +Here `stream_index=0` selects queue `0` (core_0), `stream_index=1` selects queue `1` (core_1). + +### 3. Load generation (Poisson arrivals) + +The **`poisson_request_generator`** in **`Scheduler.scheduler`** yields synthetic **arrival times** (in **milliseconds**). Merge those with **`launch_model`**: convert each time to **nanoseconds** for **`timestamp`**, set **`stream_index`** to the target partition queue, and run all launches inside one **`with TOGSimulator(...)`** so a **single** log captures the full trace. + +```python +from Scheduler.scheduler import poisson_request_generator + model0_lambda = 5.0 model1_lambda = 3.0 -max_time = 1000.0 # [s] +max_time_msec = 1000.0 # Poisson horizon [ms] -# Generate Possion distribution requests for model0 -for model0_request_time in poisson_request_generator(model0_lambda, max_msec_time=max_time): - x = torch.randn(1, 3, 224, 224) - new_request = Request("model0", [x], [], request_queue_idx=0) - scheduler.add_request(new_request, request_time=model0_request_time) +events = [] +for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec): + x = torch.randn(1, 3, 224, 224, device=device) + events.append((t, 0, opt_model0, (x,))) # stream_index 0 → queue / partition 0 -# Generate Possion distribution requests for model1 -for model1_request_time in poisson_request_generator(model1_lambda, max_msec_time=max_time): - x = torch.randn(128, 768) - new_request = Request("model1", [x], [], request_queue_idx=1) - scheduler.add_request(new_request, request_time=model1_request_time) -``` +for t in poisson_request_generator(model1_lambda, max_msec_time=max_time_msec): + x = torch.randn(128, 768, device=device) + events.append((t, 1, opt_model1, (x,))) # stream_index 1 → queue / partition 1 -Finally, `scheduler.schedule()` is called in a loop until 
all requests are processed. -```python3 -# Run scheduler -while not scheduler.is_finished(): - scheduler.schedule() +events.sort(key=lambda e: e[0]) + +with TOGSimulator(config_path=config): + for t_msec, stream_index, model, args in events: + torch.npu.launch_model( + model, + *args, + stream_index=stream_index, + timestamp=int(t_msec * 1e6), + ) # ms → ns ``` +The two Poisson streams are **combined and sorted by time** so launches follow a single global arrival order. + ## Compiler Optimizations PyTorchSim compiler supports several fusion optimizations: - GEMM prologue fusion diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py index 732f2841..2b3aac92 100644 --- a/Scheduler/scheduler.py +++ b/Scheduler/scheduler.py @@ -1,34 +1,11 @@ -from typing import List -import os -import sys -import numpy as np -import torch -from pathlib import Path -import importlib.util -from PyTorchSimFrontend.extension_codecache import hash_prefix -from Simulator.simulator import TOGSimulator -from PyTorchSimFrontend import extension_config - -# Configure logger for Scheduler module -logger = extension_config.setup_logger() - +"""Poisson load helpers for synthetic request arrival times.""" -def import_module_from_path(module_name, path): - module_path = Path(path) # Convert to Path object for safety - if not module_path.exists() or not module_path.is_file(): - raise FileNotFoundError(f"No such file: '{module_path}'") - - spec = importlib.util.spec_from_file_location(module_name, module_path) - if spec is None: - raise ImportError(f"Could not load module from path: '{module_path}'") - - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) +import numpy as np - return module def poisson_request_generator(lambda_requests, max_msec_time=None): - current_time = 0.0 # msec + """Yield synthetic arrival times in milliseconds (first sample is 0).""" + current_time = 0.0 # msec yield 0 while max_msec_time is None or current_time < max_msec_time: @@ -39,505 
+16,3 @@ def poisson_request_generator(lambda_requests, max_msec_time=None): break yield current_time - -class Request: - """ Each request has model name, it's own id, and requested time. """ - request_id = 0 - QUEUED = 1 - RUNNING = 2 - INCREMENT = 3 - FINISHED = 4 - def __init__(self, model:str, batchable_input_tensor : List[torch.Tensor], - shared_input_tensor: List[torch.tensor], request_queue_idx=0) -> None: - self.model = model - self.batchable_input_tensor = batchable_input_tensor - self.shared_input_tensor = shared_input_tensor - self.arrival_time = None - self.start_time = [] - self.finish_time = [] - self.state = self.QUEUED - self.id = self.allocate_id() - self.request_queue_idx = request_queue_idx - - def allocate_id(self): - allocated_id = Request.request_id - Request.request_id += 1 - return allocated_id - - def set_start(self, start_time): - self.state = self.RUNNING - self.start_time.append(start_time) - - def set_finished(self, finish_time): - self.state = self.FINISHED - self.finish_time.append(finish_time) - - def get_latency(self): - # Todo. 
Provide Toke-By-Token - if self.state == self.FINISHED: - turnaround_time = self.finish_time[-1] - self.arrival_time - else: - turnaround_time = None - - if self.start_time: - response_time = self.start_time[0] - self.arrival_time - else: - response_time = None - - if self.start_time and self.finish_time: - tbt_time = [i-j for i,j in zip(self.finish_time, self.start_time)] - else: - tbt_time = [] - - return turnaround_time, response_time, tbt_time - - def free_memory(self): - """ Free memory resources that are allocated for handle this request """ - return - - def __str__(self) -> str: - return f"Request{self.id} Model: '{self.model}', Arrival: {self.arrival_time}, Start: {self.start_time}, End: {self.finish_time}, State: {self.state}, Partion: {self.request_queue_idx}" - -class RequestReturn: - INCREMENT = 0 - FINISHED = 1 - def __init__(self, state) -> None: - self.state = state - - def is_finished(self): - return self.state == self.FINISHED - - def is_increment(self): - return self.state == self.INCREMENT - -class SchedulerDNNModel: - MODEL_MAP = {} - def __init__(self, batched_req : List[Request], partition_idx) -> None: - self.model_name = batched_req[0].model - self.batched_req = batched_req - self.args = None - self.model = self.find_model(self.model_name) - self.partition_idx = partition_idx - - def find_model(self, model_name : str): - if model_name in SchedulerDNNModel.MODEL_MAP: - return SchedulerDNNModel.MODEL_MAP[model_name] - else: - raise KeyError(f'[Scheduler] Requested model "{model_name}" is not registered...') - - def get_batchable_input(self): - batched_input_tensor = [] - for i in range(len(self.batched_req[0].batchable_input_tensor)): - tensor_list = [req.batchable_input_tensor[i] for req in self.batched_req] - batched_input_tensor.append(torch.concat(tensor_list, dim=0)) - return batched_input_tensor - - def get_shared_input(self): - return self.batched_req[0].shared_input_tensor - - def get_input(self): - return self.get_batchable_input() + 
self.get_shared_input() - - def __str__(self): - return f"DNN Model: {self.model_name}, Partion idx: {self.partition_idx} Req: {self.batched_req[0]}" - - @staticmethod - def register_model(model_name : str, compiled_model): - SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model - -class PyTorchSimRunner: - PARTITION_BUSY = 0 - PARTITION_IDLE = 1 - SELECT_NOTHING = 2 - NPU_MODULE = None - def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None: - self.module = self.setup_device() - self.num_partion = num_partion - self.launch_model_dicts = [] - self.nested_launch_model_dicts = [] - self.partition_state = [] - for i in range(self.num_partion): - self.launch_model_dicts.append({}) - self.nested_launch_model_dicts.append({}) - self.partition_state.append(self.PARTITION_IDLE) - - self.finish_req_dict = {} - self.tog_simulator = tog_simulator - - # Dry run for compile and create generator - os.environ["TOGSIM_EAGER_MODE"] = "1" - - @classmethod - def setup_device(cls): - if cls.NPU_MODULE is not None: - return cls.NPU_MODULE - - try: - from torch._inductor.codegen.common import register_backend_for_device - from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen - from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling - except ImportError as e: - logger.error(f"Failed to import torch_openreg: {e}") - logger.error("Please ensure PyTorchSimDevice2 is installed: pip install -e PyTorchSimDevice2") - raise - - register_backend_for_device( - "npu", - lambda scheduling: MLIRScheduling(scheduling), - ExtensionWrapperCodegen - ) - - cls.NPU_MODULE = torch.npu - return cls.NPU_MODULE - - def submit(self, batched_req, partition_idx) -> List[RequestReturn]: - # FIXME. 
Construct SchedulerDNNModel - batched_req_model = self.get_compiled_model(batched_req, partition_idx) - self.prepare_model(batched_req_model) - - def get_compiled_model(self, batched_req: List[Request], request_queue_idx): - compiled_model = SchedulerDNNModel(batched_req, request_queue_idx) - return compiled_model - - def is_partition_idle(self, partition_idx): - return len(self.launch_model_dicts[partition_idx]) == 0 - - def is_any_idle(self, skip_list): - return any([self.is_partition_idle(i) and not skip_list[i] for i in range(self.num_partion)]) - - def is_all_idle(self): - return all([self.is_partition_idle(i) for i in range(self.num_partion)]) - - def prepare_model(self, req_model: SchedulerDNNModel): - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_LOG_PATH, "togsim_result", req_model.model_name) - os.makedirs(result_path, exist_ok=True) - index = str(len(os.listdir(result_path))) - - # Prepare input tensor - input_tensor_list = req_model.get_input() - input_tensor_list = [input_tensor.to(device=self.module.custom_device()) for input_tensor in input_tensor_list] - - # This model-call will return generator - ret = req_model.model(*input_tensor_list) - self.launch_model_dicts[req_model.partition_idx][req_model] = ret - - def finish_model(self, model : SchedulerDNNModel, output : torch.Tensor): - for req in model.batched_req: - # TODO. 
finish time - self.finish_req_dict[req] = RequestReturn(RequestReturn.FINISHED) - - def prepare_launch_kernel(self, kernel, inputs): - result_path, runtime_path, _ = kernel(*inputs) - onnx_path = os.path.join(result_path, "tile_graph.onnx") - - attribute_path = os.path.join(runtime_path, "attribute") - attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_path, inputs) - return onnx_path, attribute_path - - def launch_kernel(self, current_cycle, partion_idx=0): - # Check partition is busy - if self.partition_state[partion_idx] != self.PARTITION_IDLE: - return self.partition_state[partion_idx] - result = self.select_kernel(partion_idx) - if result == self.SELECT_NOTHING: - return self.SELECT_NOTHING - kernel, inputs = result - if not isinstance(kernel, str): - onnx_path, attribute_path = self.prepare_launch_kernel(kernel, inputs) - else: - onnx_path, attribute_path = kernel, inputs - self.partition_state[partion_idx] = self.PARTITION_BUSY - return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx) - -class FIFORunner(PyTorchSimRunner): - def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: - super().__init__(tog_simulator, num_partion) - - def select_kernel(self, partition_idx): - while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): - if len(self.nested_launch_model_dicts[partition_idx]): - target_dict = self.nested_launch_model_dicts - else: - target_dict = self.launch_model_dicts - - # Select FIFO manner - req, target_model = next(iter(target_dict[partition_idx].items())) - try: - kernel, inputs = next(target_model) - - # For extern call - if isinstance(kernel, str): - return kernel, inputs - - # For convolution... 
- if not hasattr(kernel, "future"): - nested_gen = kernel(*inputs) - self.nested_launch_model_dicts[partition_idx] = {req : nested_gen} - kernel, inputs = \ - next(self.nested_launch_model_dicts[partition_idx][req]) - return kernel, inputs - except StopIteration as e: - # Retry - if target_dict == self.launch_model_dicts: - self.finish_model(req, e.value) - del target_dict[partition_idx][req] - # No proper kernel now - return self.SELECT_NOTHING - -class RoundRobinRunner(PyTorchSimRunner): - def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None: - super().__init__(tog_simulator, num_partion) - self.next_pointer = None - - def select_kernel(self, partition_idx): - while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]): - if len(self.nested_launch_model_dicts[partition_idx]): - target_dict = self.nested_launch_model_dicts - else: - target_dict = self.launch_model_dicts - - req_list = list(target_dict[partition_idx].keys()) - # Select RR manner - if self.next_pointer is None or self.next_pointer not in req_list: - req = req_list[0] - pos = 0 - else: - req = self.next_pointer - pos = req_list.index(req) - - # Set Next pointer - if pos + 1 < len(req_list): - self.next_pointer = req_list[pos+1] - else: - self.next_pointer = req_list[0] - - target_model = self.launch_model_dicts[partition_idx][req] - try: - kernel, inputs = next(target_model) - - # For convolution... 
- if not hasattr(kernel, "future"): - nested_gen = kernel(*inputs) - self.nested_launch_model_dicts[partition_idx] = {req : nested_gen} - kernel, inputs = \ - next(self.nested_launch_model_dicts[partition_idx][req]) - return kernel, inputs - except StopIteration as e: - # Retry - if target_dict == self.launch_model_dicts: - self.finish_model(req, e.value) - del self.launch_model_dicts[partition_idx][req] - # No proper kernel now - return self.SELECT_NOTHING - -class Scheduler: - - FIFO_ENGINE = 0 - RR_ENGINE = 1 - def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None: - self.current_cycle = 0 - self.max_batch = max_batch - self.num_request_queue = num_request_queue - self.request_queue : List[List[Request]] = [] - for i in range(self.num_request_queue): - self.request_queue.append([]) - self.finish_queue : List[Request] = [] - - self.tog_simulator = TOGSimulator(togsim_config) - if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0: - # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0). - logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. 
") - logger.error(f"Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).") - exit(0) - - os.environ['TOGSIM_CONFIG'] = togsim_config - self.tog_simulator.interactive_simulation() - if engine_select == Scheduler.FIFO_ENGINE: - self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue) - elif engine_select == Scheduler.RR_ENGINE: - self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue) - else: - logger.error(f"Not supported engine type {engine_select}") - exit(1) - - def add_request(self, request: Request, request_time=-1): - """register model at timestamp time - request_time : msec - """ - request_time = self.current_time() if request_time == -1 else request_time - request.arrival_time = request_time - self.request_queue[request.request_queue_idx].append(request) - - def request_empty(self, request_queue_idx): - return len(self.request_queue[request_queue_idx])==0 - - def select(self, request_queue_idx=0) -> List[Request]: - """ - Select 1 request from request_queue in FCFS manner. 
- If there is no proper request, return None - """ - candidate_req = [] - if not self.request_queue[request_queue_idx]: - return candidate_req - for req in self.request_queue[request_queue_idx]: - - if self.msec_to_cycle(req.arrival_time) <= self.current_cycle and req.state == Request.QUEUED: - candidate_req.append(req) - - # Stop batching - if self.max_batch <= len(candidate_req): - break - return candidate_req - - def next_request_time(self, request_queue_idx=0): - for req in self.request_queue[request_queue_idx]: - if req.state == Request.QUEUED: - return req, req.arrival_time - return None, -1 - - def nearest_next_reqeust_time(self): - nearest_req = None - nearest_arrival_time = -1 - for i in range(self.num_request_queue): - req, arrival_time = self.next_request_time(i) - if nearest_arrival_time == -1 and arrival_time != -1: - nearest_req = req - nearest_arrival_time = arrival_time - elif arrival_time != -1 and nearest_arrival_time > arrival_time: - nearest_req = req - nearest_arrival_time = arrival_time - return nearest_req, nearest_arrival_time - - def finish_request(self, req : Request): - req.set_finished(self.current_time()) - - # Free resources - req.free_memory() - - # Move to finish queue - self.finish_queue.append(req) - self.request_queue[req.request_queue_idx].remove(req) - turnaround_time, response_time, tbt_time = req.get_latency() - logger.info( - f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: " - f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, " - f"response time: {response_time} tbt_time: {tbt_time}" - ) - - def per_schedule(self, request_queue_idx): - # Wait partition is idle - if not self.execution_engine.is_partition_idle(request_queue_idx): - return False - - request_list = self.select(request_queue_idx) - if not request_list: - return False - - logger.info(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}") - for req in request_list: - 
req.set_start(self.current_time()) - logger.info( - f"[Request-{req.id} issue] partition: {req.request_queue_idx} " - f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}" - ) - # Submit batched request - self.execution_engine.submit(request_list, request_queue_idx) - - return True - - def check_finish_request(self): - # Check finished request - while self.execution_engine.finish_req_dict: - req, req_ret = next(iter(self.execution_engine.finish_req_dict.items())) - self.finish_request(req) - del self.execution_engine.finish_req_dict[req] - - def schedule(self): - # Try schedule all request queue - result = [] - for i in range(self.num_request_queue): - result.append(self.per_schedule(i)) - - # Try move to next nearest request time - next_req, next_time = self.nearest_next_reqeust_time() - if next_req is None and self.execution_engine.is_all_idle(): - # No request remained... - return - - # Need to forward the time until next_arrival_time - if self.execution_engine.is_all_idle(): - reason = self.tog_simulator.until(self.msec_to_cycle(next_time)) - self.current_cycle = self.tog_simulator.cycle() - else: - self.run(next_time) - return - - def run(self, until_time): - req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)] - def execute_cycle(): - launch_ret_info = [] - for i in range(self.execution_engine.num_partion): - if self.execution_engine.partition_state[i] == PyTorchSimRunner.PARTITION_IDLE: - ret = self.execution_engine.launch_kernel(self.current_cycle, i) - launch_ret_info.append(ret) - - self.check_finish_request() - # Check if the stop condition is met - if self.execution_engine.is_any_idle(req_empty_info) or self.execution_engine.is_all_idle(): # Ignore empty request queue - return [] - - # Schedule jobs and update the current time - result_list = self.tog_simulator.until(self.msec_to_cycle(until_time)) - self.current_cycle = self.tog_simulator.cycle() - - for core_idx in result_list: - # Kernel is 
finished. So set idle state - self.execution_engine.partition_state[core_idx] = PyTorchSimRunner.PARTITION_IDLE - - return result_list - - if self.current_cycle >= self.msec_to_cycle(until_time): - until_time = -1 - - if until_time == -1: - while not self.execution_engine.is_any_idle(req_empty_info): - result = execute_cycle() - req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)] - # if result is not -1, schedule new request - if len(result)==0: - break - - else: - while self.current_cycle <= self.msec_to_cycle(until_time) and not self.execution_engine.is_all_idle(): - result = execute_cycle() - # if result is not -1, schedule new request - if len(result)==0: - break - return - - def is_request_queue_empty(self): - result = True - for i in range(self.num_request_queue): - result = result and (not len(self.request_queue[i])) - return result - - def is_finished(self): - if self.is_request_queue_empty() and self.execution_engine.is_all_idle(): - self.tog_simulator.wait() - return True - return False - - def current_time(self): - return self.cycle_to_msec(self.current_cycle) - - def cycle_to_msec(self, cycle): - freq = self.tog_simulator.get_core_freq() - return cycle / (freq / 1000) - - def msec_to_cycle(self, msec): - # We treat -1 as special time - if (msec == -1): - return msec - - freq = self.tog_simulator.get_core_freq() - return int(msec * (freq / 1000)) diff --git a/Simulator/simulator.py b/Simulator/simulator.py index a02d8fc9..5b00d5d4 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -223,6 +223,7 @@ class TOGSimulator(): TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH" FINISH_STR = "Simulation finished" ALLOC_POOL = dict() # For eagermode buffer plan + _TOGSIM_CONFIG_ENV_UNSET = object() def __init__(self, config_path=None, togsim_path=None) -> None: if config_path is None: config_path = extension_config.CONFIG_TOGSIM_CONFIG @@ -258,18 +259,32 @@ def __init__(self, config_path=None, togsim_path=None) -> 
None: raise RuntimeError(f"Failed to open trace file: {e}") def __enter__(self): - """Context manager entry.""" - # Set this simulator instance as the global TOGSimulator + """Context manager entry. + + Sets ``TOGSIM_CONFIG`` to this instance's config path so that compilation + (``extension_config`` / codegen) uses the same YAML as TOGSim. Previous + value is restored in ``__exit__``. + """ + if "TOGSIM_CONFIG" in os.environ: + self._old_togsim_config_env = os.environ["TOGSIM_CONFIG"] + else: + self._old_togsim_config_env = self._TOGSIM_CONFIG_ENV_UNSET + os.environ["TOGSIM_CONFIG"] = os.path.abspath(self.config_path) + self.old_tog_simulator = torch.npu.get_tog_simulator() torch.npu.set_tog_simulator(self) return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit - automatically cleanup.""" - # Reset global TOGSimulator to None self.until() torch.npu.set_tog_simulator(self.old_tog_simulator) + if self._old_togsim_config_env is self._TOGSIM_CONFIG_ENV_UNSET: + os.environ.pop("TOGSIM_CONFIG", None) + else: + os.environ["TOGSIM_CONFIG"] = self._old_togsim_config_env + def _start_process(self): cmd = f"{self.get_togsim_command(self.config_path, self.base_dir)} --models_list {self.trace_file_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py index 31bcefdf..c910729e 100644 --- a/tests/MLP/test_mlp.py +++ b/tests/MLP/test_mlp.py @@ -281,10 +281,8 @@ def train(model, device): return if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_mlp(device) # test_train_mlp(device) diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index f9c96aff..d4cd98f1 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -807,10 +807,8 @@ def evaluation(model, evaluation_loader): 
train(opt_model, train_loader) if __name__ == "__main__": - from Scheduler.scheduler import PyTorchSimRunner torch.set_printoptions(threshold=float('inf'), linewidth=600) - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_moe(device) # train_moe(device) diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py deleted file mode 100644 index 449707a5..00000000 --- a/tests/test_compile_overhead.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -import time -import sys -import torch -from torchvision.models import resnet18 as model1 -import argparse -import shutil - -sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -if __name__ == "__main__": - target_model1 = model1().eval() - - # Init scheduler - for i in range(1): - timestamp = time.time() # 현재 타임스탬프 (초 단위) - print(f"[{i}] Time Stamp: {timestamp:.6f}") # 소수점 6자리까지 출력 - #try: - # shutil.rmtree("/tmp/torchinductor") - #except FileNotFoundError: - # print("no cache") - scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml") - # Register compiled model - opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) - SchedulerDNNModel.register_model("resnet18", opt_model1) - - # Generate time stamp - for request_time in [0]*12: - # Init input data - model_input1 = torch.randn(1, 3, 224, 224) - - # Init request - new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0) - - # Add request to scheduler - print("[Reqest] Resnet18 request time: ", request_time, flush=True) - 
scheduler.add_request(new_request1, request_time=request_time) - - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() - - print("Done", file=sys.stderr) \ No newline at end of file diff --git a/tests/test_gqa_decode.py b/tests/test_gqa_decode.py index 3605d638..7a7ab06c 100644 --- a/tests/test_gqa_decode.py +++ b/tests/test_gqa_decode.py @@ -6,8 +6,7 @@ import math import argparse from Simulator.simulator import TOGSimulator -from Scheduler.scheduler import PyTorchSimRunner -device = PyTorchSimRunner.setup_device().custom_device() +device = torch.device("npu:0") # ───────────────────────────────────────────────────────────────────────────── # Optimized: Flash-Decode style — tile S upfront, batch in B dimension # ───────────────────────────────────────────────────────────────────────────── diff --git a/tests/test_hetro.py b/tests/test_hetro.py index 9fac8c65..eaf145d4 100644 --- a/tests/test_hetro.py +++ b/tests/test_hetro.py @@ -2,28 +2,31 @@ import sys import torch import argparse -sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request + +sys.path.append(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")) + +from Simulator.simulator import TOGSimulator from test_stonne import sparse_matmul + def custom_matmul(a, b): return torch.matmul(a, b) + + torch.manual_seed(0) -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') +CONFIG_TORCHSIM_DIR = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--M", type=int, default=128, help="Batch size") parser.add_argument("--N", type=int, default=128, help="Input layer size") parser.add_argument("--K", type=int, default=128, help="Hidden layer size") - parser.add_argument("--sparsity", type=float, default=0.9, help="Output layer size") - 
parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="Output layer size") - parser.add_argument("--mode", type=int, default=0, help="Output layer size") + parser.add_argument("--sparsity", type=float, default=0.9, help="Sparsity") + parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="TOGSim config file name under configs/") + parser.add_argument("--mode", type=int, default=0, help="0=spmm only, 1=dense matmul only, 2=both partitions") args = parser.parse_args() - M = args.M - N = args.N - K = args.K + M, N, K = args.M, args.N, args.K sparsity = args.sparsity mode = args.mode config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}" @@ -33,45 +36,30 @@ def custom_matmul(a, b): print("K: ", K) print("sparsity: ", sparsity) - with torch.no_grad(): - # Init scheduler - scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - togsim_config=config_path) - - # Register compiled model - opt_model1 = torch.compile(custom_matmul) - opt_model2 = torch.compile(sparse_matmul) - SchedulerDNNModel.register_model("matmul", opt_model1) - SchedulerDNNModel.register_model("spmm", opt_model2) + device = torch.device("npu:0") - # Init input data - for i in range(1): - dense_input1 = torch.randn(M, K) - dense_input2 = torch.randn(K, N) + opt_model1 = torch.compile(custom_matmul) + opt_model2 = torch.compile(sparse_matmul) - sparse_input1 = torch.randn(128, 128) - sparse_input2 = torch.randn(128, 128) - mask1 = torch.rand(sparse_input1.shape) > sparsity - mask2 = torch.rand(sparse_input2.shape) > sparsity + dense_input1 = torch.randn(M, K, device=device) + dense_input2 = torch.randn(K, N, device=device) - sparse_input1 = sparse_input1 * mask1 - sparse_input2 = sparse_input2 * mask2 + sparse_input1 = torch.randn(128, 128, device=device) + sparse_input2 = torch.randn(128, 128, device=device) + mask1 = torch.rand(sparse_input1.shape, device=device) > sparsity + mask2 = 
torch.rand(sparse_input2.shape, device=device) > sparsity + sparse_input1 = sparse_input1 * mask1 + sparse_input2 = sparse_input2 * mask2 - # Init request + with torch.no_grad(): + with TOGSimulator(config_path=config_path): if mode == 0: - new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0) - scheduler.add_request(new_request1, request_time=0) + torch.npu.launch_model(opt_model2, sparse_input1, sparse_input2, stream_index=0, timestamp=0) elif mode == 1: - new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=0) - scheduler.add_request(new_request2, request_time=0) + torch.npu.launch_model(opt_model1, dense_input1, dense_input2, stream_index=0, timestamp=0) elif mode == 2: - new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0) - new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=1) - - # Add request to scheduler - scheduler.add_request(new_request1, request_time=0) - scheduler.add_request(new_request2, request_time=0) - - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() \ No newline at end of file + torch.npu.launch_model(opt_model2, sparse_input1, sparse_input2, stream_index=0, timestamp=0) + torch.npu.launch_model(opt_model1, dense_input1, dense_input2, stream_index=1, timestamp=0) + else: + raise ValueError(f"unknown mode {mode}") + torch.npu.synchronize() diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 724c10d0..beab8054 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -1,5 +1,4 @@ import os -import sys import torch from torchvision.models import resnet18 as model1 from test_transformer import EncoderBlock as model2 @@ -7,7 +6,6 @@ base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml' -os.environ['TOGSIM_CONFIG'] = config target_model1 = model1().eval() 
target_model2 = model2(768, 12).eval() @@ -24,4 +22,4 @@ torch.npu.synchronize() torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0) torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0) -print("Done") \ No newline at end of file +print("Done") diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py deleted file mode 100644 index 65213ef0..00000000 --- a/tests/test_scheduler_batching.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import sys -import torch -from torchvision.models import resnet18 as model1 -import argparse - -sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim') - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Poisson Request Generator (ms)") - parser.add_argument("lambda_requests", nargs="?", type=int, help="Average requests per second (λ)", default=2000) - parser.add_argument("max_time", nargs="?", type=int, help="Maximum simulation time in milliseconds", default=30) - - args = parser.parse_args() - target_model1 = model1().eval() - - # Init scheduler - scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml") - # Register compiled model - opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False) - SchedulerDNNModel.register_model("resnet18", opt_model1) - - # Generate time stamp - for request_time in poisson_request_generator(args.lambda_requests, args.max_time): - # Init input data - model_input1 = torch.randn(1, 3, 224, 224) - - # Init request - new_request1 = Request("resnet18", [model_input1], [], 
request_queue_idx=0) - - # Add request to scheduler - print("[Reqest] Resnet18 request time: ", request_time, flush=True) - scheduler.add_request(new_request1, request_time=request_time) - - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() - - print("Done", file=sys.stderr) \ No newline at end of file diff --git a/tests/test_sort.py b/tests/test_sort.py index 05afe92b..5bce2532 100644 --- a/tests/test_sort.py +++ b/tests/test_sort.py @@ -115,10 +115,7 @@ def sort_test_unstable(inp): shape = tuple(map(int, args.shape.strip("()").split(","))) - from Scheduler.scheduler import PyTorchSimRunner - - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_sort_stable_suite(device) test_sort_duplicate_cases(device) \ No newline at end of file diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py index 72eda0c8..bb4ff630 100644 --- a/tests/test_sparse_core.py +++ b/tests/test_sparse_core.py @@ -80,9 +80,6 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp import os import sys sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim')) - from Scheduler.scheduler import PyTorchSimRunner - - module = PyTorchSimRunner.setup_device() - device = module.custom_device() + device = torch.device("npu:0") test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64) diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py deleted file mode 100644 index 71594eb2..00000000 --- a/tests/test_spmm_scheduler.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -import torch -import argparse -sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) -from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request -from test_sparse_core import SparseMLP as model1 -from test_transformer import EncoderBlock as model2 -CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', 
default='/workspace/PyTorchSim') - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="") - parser.add_argument("--batch_size", type=int, default=128, help="Batch size") - parser.add_argument("--input_size", type=int, default=128, help="Input layer size") - parser.add_argument("--hidden_size", type=int, default=128, help="Hidden layer size") - parser.add_argument("--output_size", type=int, default=128, help="Output layer size") - parser.add_argument("--w1_sparsity", type=float, default=0.5, help="Sparsity of first layer weights (0 to 1)") - parser.add_argument("--w2_sparsity", type=float, default=0.5, help="Sparsity of second layer weights (0 to 1)") - parser.add_argument("--config", type=str) - args = parser.parse_args() - - batch_size = args.batch_size - input_size = args.input_size - hidden_size = args.hidden_size - output_size = args.output_size - w1_sparsity = args.w1_sparsity - w2_sparsity = args.w2_sparsity - config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}" - - print("batch_size: ", batch_size) - print("input_size: ", input_size) - print("hidden_size: ", hidden_size) - print("output_size: ", output_size) - print("w1_sparsity: ", w1_sparsity) - print("w2_sparsity: ", w2_sparsity) - - with torch.no_grad(): - # Init scheduler - scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, - togsim_config=config_path) - - target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval() - target_model2 = model2(768, 12).eval() - - # Register compiled model - opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device())) - opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device())) - SchedulerDNNModel.register_model("mlp", opt_model1) - SchedulerDNNModel.register_model("bert", opt_model2) - - # Init input data - model_input1 = torch.randn(batch_size, 
input_size) - model_input2 = torch.randn(1, 512, 768) - - # Init request - new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0) - #new_request2 = Request("bert", [model_input2], [], request_queue_idx=1) - - - # Add request to scheduler - scheduler.add_request(new_request1, request_time=0) - #scheduler.add_request(new_request2, request_time=0) - - # Run scheduler - while not scheduler.is_finished(): - scheduler.schedule() \ No newline at end of file diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb index ead695c0..d17a6b25 100644 --- a/tutorial/session1/CompilerOptimization.ipynb +++ b/tutorial/session1/CompilerOptimization.ipynb @@ -35,8 +35,7 @@ "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n", - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb index b6f0e048..d94323db 100644 --- a/tutorial/session1/ExecutionMode.ipynb +++ b/tutorial/session1/ExecutionMode.ipynb @@ -33,8 +33,7 @@ "metadata": {}, "outputs": [], "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb index a49e2440..6fd54aed 100644 --- a/tutorial/session1/Inference.ipynb +++ b/tutorial/session1/Inference.ipynb @@ -57,8 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = 
PyTorchSimRunner.setup_device().custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "torch.manual_seed(0)\n", "input = torch.randn(128, 128).to(device)\n", diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb index a82737db..24dae52b 100644 --- a/tutorial/session1/LogAnalysis.ipynb +++ b/tutorial/session1/LogAnalysis.ipynb @@ -35,8 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb index 684b69c0..0b978bcb 100644 --- a/tutorial/session1/Mapping.ipynb +++ b/tutorial/session1/Mapping.ipynb @@ -33,8 +33,7 @@ "metadata": {}, "outputs": [], "source": [ - "from Scheduler.scheduler import PyTorchSimRunner\n", - "device = PyTorchSimRunner.setup_device().custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb index 0c6b138a..badf7ed7 100644 --- a/tutorial/session1/Training.ipynb +++ b/tutorial/session1/Training.ipynb @@ -20,8 +20,7 @@ "sys.path.append(base_dir)\n", "\n", "cpu_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "from Scheduler.scheduler import PyTorchSimRunner\n", - "npu_device = PyTorchSimRunner.setup_device().custom_device()" + "npu_device = torch.device(\"npu:0\")" ] }, { diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb index 2964f293..9a7c35e3 100644 --- a/tutorial/session2/Hands_on.ipynb +++ b/tutorial/session2/Hands_on.ipynb @@ -35,9 +35,7 @@ "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), 
\"togsim_results\")\n", "sys.path.append(base_dir)\n", "\n", - "from Scheduler.scheduler import PyTorchSimRunner\n", - "module = PyTorchSimRunner.setup_device()\n", - "device = module.custom_device()\n", + "device = torch.device(\"npu:0\")\n", "\n", "def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):\n", " if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", From 8ca844a6b1227839208874502e8680d74d390fd3 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 26 Mar 2026 20:56:11 +0900 Subject: [PATCH 156/194] [Frontend/MobileNet] Add MobileNet CI and 1x1 spatial conv linear decomposition (#205) --- .github/workflows/pytorchsim_test.yml | 19 ++++ PyTorchSimFrontend/mlir/mlir_conv_common.py | 2 +- PyTorchSimFrontend/mlir/mlir_decomposition.py | 100 ++++++++++++++++-- tests/test_conv2d.py | 1 + 4 files changed, 111 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml index 2a9d60a1..a7613b6e 100644 --- a/.github/workflows/pytorchsim_test.yml +++ b/.github/workflows/pytorchsim_test.yml @@ -328,6 +328,25 @@ jobs: -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50 + test_mobilenet: + name: Run test_mobilenet.py + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_mobilenet.py + run: | + echo "Running test_mobilenet.py" + docker run --rm \ + -e vpu_num_lanes="${{ inputs.vector_lane }}" \ + -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/MobileNet/test_mobilenet.py + test_transformer: name: Run test_transformer.py runs-on: self-hosted diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py index 386e9bd5..d577dbd8 
100644 --- a/PyTorchSimFrontend/mlir/mlir_conv_common.py +++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py @@ -123,6 +123,6 @@ def compute_stride(shape): return stride X_stride = compute_stride(X_shape) - arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) + arg_attributes.append([X.get_name(), [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]]) return arg_attributes diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py index 122c2677..0f443cf8 100644 --- a/PyTorchSimFrontend/mlir/mlir_decomposition.py +++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py @@ -16,6 +16,73 @@ def _pair_2d(seq: Sequence[int]) -> Tuple[int, int]: return int(seq[0]), int(seq[1]) +def _int_eq(x, v: int) -> bool: + try: + return int(x) == v + except (TypeError, ValueError): + return False + + +def _can_rewrite_pointwise_conv_on_1x1_spatial_to_linear( + input: torch.Tensor, + weight: torch.Tensor, + stride: Sequence[int], + padding: Sequence[int], + dilation: Sequence[int], + transposed: bool, + output_padding: Sequence[int], + groups: int, +) -> bool: + """ + Whether this ``aten.convolution`` is **exactly** ``F.linear`` on ``[N, C]`` (then reshaped + to ``[N, C_out, 1, 1]``): 1x1 kernel, spatial size 1x1, ``groups==1``, stride 1, no padding, + dilation 1 (typical SE line after global pool). + + If True, use ``_apply_pointwise_conv_on_1x1_spatial_as_linear``; if False, keep normal conv. 
+ """ + if transposed or input.dim() != 4 or weight.dim() != 4: + return False + if groups != 1: + return False + if not ( + _int_eq(input.shape[2], 1) + and _int_eq(input.shape[3], 1) + and _int_eq(weight.shape[2], 1) + and _int_eq(weight.shape[3], 1) + ): + return False + + sh, sw = _pair_2d(stride) + ph, pw = _pair_2d(padding) + dh, dw = _pair_2d(dilation) + if sh != 1 or sw != 1 or ph != 0 or pw != 0 or dh != 1 or dw != 1: + return False + if len(output_padding) and any(not _int_eq(o, 0) for o in output_padding): + return False + + _, cin, _, _ = input.shape + _, cin_w, _, _ = weight.shape + try: + if int(cin_w) != int(cin): + return False + except (TypeError, ValueError): + return False + return True + + +def _apply_pointwise_conv_on_1x1_spatial_as_linear( + input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + """Same numerics as ``convolution``; call only when ``_can_rewrite_...`` is True.""" + n, cin, _, _ = input.shape + cout, _, _, _ = weight.shape + x = input.reshape(n, cin) + w = weight.reshape(cout, cin) + return F.linear(x, w, bias).reshape(n, cout, 1, 1) + + def _group_conv_cin1_cout1( input: torch.Tensor, weight: torch.Tensor, @@ -70,7 +137,7 @@ def _group_conv_cin1_cout1( @register_decomposition(aten.convolution.default) -def decompose_group_convolution( +def decompose_convolution( input: torch.Tensor, weight: torch.Tensor, bias: Union[torch.Tensor, None], @@ -82,23 +149,36 @@ def decompose_group_convolution( groups: Union[int, torch.SymInt], ): """ - Lower grouped ``aten.convolution`` only when each group has a single input and output - channel (``Cin//groups == Cout//groups == 1``), via ``_group_conv_cin1_cout1``. + 1. Pointwise 1x1 on spatial 1x1 (groups==1): rewrite to F.linear so backends + that struggle with tiny spatial convs (e.g. SE after AdaptiveAvgPool2d(1)) see + aten.mm / linear lowering instead. + + 2. Grouped conv when Cin//groups == Cout//groups == 1: _group_conv_cin1_cout1. 
+ + Otherwise returns NotImplemented (Inductor uses the default aten.convolution). Note ---- - The lowered path is not a performance-optimized kernel; it exists for correctness and - lowering experiments. For speed, implement a separate template (fused) kernel for group - convolution. - - Non-static ``groups`` (cannot ``int()``) falls back: returns ``NotImplemented`` so the - default ``aten.convolution`` is used. ``groups==1`` also returns ``NotImplemented``. + The grouped path is not performance-optimized; it exists for correctness experiments. """ try: gcount = operator.index(groups) except (TypeError, ValueError): return NotImplemented - # groups==1: do not decompose; Inductor keeps the default aten.convolution (plain conv). + + if _can_rewrite_pointwise_conv_on_1x1_spatial_to_linear( + input, + weight, + stride, + padding, + dilation, + transposed, + output_padding, + gcount, + ): + return _apply_pointwise_conv_on_1x1_spatial_as_linear(input, weight, bias) + + # groups==1, non-1x1 spatial: keep default aten.convolution (plain conv). 
if gcount == 1: return NotImplemented diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py index 533a04db..313003b1 100644 --- a/tests/test_conv2d.py +++ b/tests/test_conv2d.py @@ -50,3 +50,4 @@ def custom_conv2d(a, b, bias): test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0) test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0) test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0) + test_conv2d(device, batch_size=1, in_channels=8, out_channels=16, input_size=1, kernel_size=1,stride=1, padding=0) From 6f747224377f839f9e1dac62aaa08bf874c6b85c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 31 Mar 2026 18:43:13 +0900 Subject: [PATCH 157/194] [Test] Add missing mobilenet test script --- tests/MobileNet/test_mobilenet.py | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 tests/MobileNet/test_mobilenet.py diff --git a/tests/MobileNet/test_mobilenet.py b/tests/MobileNet/test_mobilenet.py new file mode 100644 index 00000000..966d479a --- /dev/null +++ b/tests/MobileNet/test_mobilenet.py @@ -0,0 +1,106 @@ +import argparse +import copy +import os + +import torch +import torch._dynamo +import torch.utils.cpp_extension +from torchvision.models import mobilenet_v2 + + +def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4): + if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol): + message = f"|{name} Test Passed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + else: + message = f"|{name} Test Failed|" + print("-" * len(message)) + print(message) + print("-" * len(message)) + print("custom out: ", out.cpu()) + print("cpu out: ", cpu_out) + exit(1) + + +def _mobilenet_v2(): + try: + from torchvision.models import MobileNet_V2_Weights + + return 
mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).cpu().eval() + except Exception: + return mobilenet_v2().cpu().eval() + + +def run_mobilenet(batch, config): + device = torch.device("npu:0") + + torch._dynamo.config.recompile_limit = 64 + torch._dynamo.config.cache_size_limit = 128 + + model = _mobilenet_v2() + imgsz = 224 + x = torch.randn(batch, 3, imgsz, imgsz) + + model_cpu = copy.deepcopy(model).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + if isinstance(y_cpu, (list, tuple)): + for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)): + test_result(f"MobileNet Output {i}", out_npu, out_cpu) + else: + test_result("MobileNet Output", y_npu, y_cpu) + + print("MobileNet Simulation Done") + + +def test_inverted_residual_module(device, batch=1, inp=32, oup=32, stride=1, expand_ratio=6, h=28, w=28): + from torchvision.models.mobilenetv2 import InvertedResidual + + torch.manual_seed(0) + + x = torch.randn(batch, inp, h, w) + + model_cpu = InvertedResidual(inp, oup, stride, expand_ratio).cpu().eval() + x_cpu = copy.deepcopy(x).cpu() + y_cpu = model_cpu(x_cpu) + + model_npu = model_cpu.to(device).eval() + x_npu = copy.deepcopy(x).to(device) + compiled_model_npu = torch.compile(dynamic=False)(model_npu) + y_npu = compiled_model_npu(x_npu) + + test_result("InvertedResidual Module", y_npu, y_cpu) + print("InvertedResidual Module Test Done") + + +if __name__ == "__main__": + base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim") + config = os.environ.get( + "TOGSIM_CONFIG", + default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml", + ) + args = argparse.ArgumentParser() + args.add_argument("--batch", type=int, default=1) + args.add_argument("--dump_path", type=str, default="results") + args = args.parse_args() + batch = 
args.batch + + device = torch.device("npu:0") + + # print("\n" + "=" * 80) + # print("Testing InvertedResidual Module") + # print("=" * 80) + # test_inverted_residual_module(device, batch=batch, inp=32, oup=32, stride=1, expand_ratio=6, h=28, w=28) + + print("\n" + "=" * 80) + print("Testing Full MobileNet V2 Model") + print("=" * 80) + run_mobilenet(batch, config) From 7b6cfe549aefea70ab7830edf8b474a73c60de2a Mon Sep 17 00:00:00 2001 From: HamHyungkyu Date: Sat, 4 Apr 2026 16:30:12 +0900 Subject: [PATCH 158/194] [TOGSim] Migration to Ramulator2.1 -Update Ramulator version to 2.1 -Update Ramulator2 DRAM configs --- TOGSim/extern/ramulator2 | 2 +- configs/ramulator2_configs/DDR4.yaml | 446 +++++++++++++++++- configs/ramulator2_configs/HBM2.yaml | 501 ++++++++++++++++++++- configs/ramulator2_configs/HBM2_TPUv3.yaml | 501 ++++++++++++++++++++- configs/ramulator2_configs/LPDDR5.yaml | 494 ++++++++++++++++++++ configs/ramulator2_configs/LPDDR5X.yaml | 494 ++++++++++++++++++++ configs/ramulator2_configs/gen_configs.py | 109 +++++ 7 files changed, 2471 insertions(+), 76 deletions(-) create mode 100644 configs/ramulator2_configs/LPDDR5.yaml create mode 100644 configs/ramulator2_configs/LPDDR5X.yaml create mode 100644 configs/ramulator2_configs/gen_configs.py diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2 index 49556128..70e85563 160000 --- a/TOGSim/extern/ramulator2 +++ b/TOGSim/extern/ramulator2 @@ -1 +1 @@ -Subproject commit 495561282d99f2ef2652618710e98c4a287025da +Subproject commit 70e855630b7f582bc8fa7370bfd582dc71d8af63 diff --git a/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml index e65528ed..45799436 100644 --- a/configs/ramulator2_configs/DDR4.yaml +++ b/configs/ramulator2_configs/DDR4.yaml @@ -1,25 +1,421 @@ -Frontend: - impl: GEM5 - -MemorySystem: - impl: GenericDRAM - clock_ratio: 1 - - DRAM: - impl: DDR4 - org: - preset: DDR4_16Gb_x4 - channel: 1 - timing: - preset: DDR4_1600J - - Controller: - impl: Generic - 
Scheduler: - impl: FRFCFS - RefreshManager: - impl: AllBank - plugins: - - AddrMapper: - impl: RoBaRaCoCh \ No newline at end of file +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "GenericDDR", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 32, + "write_buffer_size": 32, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + "refresh_manager": { + "impl": "AllBank", + "scope": "Rank" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "DDR4", + "org": { + "dq": 8, + "count": [ + 1, + 1, + 4, + 4, + 65536, + 1024 + ] + }, + "timing": [ + 3200, + 4, + 22, + 22, + 22, + 52, + 74, + 24, + 12, + 16, + 4, + 8, + 4, + 8, + 4, + 12, + 34, + 576, + 12480, + 2, + 625 + ], + "channel_width": 64, + "read_latency": 26, + "timing_constraints": [ + [ + 0, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 4 + ], + [ + 0, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 4 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 4 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 4 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 4, + 6 + ], + 12 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 24 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 4, + 5, + 6 + ], + 6, + 1, + true + ], + [ + 1, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 0, + 1, + true + ], + [ + 1, + [ + 3 + ], + [ + 2 + ], + 12 + ], + [ + 1, + [ + 4 + ], + [ + 2 + ], + 44 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 34, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 2 + ], + 52 + ], + [ + 1, + [ + 2 + ], + [ + 0 + ], + 22 + ], + [ + 1, + [ + 0 + ], + [ + 7 + ], + 74 + ], + [ + 1, + [ + 1, + 2 + ], + [ + 7 + ], + 22 + ], + [ + 1, + [ + 5 + ], + [ + 7 + ], + 34 + ], + [ + 1, + [ + 6 + ], + [ + 7 + ], + 66 + ], + [ + 1, + [ + 7 + ], + [ + 0, + 2 
+ ], + 576 + ], + [ + 2, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 8 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 8 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 32 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 8 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 74 + ], + [ + 3, + [ + 0 + ], + [ + 3, + 4, + 5, + 6 + ], + 22 + ], + [ + 3, + [ + 0 + ], + [ + 1 + ], + 52 + ], + [ + 3, + [ + 1 + ], + [ + 0 + ], + 22 + ], + [ + 3, + [ + 3 + ], + [ + 1 + ], + 12 + ], + [ + 3, + [ + 4 + ], + [ + 1 + ], + 44 + ], + [ + 3, + [ + 5 + ], + [ + 0 + ], + 34 + ], + [ + 3, + [ + 6 + ], + [ + 0 + ], + 66 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml index 70cddef0..2bdd1705 100644 --- a/configs/ramulator2_configs/HBM2.yaml +++ b/configs/ramulator2_configs/HBM2.yaml @@ -1,25 +1,476 @@ -Frontend: - impl: GEM5 - -MemorySystem: - impl: GenericDRAM - clock_ratio: 1 - - DRAM: - impl: HBM2 - org: - preset: HBM2_8Gb - channel: 1 - timing: - preset: HBM2_1.4Gbps - - Controller: - impl: Generic - Scheduler: - impl: FRFCFS - RefreshManager: - impl: AllBank - plugins: - - AddrMapper: - impl: RoBaRaCoCh \ No newline at end of file +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "GenericDDR", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 32, + "write_buffer_size": 32, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + "refresh_manager": { + "impl": "AllBank", + "scope": "PseudoChannel" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "HBM2", + "org": { + "dq": 64, + "count": [ + 1, + 2, + 4, + 4, + 65536, + 32 + ] + }, + "timing": [ + 2000, + 2, + 14, + 14, + 12, + 14, + 34, + 48, + 16, + 5, + 5, + 2, + 4, 
+ 4, + 4, + 6, + 8, + 15, + 350, + 160, + 8, + 3900, + 122, + 1000 + ], + "channel_width": 64, + "read_latency": 16, + "timing_constraints": [ + [ + 0, + [ + 0 + ], + [ + 0, + 1, + 2, + 7, + 8 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 4, + 6 + ], + 13 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 13 + ], + [ + 1, + [ + 3 + ], + [ + 2 + ], + 5 + ], + [ + 1, + [ + 4 + ], + [ + 2 + ], + 23 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 15, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 2 + ], + 35 + ], + [ + 1, + [ + 2 + ], + [ + 0 + ], + 13 + ], + [ + 1, + [ + 0 + ], + [ + 7 + ], + 49 + ], + [ + 1, + [ + 1, + 2 + ], + [ + 7 + ], + 14 + ], + [ + 1, + [ + 5 + ], + [ + 7 + ], + 19 + ], + [ + 1, + [ + 6 + ], + [ + 7 + ], + 37 + ], + [ + 1, + [ + 7 + ], + [ + 0 + ], + 349 + ], + [ + 1, + [ + 7 + ], + [ + 2 + ], + 350 + ], + [ + 1, + [ + 8 + ], + [ + 0 + ], + 7 + ], + [ + 1, + [ + 0 + ], + [ + 8 + ], + 5 + ], + [ + 2, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 15 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 48 + ], + [ + 3, + [ + 0 + ], + [ + 3, + 5 + ], + 15 + ], + [ + 3, + [ + 0 + ], + [ + 4, + 6 + ], + 13 + ], + [ + 3, + [ + 0 + ], + [ + 1 + ], + 35 + ], + [ + 3, + [ + 1 + ], + [ + 0 + ], + 13 + ], + [ + 3, + [ + 3 + ], + [ + 1 + ], + 5 + ], + [ + 3, + [ + 4 + ], + [ + 1 + ], + 23 + ], + [ + 3, + [ + 5 + ], + [ + 0 + ], + 18 + ], + [ + 3, + [ + 6 + ], + [ + 0 + ], + 36 + ], + [ + 3, + [ + 8 + ], + [ + 0 + ], + 159 + ], + [ + 3, + [ + 0 + ], + [ + 8 + ], + 49 + ], + [ + 3, + [ + 1 + ], + [ + 8 + ], + 14 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git 
a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml index e6543d14..2bdd1705 100644 --- a/configs/ramulator2_configs/HBM2_TPUv3.yaml +++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml @@ -1,25 +1,476 @@ -Frontend: - impl: GEM5 - -MemorySystem: - impl: GenericDRAM - clock_ratio: 1 - - DRAM: - impl: HBM2 - org: - preset: HBM2_8Gb - channel: 1 - timing: - preset: HBM2_1.8Gbps - - Controller: - impl: Generic - Scheduler: - impl: FRFCFS - RefreshManager: - impl: AllBank - plugins: - - AddrMapper: - impl: RoBaRaCoCh \ No newline at end of file +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "GenericDDR", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 32, + "write_buffer_size": 32, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + "refresh_manager": { + "impl": "AllBank", + "scope": "PseudoChannel" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "HBM2", + "org": { + "dq": 64, + "count": [ + 1, + 2, + 4, + 4, + 65536, + 32 + ] + }, + "timing": [ + 2000, + 2, + 14, + 14, + 12, + 14, + 34, + 48, + 16, + 5, + 5, + 2, + 4, + 4, + 4, + 6, + 8, + 15, + 350, + 160, + 8, + 3900, + 122, + 1000 + ], + "channel_width": 64, + "read_latency": 16, + "timing_constraints": [ + [ + 0, + [ + 0 + ], + [ + 0, + 1, + 2, + 7, + 8 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 4, + 6 + ], + 13 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 13 + ], + [ + 1, + [ + 3 + ], + [ + 2 + ], + 5 + ], + [ + 1, + [ + 4 + ], + [ + 2 + ], + 23 + ], + [ + 1, + [ 
+ 0 + ], + [ + 0 + ], + 4 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 15, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 2 + ], + 35 + ], + [ + 1, + [ + 2 + ], + [ + 0 + ], + 13 + ], + [ + 1, + [ + 0 + ], + [ + 7 + ], + 49 + ], + [ + 1, + [ + 1, + 2 + ], + [ + 7 + ], + 14 + ], + [ + 1, + [ + 5 + ], + [ + 7 + ], + 19 + ], + [ + 1, + [ + 6 + ], + [ + 7 + ], + 37 + ], + [ + 1, + [ + 7 + ], + [ + 0 + ], + 349 + ], + [ + 1, + [ + 7 + ], + [ + 2 + ], + 350 + ], + [ + 1, + [ + 8 + ], + [ + 0 + ], + 7 + ], + [ + 1, + [ + 0 + ], + [ + 8 + ], + 5 + ], + [ + 2, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 15 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 48 + ], + [ + 3, + [ + 0 + ], + [ + 3, + 5 + ], + 15 + ], + [ + 3, + [ + 0 + ], + [ + 4, + 6 + ], + 13 + ], + [ + 3, + [ + 0 + ], + [ + 1 + ], + 35 + ], + [ + 3, + [ + 1 + ], + [ + 0 + ], + 13 + ], + [ + 3, + [ + 3 + ], + [ + 1 + ], + 5 + ], + [ + 3, + [ + 4 + ], + [ + 1 + ], + 23 + ], + [ + 3, + [ + 5 + ], + [ + 0 + ], + 18 + ], + [ + 3, + [ + 6 + ], + [ + 0 + ], + 36 + ], + [ + 3, + [ + 8 + ], + [ + 0 + ], + 159 + ], + [ + 3, + [ + 0 + ], + [ + 8 + ], + 49 + ], + [ + 3, + [ + 1 + ], + [ + 8 + ], + 14 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git a/configs/ramulator2_configs/LPDDR5.yaml b/configs/ramulator2_configs/LPDDR5.yaml new file mode 100644 index 00000000..bf039f9f --- /dev/null +++ b/configs/ramulator2_configs/LPDDR5.yaml @@ -0,0 +1,494 @@ +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "GenericDDR", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 32, + "write_buffer_size": 32, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + 
"refresh_manager": { + "impl": "AllBank", + "scope": "Rank" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "LPDDR5", + "org": { + "dq": 16, + "count": [ + 1, + 1, + 4, + 4, + 32768, + 1024 + ] + }, + "timing": [ + 6400, + 2, + 17, + 15, + 15, + 17, + 34, + 49, + 28, + 8, + 9, + 2, + 2, + 4, + 2, + 4, + 4, + 4, + 5, + 10, + 16, + 168, + 96, + 3125, + 391, + 1, + 0, + 8, + 2, + 1250 + ], + "channel_width": 16, + "read_latency": 19, + "timing_constraints": [ + [ + 0, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 2 + ], + [ + 0, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 2 + ], + [ + 3, + [ + 4 + ], + [ + 6, + 8 + ], + 0 + ], + [ + 3, + [ + 5 + ], + [ + 7, + 9 + ], + 0 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 2 + ], + [ + 1, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 2 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 7, + 9 + ], + 12 + ], + [ + 1, + [ + 7, + 9 + ], + [ + 6, + 8 + ], + 16 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 6, + 7, + 8, + 9 + ], + 4, + 1, + true + ], + [ + 1, + [ + 7, + 9 + ], + [ + 6, + 8 + ], + 12, + 1, + true + ], + [ + 1, + [ + 6 + ], + [ + 3 + ], + 8 + ], + [ + 1, + [ + 7 + ], + [ + 3 + ], + 39 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 16, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 3 + ], + 34 + ], + [ + 1, + [ + 3 + ], + [ + 0 + ], + 17 + ], + [ + 1, + [ + 2, + 3 + ], + [ + 2, + 3 + ], + 2 + ], + [ + 1, + [ + 0 + ], + [ + 10 + ], + 49 + ], + [ + 1, + [ + 2, + 3 + ], + [ + 10 + ], + 15 + ], + [ + 1, + [ + 8 + ], + [ + 10 + ], + 23 + ], + [ + 1, + [ + 9 + ], + [ + 10 + ], + 54 + ], + [ + 1, + [ + 10 + ], + [ + 0, + 3 + ], + 168 + ], + [ + 2, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 4 + ], + [ + 2, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 4 + ], + [ + 2, + [ + 7, + 9 + ], + [ + 6, + 8 + ], + 21 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 49 + ], + [ + 3, + [ + 0 + ], + [ + 6, + 7, + 8, + 9 + ], + 15 + ], + [ + 3, + [ + 0 + ], + 
[ + 2 + ], + 34 + ], + [ + 3, + [ + 2 + ], + [ + 0 + ], + 15 + ], + [ + 3, + [ + 6 + ], + [ + 2 + ], + 8 + ], + [ + 3, + [ + 7 + ], + [ + 2 + ], + 39 + ], + [ + 3, + [ + 8 + ], + [ + 0 + ], + 23 + ], + [ + 3, + [ + 9 + ], + [ + 0 + ], + 54 + ], + [ + 3, + [ + 11 + ], + [ + 0 + ], + 96 + ], + [ + 3, + [ + 0 + ], + [ + 11 + ], + 49 + ], + [ + 3, + [ + 2 + ], + [ + 11 + ], + 15 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git a/configs/ramulator2_configs/LPDDR5X.yaml b/configs/ramulator2_configs/LPDDR5X.yaml new file mode 100644 index 00000000..4309aa6c --- /dev/null +++ b/configs/ramulator2_configs/LPDDR5X.yaml @@ -0,0 +1,494 @@ +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "GenericDDR", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 32, + "write_buffer_size": 32, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + "refresh_manager": { + "impl": "AllBank", + "scope": "Rank" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "LPDDR5", + "org": { + "dq": 16, + "count": [ + 1, + 1, + 4, + 4, + 32768, + 1024 + ] + }, + "timing": [ + 8533, + 2, + 23, + 20, + 20, + 23, + 46, + 65, + 38, + 11, + 12, + 2, + 2, + 4, + 2, + 4, + 6, + 6, + 7, + 14, + 22, + 224, + 128, + 4165, + 521, + 1, + 0, + 8, + 2, + 938 + ], + "channel_width": 16, + "read_latency": 25, + "timing_constraints": [ + [ + 0, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 2 + ], + [ + 0, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 2 + ], + [ + 3, + [ + 4 + ], + [ + 6, + 8 + ], + 0 + ], + [ + 3, + [ + 5 + ], + [ + 7, + 9 + ], + 0 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 2 + ], + [ + 1, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 2 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 7, + 9 + ], + 15 + ], + [ + 1, + [ + 7, + 9 + 
], + [ + 6, + 8 + ], + 21 + ], + [ + 1, + [ + 6, + 8 + ], + [ + 6, + 7, + 8, + 9 + ], + 4, + 1, + true + ], + [ + 1, + [ + 7, + 9 + ], + [ + 6, + 8 + ], + 15, + 1, + true + ], + [ + 1, + [ + 6 + ], + [ + 3 + ], + 11 + ], + [ + 1, + [ + 7 + ], + [ + 3 + ], + 52 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 6 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 22, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 3 + ], + 46 + ], + [ + 1, + [ + 3 + ], + [ + 0 + ], + 23 + ], + [ + 1, + [ + 2, + 3 + ], + [ + 2, + 3 + ], + 2 + ], + [ + 1, + [ + 0 + ], + [ + 10 + ], + 65 + ], + [ + 1, + [ + 2, + 3 + ], + [ + 10 + ], + 20 + ], + [ + 1, + [ + 8 + ], + [ + 10 + ], + 31 + ], + [ + 1, + [ + 9 + ], + [ + 10 + ], + 72 + ], + [ + 1, + [ + 10 + ], + [ + 0, + 3 + ], + 224 + ], + [ + 2, + [ + 6, + 8 + ], + [ + 6, + 8 + ], + 4 + ], + [ + 2, + [ + 7, + 9 + ], + [ + 7, + 9 + ], + 4 + ], + [ + 2, + [ + 7, + 9 + ], + [ + 6, + 8 + ], + 28 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 6 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 65 + ], + [ + 3, + [ + 0 + ], + [ + 6, + 7, + 8, + 9 + ], + 20 + ], + [ + 3, + [ + 0 + ], + [ + 2 + ], + 46 + ], + [ + 3, + [ + 2 + ], + [ + 0 + ], + 20 + ], + [ + 3, + [ + 6 + ], + [ + 2 + ], + 11 + ], + [ + 3, + [ + 7 + ], + [ + 2 + ], + 52 + ], + [ + 3, + [ + 8 + ], + [ + 0 + ], + 31 + ], + [ + 3, + [ + 9 + ], + [ + 0 + ], + 72 + ], + [ + 3, + [ + 11 + ], + [ + 0 + ], + 128 + ], + [ + 3, + [ + 0 + ], + [ + 11 + ], + 65 + ], + [ + 3, + [ + 2 + ], + [ + 11 + ], + 20 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py new file mode 100644 index 00000000..64eb62d2 --- /dev/null +++ b/configs/ramulator2_configs/gen_configs.py @@ -0,0 +1,109 @@ +""" +Generate machine-readable ramulator2 v2.1 config files for PyTorchSim. + +Usage: + python gen_configs.py + +Each function generates a JSON config that C++ can load directly via +Config::parse_config_file(). No preset resolution happens in C++ anymore. 
+""" + +import json +import sys +import os + +# Add ramulator2 Python DSL to path +RAMULATOR_PYTHON = os.path.join(os.path.dirname(__file__), + "../../TOGSim/extern/ramulator2/python") +sys.path.insert(0, RAMULATOR_PYTHON) + +import ramulator +import ramulator.dram +import ramulator.controller +import ramulator.scheduler +import ramulator.refresh_manager +import ramulator.row_policy +import ramulator.addr_mapper +import ramulator.channel_mapper +import ramulator.memory_system + + +def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"): + """Wrap a DRAM object in a single-channel GenericDRAM config for PyTorchSim. + + PyTorchSim creates one Ramulator2 instance per channel, so each config + always has exactly one controller (channel=1 in org is enforced by v2.1). + The wrapper overrides 'frontend' to ExternalFrontEnd automatically. + + refresh_scope: level name for AllBank refresh. + - DDR4 / LPDDR5 / LPDDR5X → "Rank" + - HBM2 / HBM3 → "PseudoChannel" + """ + ctrl = ramulator.controller.GenericDDR( + dram=dram_obj, + scheduler=ramulator.scheduler.FRFCFS(), + refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope), + row_policy=ramulator.row_policy.Open(), + addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(), + ) + ms = ramulator.memory_system.GenericDRAM( + clock_ratio=clock_ratio, + controllers=[ctrl], + # Single-channel per Ramulator2 instance — passthrough maps everything to ch 0 + channel_mapper=ramulator.channel_mapper.PassThroughChannelMapper(), + ) + return { + "frontend": {"impl": "External", "clock_ratio": 1}, + "memory_system": ms.to_config(), + } + + +def gen_hbm2(): + # Available timing presets: HBM2_1600Mbps, HBM2_2000Mbps, HBM2_2400Mbps + # HBM2 has no Rank level — AllBank refresh scope must be PseudoChannel + dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps") + return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel") + + +def gen_hbm2_tpuv3(): + # TPUv3 HBM2: 900MHz → ~1.8 Gbps. 
Closest available preset: HBM2_2000Mbps + dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps") + return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel") + + +def gen_ddr4(): + # Available timing presets — check python/ramulator/dram/ddr4.py + dram = ramulator.dram.DDR4(org_preset="DDR4_8Gb_x8", timing_preset="DDR4_3200AA") + return make_config(dram, clock_ratio=1) + + +def gen_lpddr5(): + dram = ramulator.dram.LPDDR5(org_preset="LPDDR5_8Gb_x16", timing_preset="LPDDR5_6400") + return make_config(dram, clock_ratio=1) + + +def gen_lpddr5x(): + # LPDDR5X_8533: 8533 MT/s, tCK=938ps, CK=1066MHz + dram = ramulator.dram.LPDDR5(org_preset="LPDDR5_8Gb_x16", timing_preset="LPDDR5X_8533") + return make_config(dram, clock_ratio=1) + + +CONFIGS = { + "HBM2.yaml": gen_hbm2, + "HBM2_TPUv3.yaml": gen_hbm2_tpuv3, + "DDR4.yaml": gen_ddr4, + "LPDDR5.yaml": gen_lpddr5, + "LPDDR5X.yaml": gen_lpddr5x, +} + + +if __name__ == "__main__": + out_dir = os.path.dirname(os.path.abspath(__file__)) + for filename, gen_fn in CONFIGS.items(): + cfg = gen_fn() + out_path = os.path.join(out_dir, filename) + with open(out_path, "w") as f: + # json is valid yaml — C++ parse_config_file reads either + json.dump(cfg, f, indent=2) + print(f"Generated {out_path}") + From dd991c137c186dc06798b9fe6ef94295198f05f7 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 9 Apr 2026 21:30:27 +0900 Subject: [PATCH 159/194] [CI] Add thirdparty release manifest; pin base image tag and build on demand in docker-image workflow --- .github/workflows/docker-base-image-2-8.yml | 71 ---------- .github/workflows/docker-image-2-8.yml | 69 --------- .github/workflows/docker-image.yml | 149 ++++++++++++++++++++ .github/workflows/tag_release.yml | 76 +++++++++- scripts/ci/thirdparty_base_pin.sh | 6 + scripts/ci/thirdparty_github_asset_env.sh | 54 +++++++ thirdparty/github-releases.json | 19 +++ 7 files changed, 303 insertions(+), 141 deletions(-) delete mode 100644 
.github/workflows/docker-base-image-2-8.yml delete mode 100644 .github/workflows/docker-image-2-8.yml create mode 100644 .github/workflows/docker-image.yml create mode 100755 scripts/ci/thirdparty_base_pin.sh create mode 100755 scripts/ci/thirdparty_github_asset_env.sh create mode 100644 thirdparty/github-releases.json diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml deleted file mode 100644 index 74e81e07..00000000 --- a/.github/workflows/docker-base-image-2-8.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: Docker Base Image CI (PyTorch 2.8) - -on: - push: - branches: [ "base_v2.8" ] - workflow_dispatch: - repository_dispatch: - types: [ build_base ] - -jobs: - build: - runs-on: ubuntu-latest - - permissions: - contents: read - packages: write - - steps: - - name: Checkout Code - uses: actions/checkout@v4 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set environment - env: - GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [ -n "${{ github.event.pull_request.head.sha }}" ]; then - echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" - else - echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{ github.sha }}" - fi - - gem5_response_file=/tmp/releases-gem5-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} - GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file}) - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} - LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file}) - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" 
- echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - spike_response_file=/tmp/releases-spike-latest.json - curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file} - SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file}) - echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" - echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV - - - name: Build and Push Docker Image (PyTorch 2.8) - uses: docker/build-push-action@v4 - with: - context: . - file: ./Dockerfile.base - push: true - build-args: | - PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} - tags: | - ghcr.io/psal-postech/torchsim_base_2_8:latest diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml deleted file mode 100644 index 52464dff..00000000 --- a/.github/workflows/docker-image-2-8.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: Docker image CI (PyTorch 2.8) - -on: - pull_request: - branches: [ "master", "develop" ] - workflow_dispatch: - -jobs: - build-and-test: - runs-on: self-hosted - - permissions: - contents: read - packages: write - - steps: - - name: Checkout Code - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - - - name: Login to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and Push Docker Image (PyTorch 2.8) - uses: docker/build-push-action@v6 - with: - context: . - file: ./Dockerfile - push: true - no-cache: true - build-args: | - BASE_IMAGE=ghcr.io/psal-postech/torchsim_base_2_8:latest - tags: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} - - - name: Wait for GHCR propagation - run: | - for i in {1..30}; do - echo "Checking if image exists in GHCR (attempt $i)..." 
- if docker manifest inspect ghcr.io/psal-postech/torchsim-test-2-8:${GITHUB_SHA} > /dev/null 2>&1; then - echo "Image is now available in GHCR." - exit 0 - fi - echo "Image not yet available, retrying in 30 seconds..." - sleep 20 - done - echo "Image did not become available in GHCR within expected time." - exit 1 - - test-pytorchsim-wrapper1: - needs: build-and-test - uses: ./.github/workflows/pytorchsim_test.yml - with: - image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} - vector_lane: 128 - spad_size: 128 - - test-pytorchsim-wrapper2: - needs: build-and-test - uses: ./.github/workflows/pytorchsim_test.yml - with: - image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }} - vector_lane: 32 - spad_size: 32 diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 00000000..67140c89 --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,149 @@ +name: Docker image CI + +on: + pull_request: + branches: [ "master", "develop" ] + workflow_dispatch: + +env: + BASE_IMAGE_REPO: ghcr.io/psal-postech/torchsim_base + # PR: head commit; otherwise workflow_dispatch uses the branch SHA + SOURCE_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + +jobs: + ensure-base: + runs-on: ubuntu-latest + outputs: + base_image: ${{ steps.pin.outputs.base_image }} + permissions: + contents: read + packages: write + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + with: + ref: ${{ env.SOURCE_SHA }} + submodules: recursive + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: PyTorch base image from manifest + run: | + PYTORCH_IMAGE=$(python3 -c "import json; from pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')") + if [ -z 
"$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi + echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV" + + - name: Thirdparty pin + id: pin + run: | + PIN="$(bash scripts/ci/thirdparty_base_pin.sh)" + echo "pin=${PIN}" >> "$GITHUB_OUTPUT" + echo "base_image=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_OUTPUT" + echo "BASE_IMAGE=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_ENV" + + - name: Check base image exists + id: exists + run: | + if docker manifest inspect "${BASE_IMAGE}" > /dev/null 2>&1; then + echo "ok=true" >> "$GITHUB_OUTPUT" + else + echo "ok=false" >> "$GITHUB_OUTPUT" + fi + + - name: Resolve GitHub release asset IDs + if: steps.exists.outputs.ok != 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: bash scripts/ci/thirdparty_github_asset_env.sh >> "$GITHUB_ENV" + + - name: Build and push base image (missing pin) + if: steps.exists.outputs.ok != 'true' + uses: docker/build-push-action@v4 + with: + context: . + file: ./Dockerfile.base + push: true + build-args: | + PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }} + GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} + LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} + SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} + tags: ${{ env.BASE_IMAGE }} + + build-and-test: + needs: ensure-base + runs-on: self-hosted + + permissions: + contents: read + packages: write + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + with: + ref: ${{ env.SOURCE_SHA }} + submodules: recursive + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and Push Docker Image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: true + no-cache: true + build-args: | + BASE_IMAGE=${{ needs.ensure-base.outputs.base_image }} + tags: ghcr.io/psal-postech/torchsim-test:${{ env.SOURCE_SHA }} + + # Do not use GITHUB_SHA here: on pull_request it is the merge commit, while the image tag uses SOURCE_SHA (PR head). + - name: Wait for GHCR propagation + env: + IMAGE_SHA: ${{ env.SOURCE_SHA }} + run: | + IMG="ghcr.io/psal-postech/torchsim-test:${IMAGE_SHA}" + echo "Verifying tag matches push: ${IMAGE_SHA}" + for i in $(seq 1 30); do + echo "Checking if image exists in GHCR (attempt $i)..." + if docker buildx imagetools inspect "$IMG" > /dev/null 2>&1; then + echo "Image is now available in GHCR." + exit 0 + fi + if [ "$i" -eq 1 ]; then + echo "buildx imagetools inspect failed; stderr (first attempt):" + docker buildx imagetools inspect "$IMG" 2>&1 || true + fi + echo "Image not yet available, retrying in 20 seconds..." + sleep 20 + done + echo "Image did not become available in GHCR within expected time." 
+ exit 1 + + test-pytorchsim-wrapper1: + needs: build-and-test + uses: ./.github/workflows/pytorchsim_test.yml + with: + image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + vector_lane: 128 + spad_size: 128 + + test-pytorchsim-wrapper2: + needs: build-and-test + uses: ./.github/workflows/pytorchsim_test.yml + with: + image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + vector_lane: 32 + spad_size: 32 diff --git a/.github/workflows/tag_release.yml b/.github/workflows/tag_release.yml index 0728a583..f92fc060 100644 --- a/.github/workflows/tag_release.yml +++ b/.github/workflows/tag_release.yml @@ -5,8 +5,80 @@ on: tags: - 'v*' +env: + BASE_IMAGE_REPO: ghcr.io/psal-postech/torchsim_base + jobs: + ensure-base: + runs-on: ubuntu-latest + outputs: + base_image: ${{ steps.pin.outputs.base_image }} + permissions: + contents: read + packages: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + repository: PSAL-POSTECH/PyTorchSim + ref: ${{ github.sha }} + submodules: recursive + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: PyTorch base image from manifest + run: | + PYTORCH_IMAGE=$(python3 -c "import json; from pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')") + if [ -z "$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi + echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV" + + - name: Thirdparty pin + id: pin + run: | + PIN="$(bash scripts/ci/thirdparty_base_pin.sh)" + echo "pin=${PIN}" >> "$GITHUB_OUTPUT" + echo "base_image=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_OUTPUT" + echo 
"BASE_IMAGE=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_ENV" + + - name: Check base image exists + id: exists + run: | + if docker manifest inspect "${BASE_IMAGE}" > /dev/null 2>&1; then + echo "ok=true" >> "$GITHUB_OUTPUT" + else + echo "ok=false" >> "$GITHUB_OUTPUT" + fi + + - name: Resolve GitHub release asset IDs + if: steps.exists.outputs.ok != 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: bash scripts/ci/thirdparty_github_asset_env.sh >> "$GITHUB_ENV" + + - name: Build and push base image (missing pin) + if: steps.exists.outputs.ok != 'true' + uses: docker/build-push-action@v4 + with: + context: . + file: ./Dockerfile.base + push: true + build-args: | + PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }} + GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} + LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} + SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} + tags: | + ${{ env.BASE_IMAGE }} + ${{ env.BASE_IMAGE_REPO }}:latest + build: + needs: ensure-base runs-on: self-hosted permissions: @@ -42,4 +114,6 @@ jobs: push: true secrets: | GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }} - tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}} \ No newline at end of file + build-args: | + BASE_IMAGE=${{ needs.ensure-base.outputs.base_image }} + tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }} diff --git a/scripts/ci/thirdparty_base_pin.sh b/scripts/ci/thirdparty_base_pin.sh new file mode 100755 index 00000000..6cfc7d9a --- /dev/null +++ b/scripts/ci/thirdparty_base_pin.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +# Deterministic short pin for tagging torchsim_base images (thirdparty + base Dockerfile). +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +cd "$ROOT" +{ cat thirdparty/github-releases.json; cat Dockerfile.base; } | sha256sum | awk '{print substr($1,1,12)}' diff --git a/scripts/ci/thirdparty_github_asset_env.sh b/scripts/ci/thirdparty_github_asset_env.sh new file mode 100755 index 00000000..8cbe9e12 --- /dev/null +++ b/scripts/ci/thirdparty_github_asset_env.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Emit GEM5_ASSET_ID, LLVM_ASSET_ID, SPIKE_ASSET_ID lines for appending to GITHUB_ENV. +# Requires: jq, curl, GITHUB_TOKEN, repo root as cwd or GITHUB_WORKSPACE. +set -euo pipefail +ROOT="${GITHUB_WORKSPACE:-$(cd "$(dirname "$0")/../.." && pwd)}" +MANIFEST="${ROOT}/thirdparty/github-releases.json" +if [ ! -f "$MANIFEST" ]; then + echo "Missing thirdparty manifest: $MANIFEST" >&2 + exit 1 +fi +if [ -z "${GITHUB_TOKEN:-}" ]; then + echo "GITHUB_TOKEN is not set" >&2 + exit 1 +fi + +thirdparty_asset_id() { + local key="$1" + local out_var="$2" + local repo release_tag asset_name owner name api_url tmp id + repo=$(jq -r --arg k "$key" '.[$k].repository' "$MANIFEST") + release_tag=$(jq -r --arg k "$key" '.[$k].release_tag' "$MANIFEST") + asset_name=$(jq -r --arg k "$key" '.[$k].asset_name // ""' "$MANIFEST") + owner="${repo%%/*}" + name="${repo##*/}" + if [ "$release_tag" = "latest" ]; then + api_url="https://api.github.com/repos/${owner}/${name}/releases/latest" + else + api_url="https://api.github.com/repos/${owner}/${name}/releases/tags/${release_tag}" + fi + tmp=$(mktemp) + if ! 
curl -fsS -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "$api_url" -o "$tmp"; then + echo "Failed to fetch release metadata for ${key} (${owner}/${name}, ${release_tag})" >&2 + rm -f "$tmp" + exit 1 + fi + if [ -n "$asset_name" ]; then + id=$(jq -r --arg n "$asset_name" '.assets[] | select(.name == $n) | .id' "$tmp" | head -n1) + else + id=$(jq -r '.assets[0].id' "$tmp") + fi + rm -f "$tmp" + if [ -z "$id" ] || [ "$id" = "null" ]; then + echo "Could not resolve asset id for ${key} (${owner}/${name}, tag=${release_tag}, asset_name=${asset_name:-})" >&2 + exit 1 + fi + echo "${out_var}=${id}" +} + +thirdparty_asset_id gem5 GEM5_ASSET_ID +thirdparty_asset_id llvm_project LLVM_ASSET_ID +thirdparty_asset_id spike SPIKE_ASSET_ID diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json new file mode 100644 index 00000000..25c220c9 --- /dev/null +++ b/thirdparty/github-releases.json @@ -0,0 +1,19 @@ +{ + "description": "GitHub release pins for CI (docker base image). pytorch_image is the ARG PYTORCH_IMAGE for Dockerfile.base. Use release_tag \"latest\" or an exact release tag for GitHub deps. asset_name must match the release attachment filename. 
CI builds ghcr.io/.../torchsim_base:thirdparty-<12 hex> when missing (pin = sha256 of this file plus Dockerfile.base) and updates :latest on that push.", + "pytorch_image": "pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel", + "gem5": { + "repository": "PSAL-POSTECH/gem5", + "release_tag": "v1.0.1", + "asset_name": "gem5-release.tar.gz" + }, + "llvm_project": { + "repository": "PSAL-POSTECH/llvm-project", + "release_tag": "v1.0.6", + "asset_name": "riscv-llvm-release.tar.gz" + }, + "spike": { + "repository": "PSAL-POSTECH/riscv-isa-sim", + "release_tag": "v1.0.1", + "asset_name": "spike-release.tar.gz" + } +} From 54ccd4c897e563650647e532fa698b1f0f57d542 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 7 Apr 2026 16:33:47 +0900 Subject: [PATCH 160/194] [Frontend] Put TorchInductor cache under TORCHSIM_DUMP_PATH --- PyTorchSimDevice/torch_openreg/__init__.py | 1 + PyTorchSimFrontend/extension_codecache.py | 6 +-- PyTorchSimFrontend/extension_config.py | 38 +++++++++++++++++-- PyTorchSimFrontend/mlir/mlir_autotune.py | 4 +- .../mlir/mlir_codegen_backend.py | 3 +- PyTorchSimFrontend/mlir/mlir_common.py | 3 +- PyTorchSimFrontend/mlir/mlir_template.py | 2 +- 7 files changed, 45 insertions(+), 12 deletions(-) diff --git a/PyTorchSimDevice/torch_openreg/__init__.py b/PyTorchSimDevice/torch_openreg/__init__.py index 5e404f7d..e8158391 100644 --- a/PyTorchSimDevice/torch_openreg/__init__.py +++ b/PyTorchSimDevice/torch_openreg/__init__.py @@ -17,6 +17,7 @@ torch.utils.generate_methods_for_privateuse1_backend(for_storage=True) sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')) +import PyTorchSimFrontend.extension_config # noqa: F401 from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling torch._inductor.codegen.common.register_backend_for_device( diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 
ac711650..65c96f11 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -4,11 +4,11 @@ import subprocess import torch +from PyTorchSimFrontend import extension_config from torch._inductor.codecache import get_hash, write from torch._inductor.async_compile import AsyncCompile from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen -from PyTorchSimFrontend import extension_config from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator # Configure logger for extension_codecache module (WARNING level by default) @@ -20,7 +20,7 @@ def hash_prefix(hash_value): return hash_value[1:12] def get_write_path(src_code): - return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip()))) + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(get_hash(src_code.strip()))) def get_lock_path(write_path): @@ -283,7 +283,7 @@ def run_kernel_simulation(*args, **kwargs): # Wait for compilation key = future.result() from filelock import FileLock - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key)) + result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(key)) lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT) with lock: # Run simulator pass diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 1b7ccf8d..5dec8a4b 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -8,8 +8,42 @@ CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt") CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin") +CONFIG_TORCHSIM_TOG_HOST_CC = os.environ.get("TORCHSIM_TOG_HOST_CC", "gcc") + +def _default_tog_host_cflags(): + """Host flags for 
``dlopen``'d ``*_tog.so`` / ``tile_operation_graph.so``.""" + if os.environ.get("TORCHSIM_TOG_HOST_CFLAGS"): + return os.environ["TORCHSIM_TOG_HOST_CFLAGS"] + if True: #int(os.environ.get("TORCHSIM_TOG_SO_DEBUG", "0")): + return ( + "-g -Og -fno-omit-frame-pointer -fPIC -std=c11 " + "-Wall -Wextra -Wno-unused-variable -Wno-unused-parameter" + ) + return ( + "-O2 -fPIC -std=c11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter" + ) + + +CONFIG_TORCHSIM_TOG_HOST_CFLAGS = _default_tog_host_cflags() + + +def _default_tog_host_ldflags(): + if os.environ.get("TORCHSIM_TOG_HOST_LDFLAGS"): + return os.environ["TORCHSIM_TOG_HOST_LDFLAGS"] + # Keep debug sections in .so; optional build-id helps GDB locate DWARF. + base = "-shared" + if int(os.environ.get("TORCHSIM_TOG_SO_DEBUG", "0")): + return base + " -Wl,--build-id" + return base + + +CONFIG_TORCHSIM_TOG_HOST_LDFLAGS = _default_tog_host_ldflags() + CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) +CONFIG_TORCHSIM_DUMP_PATH = os.environ.get("TORCHSIM_DUMP_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "outputs")) +CONFIG_TORCHSIM_LOG_PATH = os.environ.get("TORCHSIM_LOG_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) +os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(CONFIG_TORCHSIM_DUMP_PATH, ".torchinductor") def __getattr__(name): # TOGSim config @@ -99,10 +133,6 @@ def __getattr__(name): if name == "CONFIG_TOGSIM_DEBUG_LEVEL": return os.environ.get("TOGSIM_DEBUG_LEVEL", "") - if name == "CONFIG_TORCHSIM_DUMP_PATH": - return os.environ.get('TORCHSIM_DUMP_PATH', default = CONFIG_TORCHSIM_DIR) - if name == "CONFIG_TORCHSIM_LOG_PATH": - return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) # SRAM Buffer allocation plan def load_plan_from_module(module_path): diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py 
b/PyTorchSimFrontend/mlir/mlir_autotune.py index b8f5eaf9..fe1f86a1 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -21,7 +21,7 @@ def hash_prefix(hash_value): return hash_value[1:12] def get_write_path(src_code): - return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip()))) + return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(get_hash(src_code.strip()))) @dataclasses.dataclass class MLIRBenchmarkRequest(): @@ -61,7 +61,7 @@ def make_run_fn( # Check already cached result. write_path = get_write_path(self.source_code) key, _ = write(self.source_code, "mlir", specified_dir=write_path) - result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result") + result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(key), "togsim_result") # Find the most recent .log file in the result directory if os.path.exists(result_dir) and os.path.isdir(result_dir): diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 8bfdc57f..05102c79 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -9,6 +9,8 @@ from typing import Optional from collections import defaultdict from concurrent.futures import ThreadPoolExecutor + +from PyTorchSimFrontend import extension_config from torch._dynamo.testing import rand_strided from torch._inductor.autotune_process import TensorMeta from torch._dynamo.utils import dynamo_timed @@ -23,7 +25,6 @@ ) from torch.utils._sympy.functions import ModularIndexing, FloorDiv from PyTorchSimFrontend import extension_codecache -from PyTorchSimFrontend import extension_config from . 
import mlir_common from .mlir_common import LoopLevel, LoopNest from .mlir_ops import ExtensionOverrides diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 23c02066..7d604c3a 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -9,6 +9,8 @@ from functools import reduce from operator import mul import torch + +from PyTorchSimFrontend import extension_config from torch._inductor.codegen import common from torch._inductor.codegen import cpp from torch._inductor.virtualized import V @@ -30,7 +32,6 @@ sympy_subs, unique, ) -from PyTorchSimFrontend import extension_config from PyTorchSimFrontend import extension_codecache from PyTorchSimFrontend.extension_utils import ( diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index b126d3af..6eb6efb4 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ b/PyTorchSimFrontend/mlir/mlir_template.py @@ -13,6 +13,7 @@ from typing import List, Optional from unittest.mock import patch +from PyTorchSimFrontend import extension_config from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller, ir_node_to_tensor from torch._inductor.select_algorithm import PartialRender @@ -29,7 +30,6 @@ from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode from torch._inductor.codegen import common -from PyTorchSimFrontend import extension_config from . 
import mlir_common # Configure logger for mlir_template module From 7019ff23c2f3ae2f82a6e21e0ecb3c59bd78f869 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 1 Apr 2026 23:46:05 +0900 Subject: [PATCH 161/194] [Frontend] Modify dma_start attribute position --- .../mlir/mlir_codegen_backend.py | 6 ++--- PyTorchSimFrontend/mlir/mlir_common.py | 24 +++++++++++++++++-- PyTorchSimFrontend/mlir/mlir_template.py | 21 +++++++++------- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 05102c79..58d6a70d 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -536,7 +536,7 @@ def load(self, name: str, index: sympy.Expr): compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"]) # MVIN Encoding - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, int(padding)) code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, tile_shape, attribute) self.cse.generate(dma_buffer, code, assignment = False) # FIXME: assignment = False does not support caching @@ -607,7 +607,7 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs) sram_index_var = self.spad_buffer_dict[str(value)][3] # Generate DMA instruction - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0) code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, tile_shape, attribute) self.dma_stores.writeline(common.DeferredLine(name, code)) @@ -736,7 +736,7 @@ def store_reduction(self, name, index, value): 
ops._store(value, sram_var, sram_index_var, tile_shape, buffer_name=name) # Generate DMA instruction - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0) code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, tile_shape, attribute) self.reductions_suffix.writeline(common.DeferredLine(name, code)) diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 7d604c3a..5cde19eb 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -3,8 +3,7 @@ import contextvars from contextlib import contextmanager from dataclasses import dataclass -from typing import Dict -from typing import List +from typing import Dict, Iterable, List, Optional, Sequence, Union from collections import defaultdict from functools import reduce from operator import mul @@ -120,6 +119,27 @@ def get_dtype_nbytes(dtype): } } +def format_dma_op_attributes( + dram_stride: Sequence, + sram_stride: Sequence, + padding: int = 0, + *, + subtile_size: Optional[Sequence] = None, + async_type: Optional[int] = None, +) -> str: + """Attribute dict for memref.dma_start; stride lists as bracketed integer lists.""" + parts = [ + f"dram_stride = {dram_stride}", + f"sram_stride = {sram_stride}", + f"padding = {int(padding)}", + ] + if subtile_size: + parts.append(f"subtile_size = {subtile_size}") + av = int(async_type) if async_type is not None else 1 + parts.append(f"async = {av} : i64") + return "{" + ", ".join(parts) + "}" + + class ParallelLoopBuffer(IndentedBuffer): def indent(self, offset=1, attribute="", suffix=""): @contextlib.contextmanager diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py index 6eb6efb4..c8fc036f 100644 --- a/PyTorchSimFrontend/mlir/mlir_template.py +++ 
b/PyTorchSimFrontend/mlir/mlir_template.py @@ -952,14 +952,19 @@ def generate_dma_code(): zero_cse = self.get_const_cse(0, "index") sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim()) - attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", f"padding={int(padding)}"] if subtile_size: - attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}") - attribute = " {" + ", ".join(attribute_parts) + "}" + attribute = mlir_common.format_dma_op_attributes( + _dram_stride, + sram_strides, + int(padding), + subtile_size=subtile_size, + async_type=int(async_type) if async_type is not None else None, + ) + else: + attribute = mlir_common.format_dma_op_attributes(_dram_stride, sram_strides, int(padding)) code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, - dram_shape, tile_shape, "") + dram_shape, tile_shape, attribute) local_code.writeline(code) - local_code.writeline(attribute) return textwrap.indent(local_code.getvalue(), " "*indent_size).strip() if not lazy_mode: @@ -1025,7 +1030,7 @@ def load_epilogue(self, name: str, index: sympy.Expr): # Allocate sram buffer dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name]) sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index) - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0) code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, tile_shape, attribute) self.cse.generate(self.dma_loads, code, assignment = False) @@ -1093,7 +1098,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs): ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=buffer_name) # 
Generate DMA instruction - attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0) code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, tile_shape, attribute) self.dma_stores.writeline(DeferredLine(name, code)) @@ -1244,7 +1249,7 @@ def store_reduction_epilogue(self, name, index, value): # MVOUT Encoding # Generate DMA instruction - attribute = f"{{dram_stride={dram_stride}, sram_stride={final_tile_stride}, padding=0}}" + attribute = mlir_common.format_dma_op_attributes(dram_stride, final_tile_stride, 0) code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var, dram_shape, final_tile_shape, attribute) self.reductions_suffix.writeline(DeferredLine(name, code)) From 178ef52143be2429c6d12070e6313ed2e1a550e6 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 14 Apr 2026 13:23:00 +0900 Subject: [PATCH 162/194] [TOGSim] DMA tag keys int64; Core trace helpers; log elem_bits Refactor Core instruction/DMA traces into core_trace_log; add InstFinishTraceTag; extend DMA/Instruction/TileGraphParser for int64_t tag keys and elem_bits in traces. 
--- TOGSim/include/Core.h | 11 ++- TOGSim/include/CoreTraceLog.h | 36 ++++++++ TOGSim/include/DMA.h | 31 +++---- TOGSim/include/Instruction.h | 37 ++++---- TOGSim/include/SparseCore.h | 3 +- TOGSim/include/TileGraphParser.h | 32 +++---- TOGSim/include/TraceLogTags.h | 34 ++++++++ TOGSim/src/Core.cc | 145 ++++++++++++++++++------------- TOGSim/src/CoreTraceLog.cc | 122 ++++++++++++++++++++++++++ TOGSim/src/DMA.cc | 31 +++++-- TOGSim/src/Instruction.cc | 36 ++++++-- TOGSim/src/SparseCore.cc | 51 +++++++++-- TOGSim/src/TileGraphParser.cc | 70 +++++++-------- 13 files changed, 468 insertions(+), 171 deletions(-) create mode 100644 TOGSim/include/CoreTraceLog.h create mode 100644 TOGSim/include/TraceLogTags.h create mode 100644 TOGSim/src/CoreTraceLog.cc diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h index e4d2f30a..286feb5f 100644 --- a/TOGSim/include/Core.h +++ b/TOGSim/include/Core.h @@ -10,6 +10,14 @@ #include "Tile.h" #include "SimulationConfig.h" #include "DMA.h" +#include "TraceLogTags.h" + +/** Log tag kind for Core::finish_instruction (see TraceLogTag names in TraceLogTags.h). 
*/ +enum class InstFinishTraceTag { + Fnshed, + DmaIssueComplete, + DmaRespComplete, +}; class Core { public: @@ -22,7 +30,8 @@ class Core { virtual void cycle(); virtual void print_stats(); virtual void print_current_stats(); - virtual void finish_instruction(std::shared_ptr& inst); + virtual void finish_instruction(std::shared_ptr& inst, + InstFinishTraceTag tag = InstFinishTraceTag::Fnshed); virtual bool has_memory_request(); virtual void pop_memory_request(); virtual mem_fetch* top_memory_request() { return _request_queue.front(); } diff --git a/TOGSim/include/CoreTraceLog.h b/TOGSim/include/CoreTraceLog.h new file mode 100644 index 00000000..e78c1ef2 --- /dev/null +++ b/TOGSim/include/CoreTraceLog.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +#include "Instruction.h" +#include "TraceLogTags.h" + +/** + * Instruction / tile trace formatting and Core spdlog::trace helpers. + * Keeps Core.cc focused on simulation logic. + */ +namespace core_trace_log { + +std::string format_dma_inst_issued_detail(Instruction& inst); +/** Opcode + (detail...) for DMA issue / skip traces. */ +std::string format_dma_inst_issued_trace_line(Instruction& inst); +/** Opcode + (detail...) for COMP / BAR / MOVIN / MOVOUT finished or issued lines. 
*/ +std::string format_instruction_detail_line(Instruction& inst); + +void trace_tile_scheduled(cycle_type core_cycle, uint32_t core_id, const std::string& tag15); + +void trace_instruction_line(cycle_type core_cycle, + uint32_t core_id, + const std::string& tag15, + uint64_t global_inst_id, + const std::string& message); + +void log_error_dma_instruction_invalid(cycle_type core_cycle, uint32_t core_id); +void log_error_dram_responses_trace_not_finished(cycle_type core_cycle, uint32_t core_id); +void log_error_instruction_already_finished(cycle_type core_cycle, + uint32_t core_id, + const std::string& opcode_name); +void log_error_undefined_opcode(); + +} // namespace core_trace_log diff --git a/TOGSim/include/DMA.h b/TOGSim/include/DMA.h index 3056c626..08bdcab4 100644 --- a/TOGSim/include/DMA.h +++ b/TOGSim/include/DMA.h @@ -12,41 +12,41 @@ #include "Memfetch.h" struct VectorCompare { - bool operator()(const std::vector& a, const std::vector& b) const { + bool operator()(const std::vector& a, const std::vector& b) const { return a < b; } }; class DMA { public: - DMA(uint32_t id, uint32_t dram_req_size); + DMA(uint32_t id, uint32_t dram_req_size, bool l2_datacache_enabled); void issue_tile(std::shared_ptr inst); bool is_finished() { return _finished; } bool empty() { return _current_inst==nullptr; } - void register_tag(int subgraph_id, std::vector& key) { + void register_tag(int subgraph_id, std::vector& key) { if (tag_table.find(subgraph_id) == tag_table.end()) { - tag_table[subgraph_id] = std::map, uint32_t>(); - waiters[subgraph_id] = std::map, std::vector>>(); + tag_table[subgraph_id] = std::map, uint32_t>(); + waiters[subgraph_id] = std::map, std::vector>>(); } tag_table[subgraph_id][key] = 0; waiters[subgraph_id][key] = std::vector>(); } - void set_tag_finish(int subgraph_id, std::vector& key) { + void set_tag_finish(int subgraph_id, std::vector& key) { if (tag_table.find(subgraph_id) == tag_table.end()) { throw std::runtime_error("Subgraph does not exist in 
tag_table"); } tag_table[subgraph_id][key] = 1; } - void set_tag_sparse(int subgraph_id, std::vector& key) { + void set_tag_sparse(int subgraph_id, std::vector& key) { if (tag_table.find(subgraph_id) == tag_table.end()) { throw std::runtime_error("Subgraph does not exist in tag_table"); } tag_table[subgraph_id][key] = -1; } - void mark_tag_used(int subgraph_id, std::vector& key) { + void mark_tag_used(int subgraph_id, std::vector& key) { if (tag_table.find(subgraph_id) == tag_table.end()) { throw std::runtime_error("Subgraph does not exist in tag_table"); } else if (!tag_table[subgraph_id][key]) { @@ -59,7 +59,7 @@ class DMA { for (const auto& entry: tag_table) { auto subgraph_id = entry.first; for (const auto& tag_entry: tag_table[subgraph_id]) { - const std::vector& tag_key = tag_entry.first; + const std::vector& tag_key = tag_entry.first; uint32_t value = tag_entry.second; if (value == 1) { spdlog::debug("[Tag Table][{}] Unused tag found: (key={}, val={})", @@ -69,7 +69,7 @@ class DMA { } } - bool tag_key_exist(int subgraph_id, std::vector& key) { + bool tag_key_exist(int subgraph_id, std::vector& key) { auto subgraph_it = tag_table.find(subgraph_id); if (subgraph_it == tag_table.end()) return false; @@ -78,7 +78,7 @@ class DMA { auto key_it = key_map.find(key); return key_it != key_map.end(); } - uint32_t get_tag_finish(int subgraph_id, std::vector& key) { + uint32_t get_tag_finish(int subgraph_id, std::vector& key) { auto subgraph_it = tag_table.find(subgraph_id); auto& key_map = subgraph_it->second; auto key_it = key_map.find(key); @@ -95,7 +95,7 @@ class DMA { tag_table.erase(subgraph_id); waiters.erase(subgraph_id); } - void register_tag_waiter(int subgraph_id, std::vector& key, std::shared_ptr inst) { + void register_tag_waiter(int subgraph_id, std::vector& key, std::shared_ptr inst) { auto subgraph_it = tag_table.find(subgraph_id); auto& key_map = subgraph_it->second; auto key_it = key_map.find(key); @@ -104,7 +104,7 @@ class DMA { } 
waiters[subgraph_id][key].push_back(inst); } - std::vector>& get_tag_waiter(int subgraph_id, std::vector& key) { + std::vector>& get_tag_waiter(int subgraph_id, std::vector& key) { auto subgraph_it = tag_table.find(subgraph_id); auto& key_map = subgraph_it->second; auto key_it = key_map.find(key); @@ -129,8 +129,9 @@ class DMA { size_t _tile_idx_stride=1; uint32_t _tile_idx; bool _finished=true; - std::map, uint32_t>> tag_table; - std::map, std::vector>>> waiters; + bool _l2_datacache_enabled = false; + std::map, uint32_t>> tag_table; + std::map, std::vector>>> waiters; std::queue _pending_accesses; bool _generated_once = false; }; diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h index 9fad13f4..bb62a440 100644 --- a/TOGSim/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -18,13 +18,14 @@ typedef uint64_t addr_type; typedef uint64_t cycle_type; std::string opcode_to_string(Opcode opcode); +std::string format_tag_key_list_hex(const std::vector& tag_keys); class Instruction : public std::enable_shared_from_this { public: Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr, - std::vector tile_size, std::vector tile_stride, size_t precision, - std::vector tag_idx_list, std::vector tag_stride_list, - std::vector accum_tag_idx_list); + std::vector tile_size, std::vector tile_stride, size_t elem_bits, + std::vector tag_idx_list, std::vector tag_stride_list, + std::vector accum_tag_idx_list); Instruction(Opcode opcode); void finish_instruction(); void add_child(std::shared_ptr child); @@ -32,6 +33,7 @@ class Instruction : public std::enable_shared_from_this { const Opcode get_opcode() { return opcode; } bool is_dma_read() { return opcode == Opcode::MOVIN; } bool is_dma_write() { return opcode == Opcode::MOVOUT; } + bool is_dma_instruction() const { return opcode == Opcode::MOVIN || opcode == Opcode::MOVOUT; } bool is_async_dma() { return _is_async_dma; } bool is_indirect_mode() { return 
_is_indirect_mode; } std::string get_indirect_index_path() { return _indirect_index_path; } @@ -45,11 +47,12 @@ class Instruction : public std::enable_shared_from_this { } } size_t get_tile_numel() { return _tile_numel; } - size_t get_precision() { return _precision; } + size_t get_elem_bits() const { return _elem_bits; } void inc_waiting_request(); void dec_waiting_request(); size_t get_waiting_request() { return _nr_waiting_request; } std::vector& get_tile_size() { return tile_size; } + std::vector& get_tile_stride() { return tile_stride; } void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; } cycle_type get_overlapping_cycle() { return overlapping_cycle; } cycle_type get_compute_cycle() { return compute_cycle; } @@ -68,12 +71,12 @@ class Instruction : public std::enable_shared_from_this { int get_compute_type() { return _compute_type; } void set_numa_id(int numa_id) { _numa_id = numa_id; } uint32_t get_numa_id() { return _numa_id; } - std::vector& get_tag_idx_list() { return _tag_idx_list; } - std::vector& get_tag_stride_list() { return _tag_stride_list; } - std::vector& get_tag_id() { return _tag_key; } - void set_addr_name(std::string name, int id) { _addr_name = name; _addr_id = id; } + std::vector& get_tag_idx_list() { return _tag_idx_list; } + std::vector& get_tag_stride_list() { return _tag_stride_list; } + std::vector& get_tag_id() { return _tag_key; } + void set_addr_name(std::string name, int64_t id) { _addr_name = name; _addr_id = id; } std::string get_addr_name() { return _addr_name; } - int get_addr_id() { return _addr_id; } + int64_t get_addr_id() { return _addr_id; } void set_nr_inner_loop(int nr) { _nr_inner_loop = nr; } int get_nr_inner_loop() { return _nr_inner_loop; } void set_is_async(bool is_async) { _is_async_dma = is_async; } @@ -81,6 +84,7 @@ class Instruction : public std::enable_shared_from_this { bool is_sparse_inst() { return _is_sparse_inst; } void set_sparse_state(bool state) { _is_sparse_inst = state; } 
std::set>& get_child_inst() { return child_inst; } + uint64_t get_global_inst_id() const { return _global_inst_id; } cycle_type start_cycle; cycle_type finish_cycle; @@ -89,6 +93,9 @@ class Instruction : public std::enable_shared_from_this { bool finished=false; int subgraph_id; private: + uint64_t _global_inst_id = 0; + static uint64_t _next_global_inst_id; + void *_owner = nullptr; std::list>* _owner_ready_queue_ref = nullptr; Opcode opcode; @@ -100,17 +107,17 @@ class Instruction : public std::enable_shared_from_this { std::vector tile_stride; size_t _tile_numel; size_t _nr_waiting_request=0; - size_t _precision=0; + size_t _elem_bits = 0; addr_type dram_addr; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; - std::vector _tag_idx_list; - std::vector _tag_stride_list; - std::vector _tag_key; - std::vector _accum_tag_idx_list; + std::vector _tag_idx_list; + std::vector _tag_stride_list; + std::vector _tag_key; + std::vector _accum_tag_idx_list; std::vector _trace_address; std::string _addr_name; - int _addr_id; + int64_t _addr_id = 0; int _nr_inner_loop = 0; bool _is_async_dma=false; bool _is_indirect_mode=false; diff --git a/TOGSim/include/SparseCore.h b/TOGSim/include/SparseCore.h index 02781ab3..a91004ed 100644 --- a/TOGSim/include/SparseCore.h +++ b/TOGSim/include/SparseCore.h @@ -59,7 +59,8 @@ class SparseCore : public Core { void print_stats() override; void print_current_stats() override; std::shared_ptr pop_finished_tile() override; - void finish_instruction(std::shared_ptr& inst) override; + void finish_instruction(std::shared_ptr& inst, + InstFinishTraceTag tag = InstFinishTraceTag::Fnshed) override; void dumpTrace(int stonne_core_id, const std::string& path); bool isTraceMode(int stonne_core_id) { return traceMode.at(stonne_core_id); } void setTraceMode(int stonne_core_id, bool mode) { traceMode.at(stonne_core_id) = mode; } diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h index f067fb2d..d255a735 
100644 --- a/TOGSim/include/TileGraphParser.h +++ b/TOGSim/include/TileGraphParser.h @@ -80,9 +80,9 @@ class TileGraphParser { int getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id); std::string getMetaByName(std::string key) { return _tog_meta[key]; } const YAML::Node& get_attribute_file() { return _attribute_config; } - std::vector calc_tag(std::vector& accum_tag, std::vector& tag_idx, std::vector& tag_stride); - void register_memory_tag(std::string name, std::vector& tag_key); - bool check_memory_tag(std::string name, std::vector& tag_key); + std::vector calc_tag(std::vector& accum_tag, std::vector& tag_idx, std::vector& tag_stride); + void register_memory_tag(std::string name, std::vector& tag_key); + bool check_memory_tag(std::string name, std::vector& tag_key); void clear_tag_table() { _tag_table.clear(); } std::string get_indirect_path() { namespace fs = std::filesystem; @@ -118,12 +118,12 @@ class TileGraphParser { uint64_t get_dma_counter() { return dma_counter; } void inc_dma_counter() { dma_counter++; } bool is_sparse_tile(uint64_t idx) { return sparse_tile_set.find(idx) != sparse_tile_set.end(); } - int register_addr_name(const std::string& addr_name) { + int64_t register_addr_name(const std::string& addr_name) { if (_addr_name_map.find(addr_name) == _addr_name_map.end()) - _addr_name_map[addr_name] = _addr_name_map.size(); + _addr_name_map[addr_name] = static_cast(_addr_name_map.size()); return _addr_name_map[addr_name]; } - int get_addr_name_id(const std::string& addr_name) { return _addr_name_map[addr_name]; } + int64_t get_addr_name_id(const std::string& addr_name) { return _addr_name_map[addr_name]; } private: void register_tile(std::shared_ptr tile_node); @@ -148,8 +148,8 @@ class TileGraphParser { std::vector> _cache_plan; std::map> _loop_size_map; std::map _tog_meta; - std::map>, uint32_t> _tag_table; - std::unordered_map _addr_name_map; + std::map>, uint32_t> _tag_table; + std::unordered_map _addr_name_map; }; class 
TileComputeNode : public TileNode { @@ -171,11 +171,11 @@ class TileMemoryNode : public TileNode { public: TileMemoryNode(onnx::NodeProto& node); std::string get_base_addr_name() { return _base_addr_name; } - size_t get_precision() { return _element_size; } + size_t get_elem_bits() const { return _elem_bits; } std::vector get_tile_size() { return _tile_size; } std::vector& get_tile_stride() { return _tile_stride; } std::vector& get_tag_idx_list() { return _tag_idx_list; } - std::vector& get_tag_stride_list() { return _tag_stride_list; } + std::vector& get_tag_stride_list() { return _tag_stride_list; } std::vector& get_loop_idx_list() { return _loop_idx_list; } std::vector& get_loop_stride_list () { return _loop_stride_list; } bool is_async_node() { return _is_async; } @@ -185,12 +185,12 @@ class TileMemoryNode : public TileNode { private: std::vector _tile_size; std::vector _tile_stride; - size_t _element_size; + size_t _elem_bits = 0; bool _is_async; bool _is_indirect; std::string _base_addr_name; std::vector _tag_idx_list; - std::vector _tag_stride_list; + std::vector _tag_stride_list; std::vector _loop_idx_list; std::vector _loop_stride_list; }; @@ -200,14 +200,14 @@ class TileMemoryWaitNode : public TileNode { TileMemoryWaitNode(onnx::NodeProto& node); std::string get_base_addr_name() { return _base_addr_name; } std::vector& get_tag_idx_list() { return _tag_idx_list; } - std::vector& get_tag_stride_list() { return _tag_stride_list; } - std::vector& get_tag_divider_list() { return _tag_divider_list; } + std::vector& get_tag_stride_list() { return _tag_stride_list; } + std::vector& get_tag_divider_list() { return _tag_divider_list; } void print_node() override; private: std::vector _tag_idx_list; - std::vector _tag_stride_list; - std::vector _tag_divider_list; + std::vector _tag_stride_list; + std::vector _tag_divider_list; std::string _base_addr_name; }; diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h new file mode 100644 index 
00000000..6c158099 --- /dev/null +++ b/TOGSim/include/TraceLogTags.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +/** Trace bracket tags: max 15 characters; use pad15() so logs show a fixed 15-char field (space-padded). */ +namespace TraceLogTag { + +/** Right-pad (or truncate) to exactly 15 characters for aligned log columns. */ +inline std::string pad15(std::string_view sv) { + if (sv.size() > 15) { + sv = sv.substr(0, 15); + } + std::string out(sv); + out.resize(15, ' '); + return out; +} + +inline constexpr const char* kTileScheduled = "TILE_SCHEDULED"; + +inline constexpr const char* kInstructionIssued = "INST_ISSUED"; +inline constexpr const char* kInstructionFinished = "INST_FINISHED"; +/** Async MOVIN skipped: same tag still in flight. */ +inline constexpr const char* kInstructionSkipped = "INST_SKIP"; + +inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE"; +inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE"; + +inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT"; +inline constexpr const char* kDmaNumaPlacement = "DRAM_NUMA"; + +/** Field label for get_global_inst_id() in trace lines (≤15 chars). 
*/ +inline constexpr const char* kGlobalInstIdKey = "INST_ID"; +} // namespace TraceLogTag diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc index 1f831661..d9be4ca3 100644 --- a/TOGSim/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -1,4 +1,7 @@ #include "Core.h" +#include "CoreTraceLog.h" +#include +#include Core::Core(uint32_t id, SimulationConfig config) : _id(id), @@ -6,7 +9,7 @@ Core::Core(uint32_t id, SimulationConfig config) _core_cycle(0), _stat_dma_cycle(0), _num_systolic_array_per_core(config.num_systolic_array_per_core), - _dma(id, config.dram_req_size) { + _dma(id, config.dram_req_size, config.l2d_type != L2CacheType::NOCACHE) { _sa_compute_pipeline.resize(_num_systolic_array_per_core); _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core); _stat_sa_compute_cycle.resize(_num_systolic_array_per_core); @@ -22,9 +25,9 @@ bool Core::can_issue(const std::shared_ptr& op) { } void Core::issue(std::shared_ptr op) { - if (op->get_instructions().size()){ - spdlog::trace("[{}][Core {}][TILE_SCHEDULED]", - _core_cycle, _id); + if (op->get_instructions().size()) { + core_trace_log::trace_tile_scheduled(_core_cycle, _id, + TraceLogTag::pad15(TraceLogTag::kTileScheduled)); } for (const auto& inst : op->get_instructions()) { if (inst->is_ready()) @@ -120,13 +123,16 @@ void Core::dma_cycle() { if (instruction->is_dma_read() && instruction->is_async_dma()) { auto& key = instruction->get_tag_id(); assert(!_dma.get_tag_finish(instruction->subgraph_id, key)); + spdlog::trace( + "[{}][Core {}] TOG async DMA response (table notify): tag_addr=0x{:016x} global_inst_id={} " + "subgraph_id={}", + _core_cycle, + _id, + static_cast(static_cast(instruction->get_addr_id())), + instruction->get_global_inst_id(), + instruction->subgraph_id); _dma.set_tag_finish(instruction->subgraph_id, key); - spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _core_cycle, _id, 
opcode_to_string(instruction->get_opcode()), - instruction->subgraph_id, instruction->get_addr_name(), - fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", "))); + finish_instruction(instruction, InstFinishTraceTag::DmaRespComplete); for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) { _dma.mark_tag_used(instruction->subgraph_id, key); finish_instruction(wait_inst); @@ -143,18 +149,18 @@ void Core::dma_cycle() { /* Only DMA write operation is finished! */ finish_instruction(finished_inst); } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) { - /* Register tag table for async dma load */ - _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id()); - finish_instruction(finished_inst); + /* Register tag table for async dma load; see TraceLogTag::kAsyncDmaAllRequestsIssued */ + finish_instruction(finished_inst, InstFinishTraceTag::DmaIssueComplete); } else if(!finished_inst->is_dma_read()) { - spdlog::error("[{}][Core {}] DMA instruction in not valid", _core_cycle, _id); + core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id); exit(EXIT_FAILURE); } else if (finished_inst->get_opcode() == Opcode::BAR) { - spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, - opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(), - fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(finished_inst->get_tag_stride_list(), ", "))); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kInstructionFinished), + finished_inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line( + 
*finished_inst)); } /*Pass to waiting queue */ _dma_waiting_queue[finished_inst.get()] = std::move(finished_inst); @@ -223,34 +229,37 @@ void Core::cycle() { finish_instruction(inst); else _dma.register_tag_waiter(inst->subgraph_id, key, inst); - spdlog::trace("[{}][Core {}][SIKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode()), - inst->get_addr_name(), - fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15( + TraceLogTag::kInstructionSkipped), + inst->get_global_inst_id(), + core_trace_log::format_dma_inst_issued_trace_line( + *inst)); issued = true; _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { - spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode()), - inst->get_addr_name(), - fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15( + TraceLogTag::kInstructionIssued), + inst->get_global_inst_id(), + core_trace_log::format_dma_inst_issued_trace_line( + *inst)); + _dma.register_tag(inst->subgraph_id, inst->get_tag_id()); _ld_inst_queue.push(inst); issued = true; break; } } case Opcode::MOVOUT: - spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode()), - inst->get_addr_name(), - fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), 
- fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kInstructionIssued), + inst->get_global_inst_id(), + core_trace_log::format_dma_inst_issued_trace_line( + *inst)); _st_inst_queue.push(inst); issued = true; break; @@ -273,8 +282,13 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; instructions.erase(it); } else { - spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finsh at {}", _core_cycle, _id, _systolic_array_rr, - opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15( + TraceLogTag::kInstructionIssued), + inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line( + *inst)); target_pipeline.push(inst); issued = true; if (inst->get_compute_type()) { @@ -300,16 +314,18 @@ void Core::cycle() { } else { _dma.register_tag_waiter(inst->subgraph_id, key, inst); } - spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode()), inst->get_addr_name(), - fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15( + TraceLogTag::kInstructionIssued), + inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line( + *inst)); issued = true; } break; default: - spdlog::error("Undefined instruction opcode type"); + core_trace_log::log_error_undefined_opcode(); exit(EXIT_FAILURE); } @@ -341,27 +357,34 @@ void Core::cycle() { } } -void Core::finish_instruction(std::shared_ptr& inst) { +void Core::finish_instruction(std::shared_ptr& inst, InstFinishTraceTag tag) { + 
if (tag == InstFinishTraceTag::DmaRespComplete) { + if (!inst->finished) { + core_trace_log::log_error_dram_responses_trace_not_finished(_core_cycle, _id); + exit(EXIT_FAILURE); + } + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kAllDramResponsesReceived), + inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line(*inst)); + return; + } if (inst->finished) { - spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id, - opcode_to_string(inst->get_opcode())); + core_trace_log::log_error_instruction_already_finished(_core_cycle, _id, + opcode_to_string(inst->get_opcode())); exit(EXIT_FAILURE); } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); - if (inst->get_opcode() == Opcode::COMP) { - spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}", - _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type()); - } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){ - spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", - _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(), - inst->get_tag_id(), - fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")), - fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", "))); - } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) { - spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode()), inst->get_addr_name()); - } + const char* trace_tag = (tag == InstFinishTraceTag::DmaIssueComplete) + ? 
TraceLogTag::kAsyncDmaAllRequestsIssued + : TraceLogTag::kInstructionFinished; + core_trace_log::trace_instruction_line(_core_cycle, + _id, + TraceLogTag::pad15(trace_tag), + inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line(*inst)); } bool Core::running() { diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc new file mode 100644 index 00000000..ebc31de0 --- /dev/null +++ b/TOGSim/src/CoreTraceLog.cc @@ -0,0 +1,122 @@ +#include "CoreTraceLog.h" + +#include + +#include +#include +#include + +namespace core_trace_log { + +std::string format_dma_inst_issued_detail(Instruction& inst) { + const auto& ts = inst.get_tile_size(); + const int rank = static_cast(std::max(1, ts.size())); + if (inst.get_opcode() == Opcode::MOVIN) { + return fmt::format( + "addr_name={} dram=0x{:016x} rank={} size=[{}] stride=[{}] elem_bits={} async={} indirect={} tag_id=[{}]", + inst.get_addr_name(), + static_cast(inst.get_base_dram_address()), + rank, + fmt::join(ts, ","), + fmt::join(inst.get_tile_stride(), ","), + inst.get_elem_bits(), + inst.is_async_dma(), + inst.is_indirect_mode(), + format_tag_key_list_hex(inst.get_tag_id())); + } + uint64_t tag_hex = 0; + const auto& tidx = inst.get_tag_idx_list(); + if (!tidx.empty()) { + tag_hex = static_cast(tidx[0]); + } + return fmt::format( + "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] " + "tag_idx=[{}]", + inst.get_addr_name(), + static_cast(inst.get_base_dram_address()), + rank, + inst.get_elem_bits(), + inst.is_async_dma(), + inst.is_indirect_mode(), + tag_hex, + fmt::join(inst.get_tile_stride(), ","), + fmt::join(ts, ","), + fmt::join(tidx, ",")); +} + +std::string format_dma_inst_issued_trace_line(Instruction& inst) { + return fmt::format("{} ({})", opcode_to_string(inst.get_opcode()), format_dma_inst_issued_detail(inst)); +} + +std::string format_instruction_detail_line(Instruction& inst) { + const Opcode op = inst.get_opcode(); + 
const std::string opname = opcode_to_string(op); + if (op == Opcode::COMP) { + return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})", + opname, + inst.get_compute_type(), + inst.get_compute_cycle(), + inst.get_overlapping_cycle()); + } + if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) { + return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + opname, + inst.subgraph_id, + inst.get_addr_name(), + format_tag_key_list_hex(inst.get_tag_id()), + fmt::join(inst.get_tag_idx_list(), ","), + fmt::join(inst.get_tag_stride_list(), ",")); + } + if (op == Opcode::MOVIN || op == Opcode::MOVOUT) { + return fmt::format("{} (addr_name={})", opname, inst.get_addr_name()); + } + if (op == Opcode::BAR) { + return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + opname, + inst.get_addr_name(), + format_tag_key_list_hex(inst.get_tag_id()), + fmt::join(inst.get_tag_idx_list(), ","), + fmt::join(inst.get_tag_stride_list(), ",")); + } + return opname; +} + +void trace_tile_scheduled(cycle_type core_cycle, uint32_t core_id, const std::string& tag15) { + spdlog::trace("[{}][Core {}][{}]", core_cycle, core_id, tag15); +} + +void trace_instruction_line(cycle_type core_cycle, + uint32_t core_id, + const std::string& tag15, + uint64_t global_inst_id, + const std::string& message) { + spdlog::trace("[{}][Core {}][{}][{}={}] {}", + core_cycle, + core_id, + tag15, + TraceLogTag::kGlobalInstIdKey, + global_inst_id, + message); +} + +void log_error_dma_instruction_invalid(cycle_type core_cycle, uint32_t core_id) { + spdlog::error("[{}][Core {}] DMA instruction in not valid", core_cycle, core_id); +} + +void log_error_dram_responses_trace_not_finished(cycle_type core_cycle, uint32_t core_id) { + spdlog::error("[{}][Core {}][ERROR] ALL_DRAM_RESPONSES_RECEIVED trace but inst not finished yet", + core_cycle, + core_id); +} + +void 
log_error_instruction_already_finished(cycle_type core_cycle, + uint32_t core_id, + const std::string& opcode_name) { + spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", core_cycle, core_id, opcode_name); +} + +void log_error_undefined_opcode() { + spdlog::error("Undefined instruction opcode type"); +} + +} // namespace core_trace_log diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc index fefee6d2..5d509953 100644 --- a/TOGSim/src/DMA.cc +++ b/TOGSim/src/DMA.cc @@ -1,9 +1,11 @@ #include "DMA.h" #include "TileGraph.h" +#include "TraceLogTags.h" -DMA::DMA(uint32_t id, uint32_t dram_req_size) { +DMA::DMA(uint32_t id, uint32_t dram_req_size, bool l2_datacache_enabled) { _id = id; _dram_req_size = dram_req_size; + _l2_datacache_enabled = l2_datacache_enabled; _current_inst = nullptr; _finished = true; } @@ -31,12 +33,27 @@ std::shared_ptr> DMA::get_memory_access(cycle_type core_ bool is_cacheable = owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size); - spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}", - core_cycle, _id, base_daddr, is_cacheable); - spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}", - core_cycle, _id, owner_subgraph->get_core_id(), - _current_inst->get_numa_id(), _current_inst->get_addr_name(), - _current_inst->is_dma_write()); + if (_l2_datacache_enabled) { + spdlog::trace( + "[{}][Core {}][{}][INST_ID={}] dram=0x{:016x} cacheable={}", + core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kL2CacheableStatusForAddress), + _current_inst->get_global_inst_id(), + base_daddr, + is_cacheable); + } + spdlog::trace( + "[{}][Core {}][{}][INST_ID={}] core_id={} subgraph_id={} numa_id={} addr_name={} is_write={}", + core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kDmaNumaPlacement), + _current_inst->get_global_inst_id(), + owner_subgraph->get_core_id(), + _current_inst->subgraph_id, + _current_inst->get_numa_id(), + _current_inst->get_addr_name(), + 
_current_inst->is_dma_write()); for (const auto& addr : *addr_set) { mem_access_type acc_type = _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc index aef9079c..1dc3ff42 100644 --- a/TOGSim/src/Instruction.cc +++ b/TOGSim/src/Instruction.cc @@ -1,5 +1,23 @@ #include "Instruction.h" +#include + +uint64_t Instruction::_next_global_inst_id = 0; + +std::string format_tag_key_list_hex(const std::vector& tag_keys) { + if (tag_keys.empty()) { + return {}; + } + std::string out; + for (size_t i = 0; i < tag_keys.size(); ++i) { + if (i > 0) { + out.push_back(','); + } + out += fmt::format("0x{:016x}", static_cast(tag_keys[i])); + } + return out; +} + std::string opcode_to_string(Opcode opcode) { switch (opcode) { case Opcode::MOVIN: return "MOVIN"; @@ -11,13 +29,14 @@ std::string opcode_to_string(Opcode opcode) { } Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, - addr_type dram_addr, std::vector tile_size, std::vector tile_stride, size_t precision, - std::vector tag_idx_list, std::vector tag_stride_list, - std::vector accum_tag_idx_list) + addr_type dram_addr, std::vector tile_size, std::vector tile_stride, size_t elem_bits, + std::vector tag_idx_list, std::vector tag_stride_list, + std::vector accum_tag_idx_list) : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr), - tile_size(tile_size), tile_stride(tile_stride), _precision(precision), + tile_size(tile_size), tile_stride(tile_stride), _elem_bits(elem_bits), _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list), _accum_tag_idx_list(accum_tag_idx_list) { + _global_inst_id = _next_global_inst_id++; assert(_tag_idx_list.size()==_tag_stride_list.size()); _tile_numel = 1; for (auto dim : tile_size) @@ -26,6 +45,7 @@ Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_par Instruction::Instruction(Opcode opcode) : opcode(opcode) { + 
_global_inst_id = _next_global_inst_id++; _tile_numel = 1; } @@ -51,9 +71,9 @@ void Instruction::dec_waiting_request() { void Instruction::prepare_tag_key() { /* Calculate tag key */ - int key_offset = 0; + int64_t key_offset = 0; _tag_key.push_back(_addr_id); - for (int i=0; i<_tag_idx_list.size(); i++) + for (size_t i = 0; i < _tag_idx_list.size(); i++) key_offset += _tag_idx_list.at(i) * _tag_stride_list.at(i); for (auto accum_dim : _accum_tag_idx_list) _tag_key.push_back(accum_dim); @@ -88,10 +108,10 @@ std::shared_ptr> Instruction::get_dram_address(addr_type dra dim1*tile_stride.at(tile_stride.size() - 3) + \ dim2*tile_stride.at(tile_stride.size() - 2) + \ dim3*tile_stride.at(tile_stride.size() - 1); - address = dram_addr + address * _precision; + address = dram_addr + (address * _elem_bits + 7) / 8; if (indirect_index != NULL) { uint64_t index_val = indirect_index[index_count++]; - address += index_val * _precision; + address += (index_val * _elem_bits + 7) / 8; } address_set->insert(address - (address & dram_req_size-1)); } diff --git a/TOGSim/src/SparseCore.cc b/TOGSim/src/SparseCore.cc index d5629b9c..1bf1163a 100644 --- a/TOGSim/src/SparseCore.cc +++ b/TOGSim/src/SparseCore.cc @@ -1,4 +1,5 @@ #include "SparseCore.h" +#include "TraceLogTags.h" SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) { /* Init stonne cores*/ @@ -239,7 +240,11 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = mem_access_type::GLOBAL_ACC_R; auto type = mf_type::READ_REQUEST; - spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, + spdlog::trace("[{}][StonneCore {}/{}][{}] {}", + _core_cycle, + _id, + subcore_id, + TraceLogTag::pad15(TraceLogTag::kInstructionIssued), opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -260,7 +265,11 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { { auto acc_type = 
mem_access_type::GLOBAL_ACC_W; auto type = mf_type::WRITE_REQUEST; - spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id, + spdlog::trace("[{}][StonneCore {}/{}][{}] {}", + _core_cycle, + _id, + subcore_id, + TraceLogTag::pad15(TraceLogTag::kInstructionIssued), opcode_to_string(inst->get_opcode())); for (auto addr : inst->get_trace_address()) { addr = addr - (addr & _config.dram_req_size-1); @@ -285,8 +294,13 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); else inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle(); - spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}, finsh at {}", _core_cycle, _id, subcore_id, - opcode_to_string(inst->get_opcode()), inst->finish_cycle); + spdlog::trace("[{}][StonneCore {}/{}][{}] {}, finish_at={}", + _core_cycle, + _id, + subcore_id, + TraceLogTag::pad15(TraceLogTag::kInstructionIssued), + opcode_to_string(inst->get_opcode()), + inst->finish_cycle); target_pipeline.push(inst); issued = true; } @@ -397,7 +411,22 @@ std::shared_ptr SparseCore::pop_finished_tile() { return result; } -void SparseCore::finish_instruction(std::shared_ptr& inst) { +void SparseCore::finish_instruction(std::shared_ptr& inst, InstFinishTraceTag tag) { + if (tag == InstFinishTraceTag::DmaRespComplete) { + if (!inst->finished) { + spdlog::error("[{}][StonneCore {}][Error] ALL_DRAM_RESPONSES_RECEIVED trace but inst not finished", + _core_cycle, + _id); + exit(EXIT_FAILURE); + } + spdlog::trace("[{}][StonneCore {}][{}][INST_ID={}] {}", + _core_cycle, + _id, + TraceLogTag::pad15(TraceLogTag::kAllDramResponsesReceived), + inst->get_global_inst_id(), + opcode_to_string(inst->get_opcode())); + return; + } if (inst->finished) { spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id, opcode_to_string(inst->get_opcode())); @@ -405,12 +434,16 @@ void SparseCore::finish_instruction(std::shared_ptr& 
inst) { } inst->finish_instruction(); static_cast(inst->get_owner())->inc_finished_inst(); + const char* trace_tag = (tag == InstFinishTraceTag::DmaIssueComplete) + ? TraceLogTag::kAsyncDmaAllRequestsIssued + : TraceLogTag::kInstructionFinished; + const std::string tag15 = TraceLogTag::pad15(trace_tag); if (inst->get_opcode() == Opcode::COMP) { - spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", - _core_cycle, _id, opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][{}] {}", _core_cycle, _id, tag15, + opcode_to_string(inst->get_opcode())); } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) { - spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id, - opcode_to_string(inst->get_opcode())); + spdlog::info("[{}][StonneCore {}][{}] {}", _core_cycle, _id, tag15, + opcode_to_string(inst->get_opcode())); } } diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index 882aba6b..5060d336 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -192,7 +192,7 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) { if (attribute.name() == "torchsim_base_addr") { _base_addr_name = attribute.s(); } else if (attribute.name() == "torchsim_element_size") { - _element_size = attribute.i(); + _elem_bits = static_cast(attribute.i()); } else if (attribute.name() == "torchsim_tile_size") { for (int i = 0; i < attribute.ints_size(); i++) _tile_size.push_back(attribute.ints(i)); @@ -204,7 +204,7 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) { _tag_idx_list.push_back(attribute.strings(i)); } else if (attribute.name() == "torchsim_tag_stride_list") { for (int i = 0; i < attribute.ints_size(); i++) - _tag_stride_list.push_back(attribute.ints(i)); + _tag_stride_list.push_back(static_cast(attribute.ints(i))); } else if (attribute.name() == "torchsim_loop_idx_list") { for (int i = 0; i < attribute.strings_size(); i++) 
_loop_idx_list.push_back(attribute.strings(i)); @@ -226,7 +226,7 @@ void TileMemoryNode::print_node() { TileNode::print_node(); std::string spaces(get_depth(), '\t'); spdlog::debug("{} base_addr_name: {}", spaces, _base_addr_name); - spdlog::debug("{} element_size: {}", spaces, _element_size); + spdlog::debug("{} elem_bits: {}", spaces, _elem_bits); spdlog::debug("{} loop_stride_list: {} ", spaces, _loop_stride_list); spdlog::debug("{} tile_size: {} ", spaces, _tile_size); spdlog::debug("{} tile_stride: {} ", spaces, _tile_stride); @@ -243,10 +243,10 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) { _tag_idx_list.push_back(attribute.strings(i)); } else if (attribute.name() == "torchsim_tag_stride_list") { for (int i = 0; i < attribute.ints_size(); i++) - _tag_stride_list.push_back(attribute.ints(i)); + _tag_stride_list.push_back(static_cast(attribute.ints(i))); } else if (attribute.name() == "torchsim_tag_divider_list") { for (int i = 0; i < attribute.ints_size(); i++) - _tag_divider_list.push_back(attribute.ints(i)); + _tag_divider_list.push_back(static_cast(attribute.ints(i))); } else if (attribute.name() == "torchsim_base_addr") { _base_addr_name = attribute.s(); } @@ -352,12 +352,12 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa /* Base address setting */ std::string base_addr_name = mem_node->get_base_addr_name(); - int base_addr_id = tog_parser->register_addr_name(base_addr_name); + int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name); addr_type base_addr = tog_parser->lookup(base_addr_name); addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0); - std::vector tag_list; - std::vector accum_tag_list; + std::vector tag_list; + std::vector accum_tag_list; std::vector outer_loop_idx; std::vector outer_loop_size; /* Add accumulation loop info to accum_tag list */ @@ -406,8 +406,8 @@ std::vector> 
TileLoopNode::get_tiles_from_iter(TileGraphPa } /* Check need to make this memory node */ - std::vector& tag_stride_list = mem_node->get_tag_stride_list(); - std::vector key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list); + std::vector& tag_stride_list = mem_node->get_tag_stride_list(); + std::vector key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list); if (tog_parser->check_memory_tag(base_addr_name, key)) continue; tog_parser->register_memory_tag(base_addr_name, key); @@ -422,7 +422,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa std::shared_ptr inst = std::make_shared( Opcode::MOVIN, 0, 0, base_addr+offset, - mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(), + mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_elem_bits(), tag_list, tag_stride_list, accum_tag_list ); inst->set_addr_name(base_addr_name, base_addr_id); @@ -465,7 +465,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa /* Lookup given name's address */ std::string base_addr_name = mem_node->get_base_addr_name(); - int base_addr_id = tog_parser->register_addr_name(base_addr_name); + int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name); addr_type base_addr = tog_parser->lookup(base_addr_name); addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0); @@ -482,8 +482,8 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa std::shared_ptr inst = std::make_shared( Opcode::MOVOUT, 0, 0, base_addr+offset, - mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(), - std::vector(1), mem_node->get_tag_stride_list(), std::vector() + mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_elem_bits(), + std::vector(1, 0), mem_node->get_tag_stride_list(), std::vector() ); inst->set_addr_name(base_addr_name, base_addr_id); inst->prepare_tag_key(); @@ -500,15 +500,15 @@ 
std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa printIndexMap("[TOGParser] DMA Wait Node ", iter); std::shared_ptr wait_node = std::static_pointer_cast(tile_node); auto base_addr_name = wait_node->get_base_addr_name(); - int base_addr_id = tog_parser->register_addr_name(base_addr_name); + int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name); addr_type base_addr = tog_parser->lookup(base_addr_name); /* Lookup given name's address */ std::vector iter_list; - std::vector tag_list; - std::vector& tag_stride_list = wait_node->get_tag_stride_list(); - std::vector& tag_divider_list = wait_node->get_tag_divider_list(); - std::vector new_tag_stride_list; - std::vector accum_tag_list; + std::vector tag_list; + std::vector& tag_stride_list = wait_node->get_tag_stride_list(); + std::vector& tag_divider_list = wait_node->get_tag_divider_list(); + std::vector new_tag_stride_list; + std::vector accum_tag_list; auto& wait_tag_list = wait_node->get_tag_idx_list(); for (int i=0; i> TileLoopNode::get_tiles_from_iter(TileGraphPa } else if (tile_node->get_type() == TileType::COMPUTE_NODE) { printIndexMap("[TOGParser] Compute Node ", iter); std::shared_ptr compute_node = std::static_pointer_cast(tile_node); - std::vector tag_list = {0}; - std::vector tag_stride_list = {1}; - std::vector accum_tag_list; + std::vector tag_list = {0}; + std::vector tag_stride_list = {1}; + std::vector accum_tag_list; std::shared_ptr inst = std::make_shared( Opcode::COMP, compute_node->get_cycle(), 0, 0, @@ -587,9 +587,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa inst->add_child(child_inst); } } - /* Add instruction to tile */ - if (inst->get_opcode() == Opcode::MOVIN) - tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } link_map.clear(); /* iterate nested loop */ @@ -668,9 +665,6 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa inst->add_child(child_inst); } } - /* Add instruction to tile */ - if 
(inst->get_opcode() == Opcode::MOVIN) - tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision()); } return tile_vec; @@ -691,13 +685,13 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa _attribute_path = attribute_path; if (!std::filesystem::exists(onnx_path)) { - throw std::runtime_error("Error: ONNX file not found at path: " + onnx_path); + throw std::runtime_error("Error: TOG graph path not found: " + onnx_path); } /* Note: this parsing algorithm assume that all node are sorted in topological-order */ std::ifstream model_istream(onnx_path); google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream); onnx::ModelProto model_proto; - + /* Attribute parsing */ if (_attribute_config["address_info"]) { const auto& address_info = _attribute_config["address_info"]; @@ -744,7 +738,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa } load_sparse_meta_data(); - /* ONNX file parsing */ + /* TOG file parsing */ _tog_path = onnx_path; model_proto.ParseFromZeroCopyStream(&zero_copy_input) && model_istream.eof(); @@ -904,10 +898,10 @@ void TileGraphParser::register_tile(std::shared_ptr tile_node) { } } -std::vector TileGraphParser::calc_tag(std::vector& accum_tag, std::vector& tag_idx, std::vector& tag_stride) { - int key_offset = 0; - std::vector tag_key; - for (int i=0; i TileGraphParser::calc_tag(std::vector& accum_tag, std::vector& tag_idx, std::vector& tag_stride) { + int64_t key_offset = 0; + std::vector tag_key; + for (size_t i = 0; i < tag_idx.size(); i++) key_offset += tag_idx.at(i) * tag_stride.at(i); for (auto accum_dim : accum_tag) tag_key.push_back(accum_dim); @@ -915,12 +909,12 @@ std::vector TileGraphParser::calc_tag(std::vector& accum_tag, std::vec return tag_key; } -void TileGraphParser::register_memory_tag(std::string name, std::vector& tag_key) { +void TileGraphParser::register_memory_tag(std::string name, std::vector& tag_key) { 
assert(_tag_table.find(std::make_pair(name, tag_key))==_tag_table.end()); _tag_table[std::make_pair(name, tag_key)] = true; } -bool TileGraphParser::check_memory_tag(std::string name, std::vector& tag_key) { +bool TileGraphParser::check_memory_tag(std::string name, std::vector& tag_key) { return _tag_table.find(std::make_pair(name, tag_key))==_tag_table.end() ? false : true; } From 352309a0266d7fc10fa402794314cf6abfa27769 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Fri, 10 Apr 2026 19:05:25 +0900 Subject: [PATCH 163/194] [TOGSim] Update DRAM stat printing --- TOGSim/src/Dram.cc | 97 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc index 656e57f8..dcaf94bc 100644 --- a/TOGSim/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -1,5 +1,32 @@ #include "Dram.h" +#include + +namespace { + +/** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. */ +struct DramBwSnapshot { + double bandwidth_gbs = 0; + double util_avg_ch_pct = 0; +}; + +DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t window_cycles, + uint32_t n_ch, uint32_t req_size, uint32_t n_bl, + double dram_freq_mhz) { + DramBwSnapshot out; + if (window_cycles == 0 || n_ch == 0) + return out; + const double tx = static_cast(total_rw_transactions); + const double w = static_cast(window_cycles); + const double bytes_per_cycle = tx * static_cast(req_size) / w; + out.bandwidth_gbs = bytes_per_cycle * dram_freq_mhz / 1000.0; + const double avg_per_ch = tx / static_cast(n_ch); + out.util_avg_ch_pct = avg_per_ch * 100.0 * static_cast(n_bl) / (2.0 * w); + return out; +} + +} // namespace + uint32_t Dram::get_channel_id(mem_fetch* access) { uint32_t channel_id; if (_n_ch_per_partition >= 16) @@ -87,6 +114,39 @@ void DramRamulator2::cycle() { _mem[ch]->return_queue_pop(); } } + + if (_n_ch == 0) + return; + const int iv = _config.dram_print_interval; + if 
(iv <= 0) + return; + const uint64_t cc = *_core_cycles; + if (cc % static_cast(iv) != 0 || cc == 0) + return; + + const double f_mhz = static_cast(_config.dram_freq_mhz); + const uint64_t w = static_cast(iv); + long long r_all = 0; + long long w_all = 0; + for (int ch = 0; ch < _n_ch; ch++) { + const long long r = _mem[ch]->interval_reads(); + const long long wtxn = _mem[ch]->interval_writes(); + r_all += r; + w_all += wtxn; + const DramBwSnapshot bw = + make_dram_bw_snapshot(r + wtxn, w, 1u, _req_size, _n_bl, f_mhz); + spdlog::trace( + "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)", + ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w); + } + const DramBwSnapshot bw_all = + make_dram_bw_snapshot(r_all + w_all, w, _n_ch, _req_size, _n_bl, f_mhz); + spdlog::info( + "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)", + _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w); + for (int ch = 0; ch < _n_ch; ch++) { + _mem[ch]->reset_interval_bw_counters(); + } } void DramRamulator2::cache_cycle() { @@ -120,9 +180,44 @@ void DramRamulator2::pop(uint32_t cid) { } void DramRamulator2::print_stat() { + spdlog::info("========= DRAM stat ========="); + if (_n_ch == 0) + return; + + for (int ch = 0; ch < _n_ch; ch++) { + _mem[ch]->finalize_once(); + } + + spdlog::trace("=== Ramulator2 stats (channels 0.. 
{}) ===", _n_ch - 1); + for (int ch = 0; ch < _n_ch; ch++) { + std::cout << "--- channel " << ch << " ---\n"; + _mem[ch]->print_stats_yaml(std::cout); + } + std::cout.flush(); + + const uint64_t cycles = *_core_cycles; + if (cycles == 0) + return; + const double f_mhz = static_cast(_config.dram_freq_mhz); + spdlog::info("[DRAM] per-channel avg BW ({} sim cycles):", cycles); + long long tr_all = 0; + long long tw_all = 0; for (int ch = 0; ch < _n_ch; ch++) { - _mem[ch]->print(stdout); + const long long tr = _mem[ch]->total_reads(); + const long long tw = _mem[ch]->total_writes(); + tr_all += tr; + tw_all += tw; + const DramBwSnapshot bw = + make_dram_bw_snapshot(tr + tw, cycles, 1u, _req_size, _n_bl, f_mhz); + spdlog::info( + "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes", + ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw); } + const DramBwSnapshot bw_all = make_dram_bw_snapshot( + tr_all + tw_all, cycles, _n_ch, _req_size, _n_bl, f_mhz); + spdlog::info( + "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes", + _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all); } void DramRamulator2::print_cache_stats() { From d059a1930776756f43ae73d4f60d7ff63a2d70af Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 13 Apr 2026 16:05:44 +0900 Subject: [PATCH 164/194] [TOGSim] Fix conversion of global address to channel address --- TOGSim/include/Dram.h | 5 +-- TOGSim/src/Dram.cc | 64 +++++++++++++++++++++++++++++++++------ TOGSim/src/Instruction.cc | 4 +-- 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/TOGSim/include/Dram.h b/TOGSim/include/Dram.h index d28ac25f..978bcdf9 100644 --- a/TOGSim/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -29,6 +29,8 @@ class Dram { virtual void print_stat() {} virtual void print_cache_stats() {}; uint32_t get_channels_per_partition() { return _n_ch_per_partition; } + new_addr_type partition_dram_address(new_addr_type raw_addr) const; + protected: 
SimulationConfig _config; CacheConfig _m_cache_config; @@ -37,6 +39,7 @@ class Dram { uint32_t _n_partitions; uint32_t _n_ch_per_partition; uint32_t _req_size; + int _tx_log2 = 0; cycle_type _cycles; cycle_type* _core_cycles; std::vector> m_cache_latency_queue; @@ -83,8 +86,6 @@ class SimpleDRAM: public Dram { void print_cache_stats() override; private: int _latency = 1; - int _tx_ch_log2; - int _tx_log2; std::vector>> _mem; }; diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc index dcaf94bc..95a55ca3 100644 --- a/TOGSim/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -4,6 +4,28 @@ namespace { +static bool is_power_of_2_u32(uint32_t n) { return n != 0 && (n & (n - 1)) == 0; } + +static uint32_t floor_log2_u32(uint32_t n) { + uint32_t r = 0; + while (n >>= 1) + ++r; + return r; +} + +/** Smallest power of two >= n (n >= 1). */ +static uint32_t next_power_of_2_u32(uint32_t n) { + if (n <= 1) + return 1; + --n; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; +} + /** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. 
*/ struct DramBwSnapshot { double bandwidth_gbs = 0; @@ -27,14 +49,38 @@ DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t w } // namespace +new_addr_type Dram::partition_dram_address(new_addr_type raw_addr) const { + if (_req_size == 0 || _n_ch_per_partition == 0) + return raw_addr; + const new_addr_type tx = raw_addr >> _tx_log2; + const new_addr_type q = tx / _n_ch_per_partition; + return static_cast(q << _tx_log2); +} + uint32_t Dram::get_channel_id(mem_fetch* access) { - uint32_t channel_id; - if (_n_ch_per_partition >= 16) - channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, _n_ch_per_partition); - else - channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, 16) % _n_ch_per_partition; + uint32_t channel_in_partition = 0; + if (_n_ch_per_partition > 1) { + const new_addr_type tx = static_cast(access->get_addr() >> _tx_log2); + new_addr_type rest_high; + unsigned init_index = 0; + if (is_power_of_2_u32(_n_ch_per_partition)) { + const unsigned lb = floor_log2_u32(_n_ch_per_partition); + rest_high = tx >> lb; + init_index = static_cast(tx & (_n_ch_per_partition - 1u)); + } else { + /* gpgpu-sim "gap" channels: quotient / remainder split at txn granularity. */ + rest_high = tx / _n_ch_per_partition; + init_index = static_cast(tx % _n_ch_per_partition); + } + /* ipoly_hash_function only implements 16/32/64 (see Hashing.cc); fold like addrdec IPOLY + mod when needed. 
*/ + const uint32_t poly_n = next_power_of_2_u32(std::max(16u, _n_ch_per_partition)); + const uint32_t poly_use = std::min(poly_n, 64u); + channel_in_partition = + static_cast(ipoly_hash_function(rest_high, init_index, poly_use)) % _n_ch_per_partition; + } - channel_id += ((access->get_numa_id() % _n_partitions)* _n_ch_per_partition); + const uint32_t channel_id = + channel_in_partition + static_cast(access->get_numa_id() % _n_partitions) * _n_ch_per_partition; return channel_id; } @@ -46,6 +92,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _n_partitions = config.dram_num_partitions; _n_ch_per_partition = config.dram_channels_per_partitions; _config = config; + _tx_log2 = static_cast(std::log2(_req_size)); spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM Channels */ @@ -160,7 +207,8 @@ bool DramRamulator2::is_full(uint32_t cid, mem_fetch* request) { } void DramRamulator2::push(uint32_t cid, mem_fetch* request) { - addr_type target_addr = (request->get_addr() >> _tx_ch_log2) << _tx_log2; + const addr_type raw_addr = request->get_addr(); + const addr_type target_addr = partition_dram_address(raw_addr); request->set_addr(target_addr); m_from_crossbar_queue[cid].push(request); } @@ -233,8 +281,6 @@ SimpleDRAM::SimpleDRAM(SimulationConfig config, cycle_type* core_cycle) : Dram(c _mem.push_back(std::make_unique>("SimpleDRAM", true, -1)); } _latency = config.dram_latency; - _tx_log2 = log2(_req_size); - _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2; } bool SimpleDRAM::running() { diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc index 1dc3ff42..f236d160 100644 --- a/TOGSim/src/Instruction.cc +++ b/TOGSim/src/Instruction.cc @@ -108,10 +108,10 @@ std::shared_ptr> Instruction::get_dram_address(addr_type dra dim1*tile_stride.at(tile_stride.size() - 3) + \ dim2*tile_stride.at(tile_stride.size() - 
2) + \ dim3*tile_stride.at(tile_stride.size() - 1); - address = dram_addr + (address * _elem_bits + 7) / 8; + address = dram_addr + ((address * _elem_bits + 7) >> 3); /* parenthesized: '+' binds tighter than '>>', only the bit offset is converted to bytes */ if (indirect_index != NULL) { uint64_t index_val = indirect_index[index_count++]; - address += (index_val * _elem_bits + 7) / 8; + address += (index_val * _elem_bits + 7) >> 3; } address_set->insert(address - (address & dram_req_size-1)); } From b6805674fbfa07651bcd31076f56a97f491aebc8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 13 Apr 2026 22:16:58 +0900 Subject: [PATCH 165/194] [TOGSim] Adjust ramulator2.1 config --- TOGSim/extern/ramulator2 | 2 +- configs/ramulator2_configs/DDR4.yaml | 2 +- configs/ramulator2_configs/HBM2.yaml | 2 +- configs/ramulator2_configs/HBM2_TPUv3.yaml | 6 ++--- configs/ramulator2_configs/LPDDR5.yaml | 4 ++-- configs/ramulator2_configs/LPDDR5X.yaml | 18 +++++++-------- configs/ramulator2_configs/gen_configs.py | 26 +++++++++++++++++----- 7 files changed, 38 insertions(+), 22 deletions(-) diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2 index 70e85563..ad6acd97 160000 --- a/TOGSim/extern/ramulator2 +++ b/TOGSim/extern/ramulator2 @@ -1 +1 @@ -Subproject commit 70e855630b7f582bc8fa7370bfd582dc71d8af63 +Subproject commit ad6acd97e9fc60c44ed96a49267b7c20ab76e4d3 diff --git a/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml index 45799436..c4b16617 100644 --- a/configs/ramulator2_configs/DDR4.yaml +++ b/configs/ramulator2_configs/DDR4.yaml @@ -22,7 +22,7 @@ }, "refresh_manager": { "impl": "AllBank", - "scope": "Rank" + "scope": "Channel" }, "row_policy": { "impl": "Open" diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml index 2bdd1705..3dda8abf 100644 --- a/configs/ramulator2_configs/HBM2.yaml +++ b/configs/ramulator2_configs/HBM2.yaml @@ -11,7 +11,7 @@ }, "controllers": [ { - "impl": "GenericDDR", + "impl": "HBM", "wr_low_watermark": 0.2, "wr_high_watermark": 0.8, "read_buffer_size": 32,
diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml index 2bdd1705..01cab613 100644 --- a/configs/ramulator2_configs/HBM2_TPUv3.yaml +++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml @@ -11,11 +11,11 @@ }, "controllers": [ { - "impl": "GenericDDR", + "impl": "HBM", "wr_low_watermark": 0.2, "wr_high_watermark": 0.8, - "read_buffer_size": 32, - "write_buffer_size": 32, + "read_buffer_size": 64, + "write_buffer_size": 64, "priority_buffer_size": 1568, "scheduler": { "impl": "FRFCFS" diff --git a/configs/ramulator2_configs/LPDDR5.yaml b/configs/ramulator2_configs/LPDDR5.yaml index bf039f9f..cbb08b5e 100644 --- a/configs/ramulator2_configs/LPDDR5.yaml +++ b/configs/ramulator2_configs/LPDDR5.yaml @@ -11,7 +11,7 @@ }, "controllers": [ { - "impl": "GenericDDR", + "impl": "LPDDR5", "wr_low_watermark": 0.2, "wr_high_watermark": 0.8, "read_buffer_size": 32, @@ -22,7 +22,7 @@ }, "refresh_manager": { "impl": "AllBank", - "scope": "Rank" + "scope": "Channel" }, "row_policy": { "impl": "Open" diff --git a/configs/ramulator2_configs/LPDDR5X.yaml b/configs/ramulator2_configs/LPDDR5X.yaml index 4309aa6c..a8f454c4 100644 --- a/configs/ramulator2_configs/LPDDR5X.yaml +++ b/configs/ramulator2_configs/LPDDR5X.yaml @@ -11,7 +11,7 @@ }, "controllers": [ { - "impl": "GenericDDR", + "impl": "LPDDR5", "wr_low_watermark": 0.2, "wr_high_watermark": 0.8, "read_buffer_size": 32, @@ -22,7 +22,7 @@ }, "refresh_manager": { "impl": "AllBank", - "scope": "Rank" + "scope": "Channel" }, "row_policy": { "impl": "Open" @@ -52,7 +52,7 @@ 23, 46, 65, - 38, + 37, 11, 12, 2, @@ -63,7 +63,7 @@ 6, 6, 7, - 14, + 13, 22, 224, 128, @@ -220,7 +220,7 @@ [ 3 ], - 52 + 51 ], [ 1, @@ -314,7 +314,7 @@ [ 10 ], - 72 + 71 ], [ 1, @@ -361,7 +361,7 @@ 6, 8 ], - 28 + 27 ], [ 2, @@ -434,7 +434,7 @@ [ 2 ], - 52 + 51 ], [ 3, @@ -454,7 +454,7 @@ [ 0 ], - 72 + 71 ], [ 3, diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py index 
64eb62d2..d27cd6de 100644 --- a/configs/ramulator2_configs/gen_configs.py +++ b/configs/ramulator2_configs/gen_configs.py @@ -28,7 +28,12 @@ import ramulator.memory_system -def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"): +def _dram_standard_name(dram_obj): + """DRAMStandard.name from class or instance (e.g. 'HBM2', 'DDR4').""" + return getattr(type(dram_obj), "name", None) or getattr(dram_obj, "name", None) or "" + + +def make_config(dram_obj, clock_ratio=1, refresh_scope="Channel"): """Wrap a DRAM object in a single-channel GenericDRAM config for PyTorchSim. PyTorchSim creates one Ramulator2 instance per channel, so each config @@ -36,10 +41,22 @@ def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"): The wrapper overrides 'frontend' to ExternalFrontEnd automatically. refresh_scope: level name for AllBank refresh. - - DDR4 / LPDDR5 / LPDDR5X → "Rank" - - HBM2 / HBM3 → "PseudoChannel" + - DDR4 / LPDDR5 / LPDDR5X -> "Channel" + - HBM2 / HBM3 -> "PseudoChannel" + + Controller choice (matches C++ controller impls): + - HBM* -> ramulator.controller.HBM + - LPDDR* -> ramulator.controller.LPDDR5 (incl. 
LPDDR5X timing on the LPDDR5 DRAM model) + - otherwise -> GenericDDR """ - ctrl = ramulator.controller.GenericDDR( + dram_name = str(_dram_standard_name(dram_obj)).upper() + if dram_name.startswith("HBM"): + ctrl_cls = ramulator.controller.HBM + elif dram_name.startswith("LPDDR"): + ctrl_cls = ramulator.controller.LPDDR5 + else: + ctrl_cls = ramulator.controller.GenericDDR + ctrl = ctrl_cls( dram=dram_obj, scheduler=ramulator.scheduler.FRFCFS(), refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope), @@ -70,7 +87,6 @@ def gen_hbm2_tpuv3(): dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps") return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel") - def gen_ddr4(): # Available timing presets — check python/ramulator/dram/ddr4.py dram = ramulator.dram.DDR4(org_preset="DDR4_8Gb_x8", timing_preset="DDR4_3200AA") From 8bbb3c20d6372503812ce4201a9c00877eb79b1a Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 14 Apr 2026 11:40:50 +0900 Subject: [PATCH 166/194] [Version] Update a LLVM version dependency --- thirdparty/github-releases.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json index 25c220c9..34b63e54 100644 --- a/thirdparty/github-releases.json +++ b/thirdparty/github-releases.json @@ -8,7 +8,7 @@ }, "llvm_project": { "repository": "PSAL-POSTECH/llvm-project", - "release_tag": "v1.0.6", + "release_tag": "v1.0.7", "asset_name": "riscv-llvm-release.tar.gz" }, "spike": { From 93e8c7aad61f18286ce94f2e95a230007766450c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 14 Apr 2026 13:52:52 +0900 Subject: [PATCH 167/194] [MLIR] Update MLIR version --- PyTorchSimFrontend/extension_codecache.py | 2 +- thirdparty/github-releases.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 65c96f11..8da2d71c 100644 ---
a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -95,7 +95,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si -dma-fine-grained='systolic-array-size={vectorlane_size}' \ -global-idx='vlen={vlen}' \ -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \ - -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \ + -test-tile-operation-graph='vectorlane={vectorlane_size} sample-mode={extension_config.CONFIG_TLS_MODE}' \ -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \ -convert-linalg-to-loops \ -convert-vector-to-scf='full-unroll' \ diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json index 34b63e54..ec89c24f 100644 --- a/thirdparty/github-releases.json +++ b/thirdparty/github-releases.json @@ -8,7 +8,7 @@ }, "llvm_project": { "repository": "PSAL-POSTECH/llvm-project", - "release_tag": "v1.0.7", + "release_tag": "v1.0.8", "asset_name": "riscv-llvm-release.tar.gz" }, "spike": { From 0993319df2e9b0144f2c6ffe6ee415063aa08672 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 14 Apr 2026 23:50:02 +0900 Subject: [PATCH 168/194] [Autotune] subprocess timeouts from first finite-cycle wall time --- PyTorchSimFrontend/extension_codecache.py | 8 +++- PyTorchSimFrontend/extension_config.py | 4 ++ PyTorchSimFrontend/mlir/mlir_autotune.py | 12 +++-- .../mlir/mlir_codegen_backend.py | 44 ++++++++++++++++--- Simulator/simulator.py | 25 ++++++++++- 5 files changed, 77 insertions(+), 16 deletions(-) diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 8da2d71c..6192c47b 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -279,7 +279,7 @@ def task(): return key future = self.submit(task) - def run_kernel_simulation(*args, **kwargs): + def run_kernel_simulation(*args, 
autotune_subprocess_timeout_sec=None, **kwargs): # Wait for compilation key = future.result() from filelock import FileLock @@ -311,7 +311,11 @@ def run_kernel_simulation(*args, **kwargs): result = None # No result for non-autotune mode else: result_path = TOGSimulator.run_standalone( - onnx_path, kernel_attribute_path, autotune_mode=autotune) + onnx_path, + kernel_attribute_path, + autotune_mode=autotune, + timeout_sec=autotune_subprocess_timeout_sec, + ) result = TOGSimulator.get_result_from_file(result_path) return result return run_kernel_simulation diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index 5dec8a4b..cf8d806e 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -89,6 +89,10 @@ def __getattr__(name): return config_yaml["codegen_autotune_max_retry"] if name == "codegen_autotune_template_topk": return config_yaml["codegen_autotune_template_topk"] + # Added to first candidate wall time for other candidates' TOGSim subprocess timeout (>= 1 s). 
+ if name == "codegen_autotune_wall_slack_sec": + v = float(config_yaml.get("codegen_autotune_wall_slack_sec", 15)) + return max(1.0, v) # Compiler Optimization if name == "codegen_compiler_optimization": diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py index fe1f86a1..3489afbd 100644 --- a/PyTorchSimFrontend/mlir/mlir_autotune.py +++ b/PyTorchSimFrontend/mlir/mlir_autotune.py @@ -1,4 +1,3 @@ -import functools import torch import os import dataclasses @@ -76,7 +75,7 @@ def make_run_fn( latest_log_file = log_files_with_time[0][0] result_path = os.path.join(result_dir, latest_log_file) result = TOGSimulator.get_result_from_file(result_path) - def cached_run_fn(*args, **kwargs): + def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs): return result return cached_run_fn @@ -93,11 +92,10 @@ def cached_run_fn(*args, **kwargs): for tensor in list(input_tensors) + list(output_tensors) ] - # Generate partial function. - return functools.partial( - run_method, - *args, - ) + def schedule_run(autotune_subprocess_timeout_sec=None): + return run_method(*args, autotune_subprocess_timeout_sec=autotune_subprocess_timeout_sec) + + return schedule_run def update_workspace_size(self) -> None: # FIXME: Not implemented yet. 
Checkout torch/_inductor/codegen/rocm/rocm_benchmark_request.py diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 58d6a70d..492b7416 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1,6 +1,7 @@ import contextlib import sympy import sys +import time import re import os from functools import reduce @@ -1028,25 +1029,58 @@ def make_choices(self, nodes, kernel_name): return choices def autotune(self, *args): - def get_cycle(choice): + def get_cycle(choice, subprocess_timeout_sec=None): bench_runner = choice[0] for n_try in range(extension_config.codegen_autotune_max_retry): # TODO: make simple try: - out = bench_runner() + if subprocess_timeout_sec is not None: + out = bench_runner( + autotune_subprocess_timeout_sec=subprocess_timeout_sec + ) + else: + out = bench_runner() return out[-1] - except (extension_codecache.SpadOverflowError, RuntimeError) as e: + except (extension_codecache.SpadOverflowError, RuntimeError): return float("inf") return float("inf") # Exceeded maximum number of autotuning attempts choices = self.make_choices(*args) if len(choices) == 0: # Can't autotune return [None, None, None] + slack_sec = float(extension_config.codegen_autotune_wall_slack_sec) + # Get cycle time for each choice # Show progress bar only when CONFIG_DEBUG_MODE is off show_progress = not extension_config.CONFIG_DEBUG_MODE with ProgressBar("[Auto-tune] Running benchmarks", silent_mode=not show_progress) if show_progress else contextlib.nullcontext(): - with ThreadPoolExecutor(max_workers=8) as executor: - results = list(executor.map(get_cycle, choices)) + results = [float("inf")] * len(choices) + baseline_wall = None + parallel_from = 0 + + for idx, choice in enumerate(choices): + t0 = time.perf_counter() + c = get_cycle(choice, None) + elapsed = time.perf_counter() - t0 + results[idx] = c + parallel_from = idx + 1 + if c != float("inf"): + 
baseline_wall = elapsed + break + + pending = choices[parallel_from:] + if baseline_wall is not None and pending: + timeout_sec = baseline_wall + slack_sec + workers = min(8, len(pending), os.cpu_count() or 1) + executor = ThreadPoolExecutor(max_workers=workers) + try: + tail = list( + executor.map( + lambda ch: get_cycle(ch, timeout_sec), pending + ) + ) + finally: + executor.shutdown(wait=True, cancel_futures=True) + results[parallel_from : parallel_from + len(tail)] = tail min_idx = results.index(min(results)) if min(results) == float("inf"): diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 5b00d5d4..2b9f05be 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -509,7 +509,14 @@ def get_togsim_command(config_path, togsim_path=None): return cmd @staticmethod - def run_standalone(model_path, attribute_path="", autotune_mode=False, config_path=None, togsim_path=None): + def run_standalone( + model_path, + attribute_path="", + autotune_mode=False, + config_path=None, + togsim_path=None, + timeout_sec=None, + ): """ Run a single kernel simulation in standalone mode. This method starts a new TOGSim process, runs the kernel, and waits for completion. @@ -521,6 +528,8 @@ def run_standalone(model_path, attribute_path="", autotune_mode=False, config_pa autotune_mode: If True, run in autotune mode (silent) config_path: Path to TOGSim config file (required) togsim_path: Path to TOGSim directory (optional, defaults to CONFIG_TORCHSIM_DIR/TOGSim) + timeout_sec: If set, terminate the Simulator subprocess after this many seconds + (autotune uses this to skip very slow tile candidates).
Returns: Path to the simulation result log file @@ -559,7 +568,19 @@ def run_standalone(model_path, attribute_path="", autotune_mode=False, config_pa logger.debug(f"[TOGSim] cmd> {cmd}") logger.info("[TOGSim] TOGSim simulation started") with ProgressBar("[TOGSim] Running simulation", silent_mode=autotune_mode): - result = subprocess.check_output(shlex.split(cmd)) + completed = subprocess.run( + shlex.split(cmd), + capture_output=True, + check=True, + timeout=timeout_sec, + ) + result = completed.stdout + except subprocess.TimeoutExpired as e: + logger.warning( + "[TOGSim] Simulator subprocess exceeded timeout (%.1f s); terminating.", + float(timeout_sec) if timeout_sec is not None else -1.0, + ) + raise RuntimeError("TOGSim subprocess timeout") from e except subprocess.CalledProcessError as e: logger.error(f"[TOGSim] Command failed with exit code {e.returncode}") logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}") From 46c49541795d80eb4d62aa5ff88ac423c37f64b7 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 15 Apr 2026 22:30:50 +0900 Subject: [PATCH 169/194] [Autotune] Add non-subtiling option in tile_candidates --- PyTorchSimFrontend/mlir/mlir_gemm_template.py | 1 + configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py index 9c61c3d9..8a8cd585 100644 --- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py +++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py @@ -340,6 +340,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates): # Case 1: calculate sub tile size for fine-grained DMA if extension_config.CONFIG_SUBTILE: + full_tile_candidates.append([TILE_M, TILE_N, TILE_K]*2) SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else 
kernel.vector_lane if (TILE_M == M and TILE_N == N and TILE_N <= 512): SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml index f8ac0a54..a7607108 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml @@ -26,5 +26,5 @@ pytorchsim_timing_mode: 1 codegen_mapping_strategy: autotune codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 -codegen_autotune_template_topk: 4 +codegen_autotune_template_topk: 8 codegen_compiler_optimization: all From fbe0bc0ea7afe507c43068e2b15089657d1aaf16 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Thu, 16 Apr 2026 21:04:12 +0900 Subject: [PATCH 170/194] [Lower] Add filter condition --- PyTorchSimFrontend/mlir/mlir_lowering.py | 134 +++++++++++++++++++---- 1 file changed, 111 insertions(+), 23 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py index b717089f..7f33d956 100644 --- a/PyTorchSimFrontend/mlir/mlir_lowering.py +++ b/PyTorchSimFrontend/mlir/mlir_lowering.py @@ -1,5 +1,5 @@ import math -from typing import List, Optional, Sequence +from typing import Any, Callable, List, Optional, Sequence import torch from torch._inductor.lowering import lowerings, index_impl @@ -29,26 +29,67 @@ aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm") _orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable) -def tuned_mm(mat1, mat2, * ,layout=None): + +def _device_is_npu(device: Optional[torch.device]) -> bool: + return device is not None and device.type == "npu" + + +def _tensor_args_all_npu(*roots, optional=()) -> bool: + """True only if every tensor-like IR node under roots/optional is on an NPU device.""" + stack: list = list(roots) + list(optional) + while stack: + n = 
stack.pop() + if n is None: + continue + if isinstance(n, (list, tuple)): + stack.extend(n) + continue + get_dev = getattr(n, "get_device", None) + if get_dev is None: + continue + if not _device_is_npu(get_dev()): + return False + return True + + +def _override_lowerings_npu( + aten_op: Any, + mlir_impl: Callable[..., Any], + npu_ok: Callable[..., bool], +) -> None: + """Register mlir_impl for each overload; fall back to the prior lowering if npu_ok is false.""" + for overload in aten_op.overloads(): + op = getattr(aten_op, overload) + orig = lowerings.get(op) + + def wrapped(*args, _orig=orig, **kwargs): + if not npu_ok(*args, **kwargs): + return _orig(*args, **kwargs) + return mlir_impl(*args, **kwargs) + + lowerings[op] = wrapped + + +def _mlir_tuned_mm(mat1, mat2, *, layout=None): m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) mlir_template = MLIRGemmTemplate([mat1, mat2], layout) return mlir_template.generate(input_nodes=[mat1, mat2], layout=layout).output_node() -def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): +def _mlir_tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout) mlir_template = MLIRGemmTemplate([mat1, mat2, inp_expanded], layout) return mlir_template.generate().output_node() -def tuned_bmm(mat1, mat2, *, layout=None): +def _mlir_tuned_bmm(mat1, mat2, *, layout=None): m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) mlir_template = MLIRBMMTemplate([mat1, mat2], layout) return mlir_template.generate().output_node() -def tuned_flash_sdpa( +def _mlir_tuned_flash_sdpa( query : TensorBox, key : TensorBox, value : TensorBox, @@ -69,7 +110,6 @@ def tuned_flash_sdpa( return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None) - def conv_layout( x: TensorBox, weight: TensorBox, @@ -104,7 +144,7 @@ def conv_layout( stride, ) -def convolution( +def _mlir_convolution( x: 
TensorBox, weight: TensorBox, bias: TensorBox, @@ -176,7 +216,7 @@ def maxpool_layout( stride, ) -def custom_maxpool( +def _mlir_custom_maxpool( x: TensorBox, kernel_size: List[int], stride: List[int], @@ -197,7 +237,7 @@ def custom_maxpool( template_node = mlir_template.generate().output_node() return template_node, x # FIXME: x is dummy IRNode, indices are not used in our case -def sparse_addmm(*args, **kwargs): +def _mlir_sparse_addmm(*args, **kwargs): _, sp_mat1, sp_mat2 = args mat1_layout = sp_mat1.layout out_range = args[0].data.data.data.ranges @@ -207,7 +247,7 @@ def sparse_addmm(*args, **kwargs): ) return aten_spmm.bind((sp_mat1, sp_mat2), layout).output_node() -def custom_unsafe_index(x, indices): +def _mlir_custom_unsafe_index(x, indices): # We can't fuse indirect access + indexed_expression + computation if isinstance(x, TensorBox): x.realize() @@ -229,7 +269,7 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout: stride, ) -def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): +def _mlir_custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): if tensors and dim < 0: dim += len(tensors[0].get_size()) copy_default_lowering = lowerings.get(aten.copy_.default) @@ -255,7 +295,7 @@ def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0): mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim) return mlir_template.generate().output_node() -def custom_sort_default( +def _mlir_custom_sort_default( value: TensorBox, dim: int = -1, descending: bool = False, @@ -303,15 +343,63 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool): index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride) return value_layout, index_layout -lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()}) -lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()}) -lowerings.update({getattr(aten.convolution, overload): 
convolution for overload in aten.convolution.overloads()}) -lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()}) -lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()}) -lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()}) -lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()}) -lowerings.update({getattr(aten.sort, overload): custom_sort_default for overload in aten.sort.overloads()}) - +_override_lowerings_npu( + aten.mm, + _mlir_tuned_mm, + lambda mat1, mat2, **_: _tensor_args_all_npu(mat1, mat2), +) +_override_lowerings_npu( + aten.addmm, + _mlir_tuned_addmm, + lambda inp, mat1, mat2, **_: _tensor_args_all_npu(inp, mat1, mat2), +) +_override_lowerings_npu( + aten.convolution, + _mlir_convolution, + lambda *a, **_: len(a) >= 2 + and _tensor_args_all_npu(a[0], a[1], optional=(a[2] if len(a) > 2 else None,)), +) +_override_lowerings_npu( + aten.bmm, + _mlir_tuned_bmm, + lambda mat1, mat2, **_: _tensor_args_all_npu(mat1, mat2), +) +_override_lowerings_npu( + aten._sparse_addmm, + _mlir_sparse_addmm, + lambda *a, **_: len(a) >= 3 and _tensor_args_all_npu(a[1], a[2]), +) +_override_lowerings_npu( + aten._unsafe_index, + _mlir_custom_unsafe_index, + lambda x, indices, **_: _tensor_args_all_npu(x, indices), +) +_override_lowerings_npu( + aten.cat, + _mlir_custom_cat_default, + lambda *a, **_k: a and _tensor_args_all_npu(a[0]), +) +_override_lowerings_npu( + aten.sort, + _mlir_custom_sort_default, + lambda *a, **_k: a and _tensor_args_all_npu(a[0]), +) + if extension_config.CONFIG_USE_TIMING_POOLING: - lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template 
-lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()}) + _override_lowerings_npu( + aten.max_pool2d_with_indices, + _mlir_custom_maxpool, + lambda *a, **_: bool(a) and _tensor_args_all_npu(a[0]), + ) + +_override_lowerings_npu( + aten._scaled_dot_product_fused_attention_overrideable, + _mlir_tuned_flash_sdpa, + lambda *a, **k: len(a) >= 3 + and _tensor_args_all_npu( + a[0], + a[1], + a[2], + optional=(a[3] if len(a) > 3 else k.get("attn_bias"),), + ), +) From 901f93e0f3f5d1a797949d5d1a227c79836d22d6 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Mon, 20 Apr 2026 01:29:06 +0000 Subject: [PATCH 171/194] [Tutorial] ispass2026 session1 --- PyTorchSimFrontend/extension_config.py | 9 +- tutorial/session1/CompilerOptimization.ipynb | 65 ++++++++++++-- tutorial/session1/DNNServing.ipynb | 87 +++++++++++------- tutorial/session1/ExecutionMode.ipynb | 88 ++++++++++++++++--- tutorial/session1/Inference.ipynb | 31 ++++++- tutorial/session1/LogAnalysis.ipynb | 32 ++++++- tutorial/session1/Mapping.ipynb | 78 +++++++++++++--- tutorial/session1/Training.ipynb | 56 ++++++++++-- .../session1/tutorial_external_mapping.json | 2 +- 9 files changed, 362 insertions(+), 86 deletions(-) diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py index cf8d806e..d79ca390 100644 --- a/PyTorchSimFrontend/extension_config.py +++ b/PyTorchSimFrontend/extension_config.py @@ -41,9 +41,6 @@ def _default_tog_host_ldflags(): CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False)) CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False)) -CONFIG_TORCHSIM_DUMP_PATH = os.environ.get("TORCHSIM_DUMP_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "outputs")) -CONFIG_TORCHSIM_LOG_PATH = os.environ.get("TORCHSIM_LOG_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) 
-os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(CONFIG_TORCHSIM_DUMP_PATH, ".torchinductor") def __getattr__(name): # TOGSim config @@ -137,6 +134,12 @@ def __getattr__(name): if name == "CONFIG_TOGSIM_DEBUG_LEVEL": return os.environ.get("TOGSIM_DEBUG_LEVEL", "") + if name == "CONFIG_TORCHSIM_DUMP_PATH": + dump_path = os.environ.get('TORCHSIM_DUMP_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "outputs")) + os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(dump_path, ".torchinductor") + return dump_path + if name == "CONFIG_TORCHSIM_LOG_PATH": + return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results")) # SRAM Buffer allocation plan def load_plan_from_module(module_path): diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb index d17a6b25..6c23bfec 100644 --- a/tutorial/session1/CompilerOptimization.ipynb +++ b/tutorial/session1/CompilerOptimization.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:34:23.862488Z", + "iopub.status.busy": "2026-04-16T10:34:23.862221Z", + "iopub.status.idle": "2026-04-16T10:34:26.839597Z", + "shell.execute_reply": "2026-04-16T10:34:26.838615Z", + "shell.execute_reply.started": "2026-04-16T10:34:23.862467Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -31,7 +39,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:34:26.840859Z", + "iopub.status.busy": "2026-04-16T10:34:26.840581Z", + "iopub.status.idle": "2026-04-16T10:34:46.109858Z", + "shell.execute_reply": "2026-04-16T10:34:46.108862Z", + "shell.execute_reply.started": "2026-04-16T10:34:26.840841Z" + } + }, "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n", @@ -50,10 +66,18 @@ { "cell_type": "code", 
"execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:41:01.000313Z", + "iopub.status.busy": "2026-04-16T10:41:00.999980Z", + "iopub.status.idle": "2026-04-16T10:41:01.273172Z", + "shell.execute_reply": "2026-04-16T10:41:01.272081Z", + "shell.execute_reply.started": "2026-04-16T10:41:01.000290Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_060538/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_103442_5281e75b.log | grep \"Total execution cycle\"" ] }, { @@ -66,7 +90,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:44:29.448759Z", + "iopub.status.busy": "2026-04-16T10:44:29.448400Z", + "iopub.status.idle": "2026-04-16T10:44:41.303261Z", + "shell.execute_reply": "2026-04-16T10:44:41.302462Z", + "shell.execute_reply.started": "2026-04-16T10:44:29.448732Z" + } + }, "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n", @@ -85,12 +117,27 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:46:37.996794Z", + "iopub.status.busy": "2026-04-16T10:46:37.996476Z", + "iopub.status.idle": "2026-04-16T10:46:38.497173Z", + "shell.execute_reply": "2026-04-16T10:46:38.496104Z", + "shell.execute_reply.started": "2026-04-16T10:46:37.996776Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_055530/togsim_result.log | grep \"Total execution cycle\"\n", - "!cat /root/workspace/PyTorchSim/outputs/20251202_055532/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_104436_000cb9bc.log | grep \"Total execution cycle\"\n", + "!cat /workspace/PyTorchSim/togsim_results/20260416_104440_e50cdae1.log | grep 
\"Total execution cycle\"" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -109,7 +156,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb index 741f463f..0b4e0837 100644 --- a/tutorial/session1/DNNServing.ipynb +++ b/tutorial/session1/DNNServing.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T11:17:18.018872Z", + "iopub.status.busy": "2026-04-16T11:17:18.018643Z", + "iopub.status.idle": "2026-04-16T11:17:20.890421Z", + "shell.execute_reply": "2026-04-16T11:17:20.889693Z", + "shell.execute_reply.started": "2026-04-16T11:17:18.018853Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -30,29 +38,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T11:17:20.891167Z", + "iopub.status.busy": "2026-04-16T11:17:20.890953Z", + "iopub.status.idle": "2026-04-16T11:19:42.197046Z", + "shell.execute_reply": "2026-04-16T11:19:42.196023Z", + "shell.execute_reply.started": "2026-04-16T11:17:20.891152Z" + } + }, "outputs": [], "source": [ "import torch\n", "from torchvision.models import resnet18\n", - "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n", + "from Simulator.simulator import TOGSimulator\n", "from PyTorchSimFrontend import extension_config\n", "\n", - "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n", - "device = scheduler.execution_engine.module.custom_device()\n", + "device = torch.device(\"npu:0\")\n", + "config = extension_config.CONFIG_TOGSIM_CONFIG\n", "\n", "model = 
resnet18().eval()\n", "input = torch.randn(1, 3, 224, 224).to(device=device)\n", "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n", "\n", - "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n", - "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n", - "scheduler.add_request(request, request_time=0)\n", - "\n", "# Run scheduler\n", - "while not scheduler.is_finished():\n", - " with torch.no_grad():\n", - " scheduler.schedule()\n", + "with TOGSimulator(config_path=config):\n", + " torch.npu.launch_model(opt_fn, input, stream_index=0, timestamp=0)\n", "\n", "print(\"ResNet18 Simulation Done\")" ] @@ -73,37 +84,45 @@ "import os\n", "import torch\n", "from torchvision.models import resnet18\n", - "\n", - "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n", + "from Simulator.simulator import TOGSimulator\n", + "from PyTorchSimFrontend import extension_config\n", + "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", + "from Scheduler.scheduler import poisson_request_generator\n", "TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", "\n", - "lambda_requests = 10\n", - "max_time = 30\n", + "model0_lambda = 5.0\n", + "max_time_msec = 1000.0\n", "\n", "target_model1 = resnet18().eval()\n", "\n", - "# Init scheduler\n", - "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n", - "# Register compiled model\n", - "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n", - "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n", + "device = torch.device(\"npu:0\")\n", + "config = extension_config.CONFIG_TOGSIM_CONFIG\n", + "opt_model0 = 
torch.compile(target_model1.to(device=device, memory_format=torch.channels_last), dynamic=False)\n", "\n", - "# Generate time stamp\n", - "for request_time in poisson_request_generator(lambda_requests, max_time):\n", - " # Init input data\n", - " model_input1 = torch.randn(1, 3, 224, 224)\n", + "events = []\n", + "x = torch.randn(1, 3, 224, 224, device=device)\n", + "for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec):\n", + " events.append((t, 0, opt_model0, (x,))) # stream_index 0 → queue / partition 0\n", "\n", - " # Init request\n", - " new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n", + "events.sort(key=lambda e: e[0])\n", "\n", - " # Add request to scheduler\n", - " print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n", - " scheduler.add_request(new_request1, request_time=request_time)\n", "\n", - "# Run scheduler\n", - "while not scheduler.is_finished():\n", - " scheduler.schedule()" + "with TOGSimulator(config_path=config):\n", + " for t_msec, stream_index, model, args in events:\n", + " torch.npu.launch_model(\n", + " model,\n", + " *args,\n", + " stream_index=stream_index,\n", + " timestamp=int(t_msec),\n", + " )" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -122,7 +141,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb index d94323db..bd7d7d73 100644 --- a/tutorial/session1/ExecutionMode.ipynb +++ b/tutorial/session1/ExecutionMode.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:56:08.883802Z", + "iopub.status.busy": "2026-04-16T05:56:08.883406Z", + "iopub.status.idle": 
"2026-04-16T05:56:11.858647Z", + "shell.execute_reply": "2026-04-16T05:56:11.857788Z", + "shell.execute_reply.started": "2026-04-16T05:56:08.883784Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -30,7 +38,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:56:11.859394Z", + "iopub.status.busy": "2026-04-16T05:56:11.859139Z", + "iopub.status.idle": "2026-04-16T05:56:31.283787Z", + "shell.execute_reply": "2026-04-16T05:56:31.282907Z", + "shell.execute_reply.started": "2026-04-16T05:56:11.859372Z" + } + }, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", @@ -52,7 +68,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:56:37.980561Z", + "iopub.status.busy": "2026-04-16T05:56:37.980194Z", + "iopub.status.idle": "2026-04-16T05:56:46.194881Z", + "shell.execute_reply": "2026-04-16T05:56:46.194059Z", + "shell.execute_reply.started": "2026-04-16T05:56:37.980534Z" + } + }, "outputs": [], "source": [ "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n", @@ -74,7 +98,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:56:46.195666Z", + "iopub.status.busy": "2026-04-16T05:56:46.195511Z", + "iopub.status.idle": "2026-04-16T05:56:49.736201Z", + "shell.execute_reply": "2026-04-16T05:56:49.735438Z", + "shell.execute_reply.started": "2026-04-16T05:56:46.195650Z" + } + }, "outputs": [], "source": [ "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", @@ -97,7 +129,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:59:18.661437Z", + "iopub.status.busy": 
"2026-04-16T05:59:18.661188Z", + "iopub.status.idle": "2026-04-16T05:59:53.388013Z", + "shell.execute_reply": "2026-04-16T05:59:53.387130Z", + "shell.execute_reply.started": "2026-04-16T05:59:18.661408Z" + } + }, "outputs": [], "source": [ "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", @@ -112,10 +152,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T06:00:06.720227Z", + "iopub.status.busy": "2026-04-16T06:00:06.719962Z", + "iopub.status.idle": "2026-04-16T06:00:06.979872Z", + "shell.execute_reply": "2026-04-16T06:00:06.978988Z", + "shell.execute_reply.started": "2026-04-16T06:00:06.720210Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_160520/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_055926_3c61ae14.log | grep \"Total execution cycle\"" ] }, { @@ -128,7 +176,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T06:01:00.604737Z", + "iopub.status.busy": "2026-04-16T06:01:00.604494Z", + "iopub.status.idle": "2026-04-16T06:01:34.826968Z", + "shell.execute_reply": "2026-04-16T06:01:34.826043Z", + "shell.execute_reply.started": "2026-04-16T06:01:00.604717Z" + } + }, "outputs": [], "source": [ "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n", @@ -143,10 +199,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T06:01:53.294075Z", + "iopub.status.busy": "2026-04-16T06:01:53.293728Z", + "iopub.status.idle": "2026-04-16T06:01:53.549156Z", + "shell.execute_reply": "2026-04-16T06:01:53.548315Z", + "shell.execute_reply.started": "2026-04-16T06:01:53.294047Z" + } + }, "outputs": [], 
"source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_160547/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_060100_05df9481.log | grep \"Total execution cycle\"" ] }, { @@ -159,7 +223,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -173,7 +237,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb index 6fd54aed..caa5924e 100644 --- a/tutorial/session1/Inference.ipynb +++ b/tutorial/session1/Inference.ipynb @@ -11,7 +11,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:42:44.479626Z", + "iopub.status.busy": "2026-04-16T05:42:44.479480Z", + "iopub.status.idle": "2026-04-16T05:42:47.646477Z", + "shell.execute_reply": "2026-04-16T05:42:47.645578Z", + "shell.execute_reply.started": "2026-04-16T05:42:44.479609Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -31,7 +39,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:42:47.968708Z", + "iopub.status.busy": "2026-04-16T05:42:47.968420Z", + "iopub.status.idle": "2026-04-16T05:42:49.772696Z", + "shell.execute_reply": "2026-04-16T05:42:49.771704Z", + "shell.execute_reply.started": "2026-04-16T05:42:47.968688Z" + } + }, "outputs": [], "source": [ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", @@ -97,11 +113,18 @@ "source": [ "test_result(\"MatMul\", npu_out, cpu_out)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + 
"display_name": "base", "language": "python", "name": "python3" }, @@ -115,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb index 24dae52b..5cd14f41 100644 --- a/tutorial/session1/LogAnalysis.ipynb +++ b/tutorial/session1/LogAnalysis.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:00:05.422374Z", + "iopub.status.busy": "2026-04-16T10:00:05.422205Z", + "iopub.status.idle": "2026-04-16T10:00:08.512084Z", + "shell.execute_reply": "2026-04-16T10:00:08.511285Z", + "shell.execute_reply.started": "2026-04-16T10:00:05.422359Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -32,7 +40,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:00:46.974212Z", + "iopub.status.busy": "2026-04-16T10:00:46.973814Z", + "iopub.status.idle": "2026-04-16T10:00:52.152064Z", + "shell.execute_reply": "2026-04-16T10:00:52.151231Z", + "shell.execute_reply.started": "2026-04-16T10:00:46.974195Z" + } + }, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", @@ -54,7 +70,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T10:25:36.625640Z", + "iopub.status.busy": "2026-04-16T10:25:36.625388Z", + "iopub.status.idle": "2026-04-16T10:25:40.123959Z", + "shell.execute_reply": "2026-04-16T10:25:40.123131Z", + "shell.execute_reply.started": "2026-04-16T10:25:36.625622Z" + } + }, "outputs": [], "source": [ "os.environ['TOGSIM_DEBUG_LEVEL']=\"trace\"\n", @@ -90,7 +114,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } 
}, "nbformat": 4, diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb index 0b978bcb..92ddd5a8 100644 --- a/tutorial/session1/Mapping.ipynb +++ b/tutorial/session1/Mapping.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:49:05.540163Z", + "iopub.status.busy": "2026-04-16T05:49:05.539948Z", + "iopub.status.idle": "2026-04-16T05:49:08.550103Z", + "shell.execute_reply": "2026-04-16T05:49:08.549146Z", + "shell.execute_reply.started": "2026-04-16T05:49:05.540146Z" + } + }, "outputs": [], "source": [ "import torch\n", @@ -30,7 +38,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:49:08.550908Z", + "iopub.status.busy": "2026-04-16T05:49:08.550691Z", + "iopub.status.idle": "2026-04-16T05:49:28.225867Z", + "shell.execute_reply": "2026-04-16T05:49:28.225051Z", + "shell.execute_reply.started": "2026-04-16T05:49:08.550893Z" + } + }, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", @@ -45,10 +61,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:49:44.788982Z", + "iopub.status.busy": "2026-04-16T05:49:44.788640Z", + "iopub.status.idle": "2026-04-16T05:49:45.048201Z", + "shell.execute_reply": "2026-04-16T05:49:45.047229Z", + "shell.execute_reply.started": "2026-04-16T05:49:44.788954Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_154524/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_054924_5e1428f9.log | grep \"Total execution cycle\"" ] }, { @@ -62,7 +86,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:49:53.216985Z", + "iopub.status.busy": 
"2026-04-16T05:49:53.216635Z", + "iopub.status.idle": "2026-04-16T05:50:11.043854Z", + "shell.execute_reply": "2026-04-16T05:50:11.042989Z", + "shell.execute_reply.started": "2026-04-16T05:49:53.216960Z" + } + }, "outputs": [], "source": [ "torch._dynamo.reset()\n", @@ -79,10 +111,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:50:18.200344Z", + "iopub.status.busy": "2026-04-16T05:50:18.200118Z", + "iopub.status.idle": "2026-04-16T05:50:18.456838Z", + "shell.execute_reply": "2026-04-16T05:50:18.455901Z", + "shell.execute_reply.started": "2026-04-16T05:50:18.200327Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_141933/togsim_result.log | grep \"Total execution cycle\"" + "!cat /workspace/PyTorchSim/togsim_results/20260416_055004_6ef0f564.log | grep \"Total execution cycle\"" ] }, { @@ -95,7 +135,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T11:22:40.778257Z", + "iopub.status.busy": "2026-04-16T11:22:40.777947Z", + "iopub.status.idle": "2026-04-16T11:23:10.573193Z", + "shell.execute_reply": "2026-04-16T11:23:10.572225Z", + "shell.execute_reply.started": "2026-04-16T11:22:40.778230Z" + } + }, "outputs": [], "source": [ "torch._dynamo.reset()\n", @@ -112,10 +160,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T11:56:35.774938Z", + "iopub.status.busy": "2026-04-16T11:56:35.774682Z", + "iopub.status.idle": "2026-04-16T11:56:36.022450Z", + "shell.execute_reply": "2026-04-16T11:56:36.020569Z", + "shell.execute_reply.started": "2026-04-16T11:56:35.774921Z" + } + }, "outputs": [], "source": [ - "!cat /root/workspace/PyTorchSim/outputs/20251202_141951/togsim_result.log | grep \"Total execution cycle\"" + "!cat 
/workspace/PyTorchSim/togsim_results/20260416_112306_10ad96fd.log | grep \"Total execution cycle\"" ] }, { @@ -142,7 +198,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb index badf7ed7..1f86a5b8 100644 --- a/tutorial/session1/Training.ipynb +++ b/tutorial/session1/Training.ipynb @@ -10,7 +10,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:45:04.361593Z", + "iopub.status.busy": "2026-04-16T05:45:04.361471Z", + "iopub.status.idle": "2026-04-16T05:45:07.515245Z", + "shell.execute_reply": "2026-04-16T05:45:07.514397Z", + "shell.execute_reply.started": "2026-04-16T05:45:04.361578Z" + } + }, "outputs": [], "source": [ "import os\n", @@ -33,7 +41,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:45:07.516141Z", + "iopub.status.busy": "2026-04-16T05:45:07.515901Z", + "iopub.status.idle": "2026-04-16T05:45:07.635695Z", + "shell.execute_reply": "2026-04-16T05:45:07.634872Z", + "shell.execute_reply.started": "2026-04-16T05:45:07.516123Z" + } + }, "outputs": [], "source": [ "torch.manual_seed(0)\n", @@ -43,7 +59,7 @@ "cpu_input.requires_grad = True\n", "cpu_weight.requires_grad = True\n", "\n", - "opt_fn = torch.matmul\n", + "opt_fn = torch.compile(torch.matmul)\n", "cpu_out = opt_fn(cpu_input, cpu_weight)\n", "\n", "loss_fn = torch.nn.CrossEntropyLoss()\n", @@ -61,7 +77,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:45:07.636349Z", + "iopub.status.busy": "2026-04-16T05:45:07.636190Z", + "iopub.status.idle": "2026-04-16T05:45:13.350714Z", + "shell.execute_reply": "2026-04-16T05:45:13.349588Z", + 
"shell.execute_reply.started": "2026-04-16T05:45:07.636333Z" + } + }, "outputs": [], "source": [ "torch.manual_seed(0)\n", @@ -82,7 +106,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:45:13.351955Z", + "iopub.status.busy": "2026-04-16T05:45:13.351757Z", + "iopub.status.idle": "2026-04-16T05:45:13.356589Z", + "shell.execute_reply": "2026-04-16T05:45:13.355757Z", + "shell.execute_reply.started": "2026-04-16T05:45:13.351935Z" + } + }, "outputs": [], "source": [ "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", @@ -104,7 +136,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-16T05:45:13.357014Z", + "iopub.status.busy": "2026-04-16T05:45:13.356871Z", + "iopub.status.idle": "2026-04-16T05:45:13.361392Z", + "shell.execute_reply": "2026-04-16T05:45:13.360681Z", + "shell.execute_reply.started": "2026-04-16T05:45:13.357000Z" + } + }, "outputs": [], "source": [ "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n", @@ -121,7 +161,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -135,7 +175,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/tutorial/session1/tutorial_external_mapping.json b/tutorial/session1/tutorial_external_mapping.json index 3982d950..184a29da 100644 --- a/tutorial/session1/tutorial_external_mapping.json +++ b/tutorial/session1/tutorial_external_mapping.json @@ -2,6 +2,6 @@ "1024_1024_1024" : { "TILE_M" : 512, "TILE_N" : 512, - "TILE_K" : 512 + "TILE_K" : 256 } } \ No newline at end of file From 174b3cc258bee77d5a5aaaab5437c3c9ac8acc4c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 20 Apr 2026 12:25:58 +0900 Subject: 
[PATCH 172/194] [Tutorial] Add guideline for a hands-on --- PyTorchSimFrontend/mlir/mlir_ops.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py index 218f60a9..58e8b73b 100644 --- a/PyTorchSimFrontend/mlir/mlir_ops.py +++ b/PyTorchSimFrontend/mlir/mlir_ops.py @@ -331,6 +331,13 @@ def exp2(operand, *args, **kwargs): # Hands-on part: implement exp2 using math.exp2 # V.kernel.var_info = {operand: [tile_size, dtype]} # Ex) V.kernel.var_info[operand] = [8, "f32"] + # + # tile_size, dtype = V.kernel.var_info[operand] + # if tile_size > 1: + # shape = f"vector<{tile_size}x{dtype}>" + # else: + # shape = dtype + # return f'math.exp2 %{operand} : {shape}', [tile_size, dtype] ln2 = math.log(2) coeff = ops.constant(ln2, "f32") From 6d64afa6568998698d49f3444ea54684f1462669 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 20 Apr 2026 12:53:52 +0900 Subject: [PATCH 173/194] [Tutorial] Add CI for tutorial image --- .github/workflows/docker-tutorial-image.yml | 17 +++- tutorial/jupyterhub/Dockerfile.ksc2025 | 90 ----------------- tutorial/jupyterhub/Dockerfile.tutorial | 103 ++++++++++++++++++++ tutorial/jupyterhub/jupyterhub_config.py | 6 +- 4 files changed, 118 insertions(+), 98 deletions(-) delete mode 100644 tutorial/jupyterhub/Dockerfile.ksc2025 create mode 100644 tutorial/jupyterhub/Dockerfile.tutorial diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml index c0d8267d..e03bef22 100644 --- a/.github/workflows/docker-tutorial-image.yml +++ b/.github/workflows/docker-tutorial-image.yml @@ -2,7 +2,7 @@ name: Docker image for tutorial on: push: - branches: [ "tutorial" ] + branches: [ "ispass2026" ] jobs: build: @@ -25,11 +25,18 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - # Step 3: Build and Push Docker Image + - name: PyTorch base image from manifest + run: | + PYTORCH_IMAGE=$(python3 -c "import json; from 
pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')") + if [ -z "$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi + echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV" + - name: Build and Push Docker Image - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v6 with: context: . - file: ./tutorial/jupyterhub/Dockerfile.ksc2025 + file: ./tutorial/jupyterhub/Dockerfile.tutorial push: true - tags: ghcr.io/psal-postech/torchsim_ksc2025:latest + build-args: | + PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }} + tags: ghcr.io/psal-postech/torchsim-tutorial:ispass2026 diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025 deleted file mode 100644 index 7633c048..00000000 --- a/tutorial/jupyterhub/Dockerfile.ksc2025 +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2020 The Regents of the University of California -# All Rights Reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime - -# Copied from Gem5 Docker file -ENV DEBIAN_FRONTEND=noninteractive -RUN apt -y update && apt -y upgrade && \ - apt -y install build-essential git m4 scons zlib1g zlib1g-dev \ - libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ - python3-dev python-is-python3 doxygen libboost-all-dev \ - libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \ - python3-venv black libssl-dev libasan5 libubsan1 -RUN pip install mypy pre-commit jupyter pydot tabulate jupyterlab_execute_time - -# Pass Access Token securely -ENV PATH=$PATH:/root/.local/bin -ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH - -# Build Gem5 -RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial -RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) && git checkout TorchSim -ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt - -# Build LLVM RISC-V -RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1 -RUN cd llvm-project && mkdir build && cd build && \ - cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \ - make -j && make install - -# Store RISC-V LLVM for TorchSim -ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin -ENV TORCHSIM_DIR=/workspace/PyTorchSim - -# 
Download RISC-V tool chain -RUN apt install -y wget && \ - wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ - wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ - tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \ - rm *.tar.gz - -ENV RISCV=/workspace/riscv -ENV PATH=$RISCV/bin:$PATH - -# Install Spike simulator -RUN apt -y install device-tree-compiler -RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ - ../configure --prefix=$RISCV && make -j && make install - -# Install Proxy kernel -RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ - cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ - ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install - -# Install torchsim dependency -RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 - -# Prepare PyTorchSim project -RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial -RUN cd PyTorchSim/TOGSim && \ - git submodule update --recursive --init && \ - mkdir -p build && \ - cd build && \ - conan install .. --build=missing && \ - cmake .. && \ - make -j$(nproc) - -RUN pip install jupyterhub jupyterlab diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial new file mode 100644 index 00000000..5a0e7458 --- /dev/null +++ b/tutorial/jupyterhub/Dockerfile.tutorial @@ -0,0 +1,103 @@ +# Copyright (c) 2020 The Regents of the University of California +# All Rights Reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Base image: CI passes build-arg from ``thirdparty/github-releases.json`` (``pytorch_image``). +# Default matches that manifest for local ``docker build``. +ARG PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel +FROM ${PYTORCH_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive +WORKDIR /workspace + +# Build deps (Gem5 / LLVM / TorchSim); keep layer lean where possible. 
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + build-essential git m4 scons zlib1g zlib1g-dev \ + libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \ + python3-dev python-is-python3 doxygen libboost-all-dev \ + libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config \ + python3-venv black libssl-dev libasan5 libubsan1 \ + wget ca-certificates device-tree-compiler ninja-build \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir mypy pre-commit jupyter pydot tabulate \ + jupyterlab_execute_time onnx matplotlib conan==1.56.0 + +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Gem5 (TorchSim branch before build) +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \ + && cd gem5 && git checkout TorchSim \ + && scons build/RISCV/gem5.opt -j"$(nproc)" +ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt + +# LLVM MLIR (RISC-V) +RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch ispass2026 --depth 1 \ + && cd llvm-project && mkdir build && cd build \ + && cmake -G Ninja \ + -DLLVM_ENABLE_PROJECTS=mlir \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/riscv-llvm \ + -DLLVM_TARGETS_TO_BUILD=RISCV \ + ../llvm \ + && cmake --build . -j"$(nproc)" \ + && cmake --install . 
+ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_DIR=/workspace/PyTorchSim + +# RISC-V GNU toolchains (glibc + bare-metal), Ubuntu release bundles +RUN wget -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz \ + && wget -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz \ + && tar -xzf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz \ + && tar -xzf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz \ + && rm -f *.tar.gz + +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +# Spike +RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch ispass2026 \ + && cd riscv-isa-sim && mkdir build && cd build \ + && ../configure --prefix="$RISCV" && make -j"$(nproc)" && make install + +# Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git \ + && cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 \ + && mkdir build && cd build \ + && ../configure --prefix="$RISCV" --host=riscv64-unknown-elf \ + && make -j"$(nproc)" && make install + +# PyTorchSim + TOGSim +ENV CMAKE_POLICY_VERSION_MINIMUM=3.5 +RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026 \ + && cd PyTorchSim/TOGSim \ + && git submodule update --recursive --init \ + && mkdir -p build && cd build \ + && conan install .. --build=missing \ + && cmake -G Ninja .. \ + && cmake --build . 
-j"$(nproc)" + +RUN pip install --no-cache-dir jupyterhub jupyterlab diff --git a/tutorial/jupyterhub/jupyterhub_config.py b/tutorial/jupyterhub/jupyterhub_config.py index a43c0543..36b03981 100644 --- a/tutorial/jupyterhub/jupyterhub_config.py +++ b/tutorial/jupyterhub/jupyterhub_config.py @@ -6,11 +6,11 @@ # Spawner config # ------------------------------------------------------------------------------ c.JupyterHub.spawner_class = 'dockerspawner.DockerSpawner' -c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim_ksc2025:latest" +c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim-tutorial:ispass2026" # Resource limit -c.DockerSpawner.mem_limit = '16G' -c.DockerSpawner.cpu_limit = 4.0 +c.DockerSpawner.mem_limit = '32G' +c.DockerSpawner.cpu_limit = 8.0 c.DockerSpawner.network_name = 'jupyterhub-network' c.Spawner.default_url = '/lab' From 0043b0183162d2a8705af40b37aa65d3b8c0205b Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 20 Apr 2026 15:30:56 +0900 Subject: [PATCH 174/194] [Tutorial] Add missing script --- tutorial/jupyterhub/Dockerfile.tutorial | 1 + 1 file changed, 1 insertion(+) diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial index 5a0e7458..d10ed1bc 100644 --- a/tutorial/jupyterhub/Dockerfile.tutorial +++ b/tutorial/jupyterhub/Dockerfile.tutorial @@ -100,4 +100,5 @@ RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026 && cmake -G Ninja .. \ && cmake --build . -j"$(nproc)" +RUN cd PyTorchSimDevice && python3 -m pip install --no-build-isolation -e . 
RUN pip install --no-cache-dir jupyterhub jupyterlab From c83d3213a23358c65a4777a0ac7ba89446b58571 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 20 Apr 2026 16:16:41 +0900 Subject: [PATCH 175/194] [Tutorial] Fix paths in Dockerfile for gem5 and PyTorchSimDevice --- tutorial/jupyterhub/Dockerfile.tutorial | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial index d10ed1bc..303bcfa9 100644 --- a/tutorial/jupyterhub/Dockerfile.tutorial +++ b/tutorial/jupyterhub/Dockerfile.tutorial @@ -50,7 +50,7 @@ ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/l # Gem5 (TorchSim branch before build) RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \ - && cd gem5 && git checkout TorchSim \ + && cd gem5 \ && scons build/RISCV/gem5.opt -j"$(nproc)" ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt @@ -100,5 +100,5 @@ RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026 && cmake -G Ninja .. \ && cmake --build . -j"$(nproc)" -RUN cd PyTorchSimDevice && python3 -m pip install --no-build-isolation -e . +RUN cd PyTorchSim/PyTorchSimDevice && python3 -m pip install --no-build-isolation -e . 
RUN pip install --no-cache-dir jupyterhub jupyterlab From 50e210c0a769ca2e0291e4d8627a9f152fc5cbb5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 20 Apr 2026 16:22:06 +0900 Subject: [PATCH 176/194] [Tutorial] fix --- tutorial/jupyterhub/Dockerfile.tutorial | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial index 303bcfa9..6cb6d7d2 100644 --- a/tutorial/jupyterhub/Dockerfile.tutorial +++ b/tutorial/jupyterhub/Dockerfile.tutorial @@ -49,9 +49,10 @@ ENV PATH=$PATH:/root/.local/bin ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH # Gem5 (TorchSim branch before build) -RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \ +RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial \ && cd gem5 \ && scons build/RISCV/gem5.opt -j"$(nproc)" +RUN cd gem5 && git checkout ispass2026 ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt # LLVM MLIR (RISC-V) From 24062d1d215d18dac0fb9c94d962b7a463690a8f Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 21 Apr 2026 22:57:18 +0900 Subject: [PATCH 177/194] [Config] derive req size, freq, peak BW from Ramulator2; simplify simple DRAM YAML --- README.md | 19 +- TOGSim/extern/ramulator2 | 2 +- TOGSim/extern/ramulator_custom/.gitignore | 6 - TOGSim/extern/ramulator_custom/CMakeLists.txt | 11 - .../include/ramulator/Ramulator.hpp | 57 -- TOGSim/extern/ramulator_custom/src/Config.cpp | 68 -- TOGSim/extern/ramulator_custom/src/Config.h | 120 --- .../extern/ramulator_custom/src/Controller.h | 667 ----------------- TOGSim/extern/ramulator_custom/src/DDR4.cpp | 418 ----------- TOGSim/extern/ramulator_custom/src/DDR4.h | 220 ------ TOGSim/extern/ramulator_custom/src/DRAM.h | 453 ------------ TOGSim/extern/ramulator_custom/src/HBM.cpp | 413 ----------- TOGSim/extern/ramulator_custom/src/HBM.h | 228 ------ 
TOGSim/extern/ramulator_custom/src/Memory.h | 684 ------------------ .../ramulator_custom/src/MemoryFactory.cpp | 80 -- .../ramulator_custom/src/MemoryFactory.h | 84 --- .../extern/ramulator_custom/src/Ramulator.cpp | 171 ----- .../extern/ramulator_custom/src/Refresh.cpp | 255 ------- TOGSim/extern/ramulator_custom/src/Refresh.h | 137 ---- .../extern/ramulator_custom/src/Request.cpp | 90 --- TOGSim/extern/ramulator_custom/src/Request.h | 54 -- .../extern/ramulator_custom/src/Scheduler.h | 377 ---------- .../ramulator_custom/src/SpeedyController.h | 304 -------- .../extern/ramulator_custom/src/StatType.cpp | 153 ---- TOGSim/extern/ramulator_custom/src/StatType.h | 669 ----------------- .../extern/ramulator_custom/src/Statistics.h | 236 ------ TOGSim/include/Common.h | 3 +- TOGSim/include/Dram.h | 10 +- TOGSim/include/SimulationConfig.h | 29 +- TOGSim/include/Simulator.h | 5 +- TOGSim/src/Common.cc | 25 +- TOGSim/src/Dram.cc | 232 +++++- TOGSim/src/Simulator.cc | 6 +- TOGSim/src/main.cc | 22 +- configs/heterogeneous_c2_simple_noc.yml | 2 - configs/ramulator2_configs/HBM2_TPUv2.yaml | 476 ++++++++++++ configs/ramulator2_configs/HBM2_TPUv3.yaml | 70 +- configs/ramulator2_configs/gen_configs.py | 8 +- configs/ramulator_configs/ALDRAM-config.cfg | 30 - configs/ramulator_configs/DDR3-config.cfg | 31 - configs/ramulator_configs/DDR4-config.cfg | 31 - configs/ramulator_configs/DSARP-config.cfg | 31 - configs/ramulator_configs/GDDR5-config.cfg | 30 - configs/ramulator_configs/HBM-config.cfg | 32 - .../HBM-config_ChRaBaRoCo.cfg | 32 - configs/ramulator_configs/HBM-config_FCFS.cfg | 32 - .../ramulator_configs/HBM-config_FRFCFS.cfg | 32 - .../HBM-config_FRFCFS_Cap.cfg | 32 - .../HBM-config_FRFCFS_PriorHit.cfg | 32 - .../HBM-config_RoBaRaCoCh.cfg | 32 - .../HBM-config_RoCoBaRaCh.cfg | 32 - .../ramulator_configs/HBMx0.5ch-config.cfg | 30 - configs/ramulator_configs/HBMx2ch-config.cfg | 30 - configs/ramulator_configs/LPDDR3-config.cfg | 30 - 
configs/ramulator_configs/LPDDR4-config.cfg | 30 - configs/ramulator_configs/PCM-config.cfg | 30 - configs/ramulator_configs/SALP-config.cfg | 31 - configs/ramulator_configs/STTMRAM-config.cfg | 30 - configs/ramulator_configs/TLDRAM-config.cfg | 31 - configs/ramulator_configs/WideIO-config.cfg | 30 - configs/ramulator_configs/WideIO2-config.cfg | 30 - configs/stonne_big_c1_simple_noc.yml | 2 - configs/stonne_single_c1_simple_noc.yml | 2 - .../systolic_ws_128x128_c1_booksim_tpuv2.yml | 2 - .../systolic_ws_128x128_c1_booksim_tpuv3.yml | 2 - ...ystolic_ws_128x128_c1_simple_noc_tpuv2.yml | 2 - ...ystolic_ws_128x128_c1_simple_noc_tpuv3.yml | 2 - ...ic_ws_128x128_c1_simple_noc_tpuv3_half.yml | 2 - ...28x128_c1_simple_noc_tpuv3_timing_only.yml | 2 - ...ystolic_ws_128x128_c1_simple_noc_tpuv4.yml | 2 - .../systolic_ws_128x128_c2_booksim_tpuv3.yml | 2 - ...ws_128x128_c2_booksim_tpuv3_bw_quarter.yml | 9 +- .../systolic_ws_128x128_c2_chiplet_tpuv3.yml | 2 - ...olic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml | 2 - ...ystolic_ws_128x128_c2_simple_noc_tpuv2.yml | 2 - ...ystolic_ws_128x128_c2_simple_noc_tpuv3.yml | 2 - ...lic_ws_128x128_c2_simple_noc_tpuv3_ils.yml | 2 - ..._128x128_c2_simple_noc_tpuv3_partition.yml | 2 - ...ystolic_ws_128x128_c2_simple_noc_tpuv4.yml | 2 - configs/systolic_ws_8x8_c1_booksim.yml | 2 - configs/systolic_ws_8x8_c1_simple_noc.yml | 2 - .../session1/togsim_configs/togsim_config.yml | 2 - .../togsim_configs/togsim_config_2_cores.yml | 2 - .../togsim_configs/togsim_config_autotune.yml | 2 - .../togsim_config_external_mapping.yml | 2 - .../togsim_config_functional_only.yml | 2 - ...togsim_config_no_compiler_optimization.yml | 2 - .../togsim_config_timing_only.yml | 2 - 88 files changed, 814 insertions(+), 6822 deletions(-) delete mode 100644 TOGSim/extern/ramulator_custom/.gitignore delete mode 100644 TOGSim/extern/ramulator_custom/CMakeLists.txt delete mode 100644 TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp delete mode 100644 
TOGSim/extern/ramulator_custom/src/Config.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/Config.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Controller.h delete mode 100644 TOGSim/extern/ramulator_custom/src/DDR4.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/DDR4.h delete mode 100644 TOGSim/extern/ramulator_custom/src/DRAM.h delete mode 100644 TOGSim/extern/ramulator_custom/src/HBM.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/HBM.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Memory.h delete mode 100644 TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/MemoryFactory.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Ramulator.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/Refresh.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/Refresh.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Request.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/Request.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Scheduler.h delete mode 100644 TOGSim/extern/ramulator_custom/src/SpeedyController.h delete mode 100644 TOGSim/extern/ramulator_custom/src/StatType.cpp delete mode 100644 TOGSim/extern/ramulator_custom/src/StatType.h delete mode 100644 TOGSim/extern/ramulator_custom/src/Statistics.h create mode 100644 configs/ramulator2_configs/HBM2_TPUv2.yaml delete mode 100644 configs/ramulator_configs/ALDRAM-config.cfg delete mode 100644 configs/ramulator_configs/DDR3-config.cfg delete mode 100644 configs/ramulator_configs/DDR4-config.cfg delete mode 100644 configs/ramulator_configs/DSARP-config.cfg delete mode 100644 configs/ramulator_configs/GDDR5-config.cfg delete mode 100644 configs/ramulator_configs/HBM-config.cfg delete mode 100644 configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg delete mode 100644 configs/ramulator_configs/HBM-config_FCFS.cfg delete mode 100644 configs/ramulator_configs/HBM-config_FRFCFS.cfg delete mode 
100644 configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg delete mode 100644 configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg delete mode 100644 configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg delete mode 100644 configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg delete mode 100644 configs/ramulator_configs/HBMx0.5ch-config.cfg delete mode 100644 configs/ramulator_configs/HBMx2ch-config.cfg delete mode 100644 configs/ramulator_configs/LPDDR3-config.cfg delete mode 100644 configs/ramulator_configs/LPDDR4-config.cfg delete mode 100644 configs/ramulator_configs/PCM-config.cfg delete mode 100644 configs/ramulator_configs/SALP-config.cfg delete mode 100644 configs/ramulator_configs/STTMRAM-config.cfg delete mode 100644 configs/ramulator_configs/TLDRAM-config.cfg delete mode 100644 configs/ramulator_configs/WideIO-config.cfg delete mode 100644 configs/ramulator_configs/WideIO2-config.cfg diff --git a/README.md b/README.md index 03041355..a6dd399a 100644 --- a/README.md +++ b/README.md @@ -397,13 +397,18 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing "vpu_spad_size_kb_per_lane" : 128, // Scratchpad memory size per lane (KB) "vpu_vector_length_bits" : 256, // VPU vector register length (Bits) - "dram_type" : "ramulator2", // DRAM type (ex. 
ramulator2, simple) - "dram_freq_mhz" : 940, // DRAM frequency (MHz) - "dram_channels": 32, // Number of DRAM channels - "dram_req_size": 32, // DRAM request size (B) - "dram_latency" : 10, // DRAM latency (cycle) - "dram_nbl" : 2, // DRAM burst length size - "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", // Ramulator2 config file path + "dram_type" : "ramulator2", // DRAM type: ramulator2 | simple + "dram_channels": 32, // Number of DRAM channels (topology; required for both types) + "dram_stats_print_period_cycles": 10000, // Optional DRAM stats interval + // ramulator2: per-request size (bytes), DRAM MHz, and per-channel peak GB/s are derived from ramulator_config_path + // (peak ≈ timing[0] as MT/s × channel_width × pseudo-channels for HBM2/3; MHz from Ramulator tCK). + // Optional: if you set dram_freq_mhz, it must exactly match that derived MHz or initialization fails + // (the error message includes tCK in ns and the derived MHz for debugging stale yml values). + // Do not set dram_bandwidth_gbps_* at top level. + "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", + // simple: dram_latency + dram_channels + optional dram_req_size_byte (default 32). Omit + // dram_bandwidth_gbps_* for latency-only; dram_freq_mhz defaults to core_freq_mhz. + // With dram_bandwidth_gbps_* set, dram_freq_mhz is required (credit refill per DRAM cycle). 
"l2d_type" : "datacache", "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32", diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2 index ad6acd97..d33bf3ac 160000 --- a/TOGSim/extern/ramulator2 +++ b/TOGSim/extern/ramulator2 @@ -1 +1 @@ -Subproject commit ad6acd97e9fc60c44ed96a49267b7c20ab76e4d3 +Subproject commit d33bf3ac26f3e7f838386ff7923ea6bc3ba61c31 diff --git a/TOGSim/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore deleted file mode 100644 index 65a99dc1..00000000 --- a/TOGSim/extern/ramulator_custom/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -*.swp - -# Compiled Object files -obj/ - -# Compiled target executable files diff --git a/TOGSim/extern/ramulator_custom/CMakeLists.txt b/TOGSim/extern/ramulator_custom/CMakeLists.txt deleted file mode 100644 index 371de8df..00000000 --- a/TOGSim/extern/ramulator_custom/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(ramulator_project) - -file(GLOB_RECURSE RAMULATOR_SRCS CONFIGURE_DEPENDS src/*.cpp) -add_library(ramulator1 STATIC ${RAMULATOR_SRCS}) -target_include_directories(ramulator1 - PUBLIC include - PRIVATE include/ramulator - PRIVATE src -) -target_compile_options(ramulator1 PRIVATE -Wall -O3) diff --git a/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp deleted file mode 100644 index 4687b22b..00000000 --- a/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef __RAMULATOR_H -#define __RAMULATOR_H -#include -#include -#include -#include -#include -#include -#include -#include -namespace ram { -class MemoryBase; -class Request; -class Ramulator { -public: - Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim = false); - ~Ramulator(); - void tick(); - bool isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) const; - bool isAvailable(uint64_t Addr, bool IsWrite) const; - 
void push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); - void push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); - bool isEmpty(int CtrlID) const; - const void* top(int CtrlID) const; - void pop(int CtrlID); - int getAtomicBytes() const; - int getNumChannels() const; - int getChannel(uint64_t Addr) const; - void print_stats(); -private: - std::unique_ptr MemBase; - class OutputPendingQueue; - std::vector OutputPendingQueues; - using CallbackMap = - std::unordered_map>; - CallbackMap Callbacks; - robin_hood::unordered_flat_set hot_vids; - bool is_pim; - static std::unique_ptr createMemory(std::string ConfigFilePath, uint32_t num_core); -}; -class Ramulator::OutputPendingQueue { -public: - OutputPendingQueue(int Size); - bool isAvailable() const; - bool isAvailable(uint32_t count) const; - bool isEmpty() const; - void reserve(); - void push(void* original_req); - const void* top() const; - void pop(); -private: - const int Size; - int NumReserved; - std::queue PendingQueue; -}; -} // end namespace -#endif diff --git a/TOGSim/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp deleted file mode 100644 index a82f6e95..00000000 --- a/TOGSim/extern/ramulator_custom/src/Config.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#include "Config.h" - -using namespace std; -using namespace ram; - -RamulatorConfig::RamulatorConfig(const std::string& fname) { - options["mapping"] = "RoBaRaCoCh"; - options["scheduler"] = "FRFCFS"; - parse(fname); -} - -void RamulatorConfig::parse(const string& fname) -{ - ifstream file(fname); - assert(file.good() && "Bad config file"); - string line; - while (getline(file, line)) { - char delim[] = " \t="; - vector tokens; - - while (true) { - size_t start = line.find_first_not_of(delim); - if (start == string::npos) - break; - - size_t end = line.find_first_of(delim, start); - if (end == string::npos) { - tokens.push_back(line.substr(start)); - break; - } - - 
tokens.push_back(line.substr(start, end - start)); - line = line.substr(end); - } - - // empty line - if (!tokens.size()) - continue; - - // comment line - if (tokens[0][0] == '#') - continue; - - // parameter line - assert(tokens.size() == 2 && "Only allow two tokens in one line"); - - options[tokens[0]] = tokens[1]; - - if (tokens[0] == "channels") { - channels = atoi(tokens[1].c_str()); - } else if (tokens[0] == "ranks") { - ranks = atoi(tokens[1].c_str()); - } else if (tokens[0] == "subarrays") { - subarrays = atoi(tokens[1].c_str()); - } else if (tokens[0] == "cpu_tick") { - cpu_tick = atoi(tokens[1].c_str()); - } else if (tokens[0] == "mem_tick") { - mem_tick = atoi(tokens[1].c_str()); - } else if (tokens[0] == "expected_limit_insts") { - expected_limit_insts = atoi(tokens[1].c_str()); - } else if (tokens[0] == "warmup_insts") { - warmup_insts = atoi(tokens[1].c_str()); - } - } - file.close(); -} - - diff --git a/TOGSim/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h deleted file mode 100644 index 2d8c12ce..00000000 --- a/TOGSim/extern/ramulator_custom/src/Config.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef __CONFIG_H -#define __CONFIG_H - -#include -#include -#include -#include -#include -#include - -namespace ram -{ - -class RamulatorConfig { - -private: - std::map options; - int channels; - int ranks; - int subarrays; - int cpu_tick; - int mem_tick; - int core_num = 0; - long expected_limit_insts = 0; - long warmup_insts = 0; - -public: - RamulatorConfig() {} - RamulatorConfig(const std::string& fname); - void parse(const std::string& fname); - std::string operator [] (const std::string& name) const { - if (options.find(name) != options.end()) { - return (options.find(name))->second; - } else { - return ""; - } - } - - bool contains(const std::string& name) const { - if (options.find(name) != options.end()) { - return true; - } else { - return false; - } - } - - void add (const std::string& name, const std::string& value) { 
- if (!contains(name)) { - options.insert(make_pair(name, value)); - } else { - printf("ramulator::Config::add options[%s] already set.\n", name.c_str()); - } - } - - void set_core_num(int _core_num) {core_num = _core_num;} - - int get_channels() const {return channels;} - int get_subarrays() const {return subarrays;} - int get_ranks() const {return ranks;} - int get_cpu_tick() const {return cpu_tick;} - int get_mem_tick() const {return mem_tick;} - int get_core_num() const {return core_num;} - long get_expected_limit_insts() const {return expected_limit_insts;} - long get_warmup_insts() const {return warmup_insts;} - - bool has_l3_cache() const { - if (options.find("cache") != options.end()) { - const std::string& cache_option = (options.find("cache"))->second; - return (cache_option == "all") || (cache_option == "L3"); - } else { - return false; - } - } - bool has_core_caches() const { - if (options.find("cache") != options.end()) { - const std::string& cache_option = (options.find("cache"))->second; - return (cache_option == "all" || cache_option == "L1L2"); - } else { - return false; - } - } - bool is_early_exit() const { - // the default value is true - if (options.find("early_exit") != options.end()) { - if ((options.find("early_exit"))->second == "off") { - return false; - } - return true; - } - return true; - } - bool calc_weighted_speedup() const { - return (expected_limit_insts != 0); - } - bool record_cmd_trace() const { - // the default value is false - if (options.find("record_cmd_trace") != options.end()) { - if ((options.find("record_cmd_trace"))->second == "on") { - return true; - } - return false; - } - return false; - } - bool print_cmd_trace() const { - // the default value is false - if (options.find("print_cmd_trace") != options.end()) { - if ((options.find("print_cmd_trace"))->second == "on") { - return true; - } - return false; - } - return false; - } -}; - - -} /* namespace ram */ - -#endif /* _CONFIG_H */ diff --git 
a/TOGSim/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h deleted file mode 100644 index 75ebba17..00000000 --- a/TOGSim/extern/ramulator_custom/src/Controller.h +++ /dev/null @@ -1,667 +0,0 @@ -#ifndef __CONTROLLER_H -#define __CONTROLLER_H - -#include -#include -#include -#include -#include -#include -#include - -#include "Config.h" -#include "DRAM.h" -#include "Refresh.h" -#include "Request.h" -#include "Scheduler.h" -#include "Statistics.h" - -// #include "ALDRAM.h" -// #include "SALP.h" -// #include "TLDRAM.h" - -using namespace std; - -namespace ram -{ - - extern bool warmup_complete; - -template -class Controller -{ -protected: - // For counting bandwidth - ScalarStat read_transaction_bytes; - ScalarStat write_transaction_bytes; - - ScalarStat row_hits; - ScalarStat row_misses; - ScalarStat row_conflicts; - VectorStat read_row_hits; - VectorStat read_row_misses; - VectorStat read_row_conflicts; - VectorStat write_row_hits; - VectorStat write_row_misses; - VectorStat write_row_conflicts; - ScalarStat useless_activates; - - ScalarStat read_latency_avg; - ScalarStat read_latency_sum; - - ScalarStat req_queue_length_avg; - ScalarStat req_queue_length_sum; - ScalarStat read_req_queue_length_avg; - ScalarStat read_req_queue_length_sum; - ScalarStat write_req_queue_length_avg; - ScalarStat write_req_queue_length_sum; - - VectorStat record_read_hits; - VectorStat record_read_misses; - VectorStat record_read_conflicts; - VectorStat record_write_hits; - VectorStat record_write_misses; - VectorStat record_write_conflicts; - -public: - /* Member Variables */ - unsigned long clk = 0; - DRAM* channel; - - Scheduler* scheduler; // determines the highest priority request whose commands will be issued - RowPolicy* rowpolicy; // determines the row-policy (e.g., closed-row vs. 
open-row) - RowTable* rowtable; // tracks metadata about rows (e.g., which are open and for how long) - Refresh* refresh; - - struct Queue { - list q; - unsigned int max = 64; - unsigned int size() const {return q.size();} - }; - - Queue readq; // queue for read requests - Queue writeq; // queue for write requests - Queue actq; // read and write requests for which activate was issued are moved to - // actq, which has higher priority than readq and writeq. - // This is an optimization - // for avoiding useless activations (i.e., PRECHARGE - // after ACTIVATE w/o READ of WRITE command) - Queue otherq; // queue for all "other" requests (e.g., refresh) - - deque pending; // read requests that are about to receive data from DRAM - bool write_mode = false; // whether write requests should be prioritized over reads - float wr_high_watermark = 0.8f; // threshold for switching to write mode - float wr_low_watermark = 0.2f; // threshold for switching back to read mode - //long refreshed = 0; // last time refresh requests were generated - - /* Command trace for DRAMPower 3.1 */ - string cmd_trace_prefix = "cmd-trace-"; - vector cmd_trace_files; - bool record_cmd_trace = false; - /* Commands to stdout */ - bool print_cmd_trace = false; - RamulatorConfig& configs; - /* Constructor */ - Controller(RamulatorConfig& configs, DRAM* channel) : - configs(configs), - channel(channel), - cmd_trace_files(channel->children.size()) - { - scheduler = new Scheduler(this); - rowpolicy = new RowPolicy(this); - rowtable = new RowTable(this); - refresh = new Refresh(this); - - record_cmd_trace = configs.record_cmd_trace(); - print_cmd_trace = configs.print_cmd_trace(); - if (record_cmd_trace){ - if (configs["cmd_trace_prefix"] != "") { - cmd_trace_prefix = configs["cmd_trace_prefix"]; - } - string prefix = cmd_trace_prefix + "chan-" + to_string(channel->id) + "-rank-"; - string suffix = ".cmdtrace"; - for (unsigned int i = 0; i < channel->children.size(); i++) - cmd_trace_files[i].open(prefix + 
to_string(i) + suffix); - } - - // regStats - - row_hits - .name("row_hits_channel_"+to_string(channel->id) + "_core") - .desc("Number of row hits per channel per core") - .precision(0) - ; - row_misses - .name("row_misses_channel_"+to_string(channel->id) + "_core") - .desc("Number of row misses per channel per core") - .precision(0) - ; - row_conflicts - .name("row_conflicts_channel_"+to_string(channel->id) + "_core") - .desc("Number of row conflicts per channel per core") - .precision(0) - ; - - read_row_hits - .init(configs.get_core_num()) - .name("read_row_hits_channel_"+to_string(channel->id) + "_core") - .desc("Number of row hits for read requests per channel per core") - .precision(0) - ; - read_row_misses - .init(configs.get_core_num()) - .name("read_row_misses_channel_"+to_string(channel->id) + "_core") - .desc("Number of row misses for read requests per channel per core") - .precision(0) - ; - read_row_conflicts - .init(configs.get_core_num()) - .name("read_row_conflicts_channel_"+to_string(channel->id) + "_core") - .desc("Number of row conflicts for read requests per channel per core") - .precision(0) - ; - - write_row_hits - .init(configs.get_core_num()) - .name("write_row_hits_channel_"+to_string(channel->id) + "_core") - .desc("Number of row hits for write requests per channel per core") - .precision(0) - ; - write_row_misses - .init(configs.get_core_num()) - .name("write_row_misses_channel_"+to_string(channel->id) + "_core") - .desc("Number of row misses for write requests per channel per core") - .precision(0) - ; - write_row_conflicts - .init(configs.get_core_num()) - .name("write_row_conflicts_channel_"+to_string(channel->id) + "_core") - .desc("Number of row conflicts for write requests per channel per core") - .precision(0) - ; - - useless_activates - .name("useless_activates_"+to_string(channel->id)+ "_core") - .desc("Number of useless activations. 
E.g, ACT -> PRE w/o RD or WR") - .precision(0) - ; - - read_transaction_bytes - .name("read_transaction_bytes_"+to_string(channel->id)) - .desc("The total byte of read transaction per channel") - .precision(0) - ; - write_transaction_bytes - .name("write_transaction_bytes_"+to_string(channel->id)) - .desc("The total byte of write transaction per channel") - .precision(0) - ; - - read_latency_sum - .name("read_latency_sum_"+to_string(channel->id)) - .desc("The memory latency cycles (in memory time domain) sum for all read requests in this channel") - .precision(0) - ; - read_latency_avg - .name("read_latency_avg_"+to_string(channel->id)) - .desc("The average memory latency cycles (in memory time domain) per request for all read requests in this channel") - .precision(6) - ; - - req_queue_length_sum - .name("req_queue_length_sum_"+to_string(channel->id)) - .desc("Sum of read and write queue length per memory cycle per channel.") - .precision(0) - ; - req_queue_length_avg - .name("req_queue_length_avg_"+to_string(channel->id)) - .desc("Average of read and write queue length per memory cycle per channel.") - .precision(6) - ; - - read_req_queue_length_sum - .name("read_req_queue_length_sum_"+to_string(channel->id)) - .desc("Read queue length sum per memory cycle per channel.") - .precision(0) - ; - read_req_queue_length_avg - .name("read_req_queue_length_avg_"+to_string(channel->id)) - .desc("Read queue length average per memory cycle per channel.") - .precision(6) - ; - - write_req_queue_length_sum - .name("write_req_queue_length_sum_"+to_string(channel->id)) - .desc("Write queue length sum per memory cycle per channel.") - .precision(0) - ; - write_req_queue_length_avg - .name("write_req_queue_length_avg_"+to_string(channel->id)) - .desc("Write queue length average per memory cycle per channel.") - .precision(6) - ; - - record_read_hits - .init(configs.get_core_num()) - .name("record_read_hits") - .desc("record read hit count for this core when it reaches request 
limit or to the end") - ; - - record_read_misses - .init(configs.get_core_num()) - .name("record_read_misses") - .desc("record_read_miss count for this core when it reaches request limit or to the end") - ; - - record_read_conflicts - .init(configs.get_core_num()) - .name("record_read_conflicts") - .desc("record read conflict count for this core when it reaches request limit or to the end") - ; - - record_write_hits - .init(configs.get_core_num()) - .name("record_write_hits") - .desc("record write hit count for this core when it reaches request limit or to the end") - ; - - record_write_misses - .init(configs.get_core_num()) - .name("record_write_misses") - .desc("record write miss count for this core when it reaches request limit or to the end") - ; - - record_write_conflicts - .init(configs.get_core_num()) - .name("record_write_conflicts") - .desc("record write conflict for this core when it reaches request limit or to the end") - ; - } - - ~Controller(){ - delete scheduler; - delete rowpolicy; - delete rowtable; - delete channel; - delete refresh; - for (auto& file : cmd_trace_files) - file.close(); - cmd_trace_files.clear(); - } - - void finish(long read_req, long dram_cycles) { - read_latency_avg = read_latency_sum.value() / read_req; - req_queue_length_avg = req_queue_length_sum.value() / dram_cycles; - read_req_queue_length_avg = read_req_queue_length_sum.value() / dram_cycles; - write_req_queue_length_avg = write_req_queue_length_sum.value() / dram_cycles; - // call finish function of each channel - channel->finish(dram_cycles); - } - - /* Member Functions */ - Queue& get_queue(Request::Type type) - { - switch (int(type)) { - case int(Request::Type::READ): return readq; - case int(Request::Type::WRITE): return writeq; - default: return otherq; - } - } - - bool done() const { - return readq.size() == 0 && writeq.size() == 0; - } - - bool is_full(bool is_write) { - Request::Type type = is_write ? 
Request::Type::WRITE : Request::Type::READ; - auto& queue = get_queue(type); - assert(queue.size() <= queue.max); - return queue.size() == queue.max; - } - - bool enqueue(Request& req) - { - Queue& queue = get_queue(req.type); - if (queue.max == queue.size()) - return false; - - req.arrive = clk; - queue.q.push_back(req); - // shortcut for read requests, if a write to same addr exists - // necessary for coherence - // FIX: currently disable this because the write request of newfeature - // FIX: is same as read address - // if (req.type == Request::Type::READ && find_if(writeq.q.begin(), writeq.q.end(), - // [req](Request& wreq){ return req.addr == wreq.addr;}) != writeq.q.end()){ - // req.depart = clk + 1; - // pending.push_back(req); - // readq.q.pop_back(); - // } - return true; - } - - void tick() - { - clk++; - req_queue_length_sum += readq.size() + writeq.size() + pending.size(); - read_req_queue_length_sum += readq.size() + pending.size(); - write_req_queue_length_sum += writeq.size(); - - /*** 1. Serve completed reads ***/ - if (pending.size()) { - Request& req = pending[0]; - assert(req.type == Request::Type::READ); - if (req.depart <= clk) { - if (req.depart - req.arrive > 1) { // this request really accessed a row - read_latency_sum += req.depart - req.arrive; - channel->update_serving_requests( - req.addr_vec.data(), -1, clk); - } - req.callback(req); - pending.pop_front(); - } - } - - /*** 2. Refresh scheduler ***/ - refresh->tick_ref(); - - /*** 3. Should we schedule writes? ***/ - if (!write_mode) { - // yes -- write queue is almost full or read queue is empty - if (writeq.size() > int(wr_high_watermark * writeq.max) || readq.size() == 0) - write_mode = true; - } - else { - // no -- write queue is almost empty and read queue is not empty - if (writeq.size() < int(wr_low_watermark * writeq.max) && readq.size() != 0) - write_mode = false; - } - - /*** 4. 
Find the best command to schedule, if any ***/ - - // First check the actq (which has higher priority) to see if there - // are requests available to service in this cycle - Queue* queue = &actq; - typename T::Command cmd; - auto req = scheduler->get_head(queue->q); - - bool is_valid_req = (req != queue->q.end()); - - if(is_valid_req) { - cmd = get_first_cmd(req); - is_valid_req = is_ready(cmd, req->addr_vec); - } - - if (!is_valid_req) { - queue = !write_mode ? &readq : &writeq; - - if (otherq.size()) - queue = &otherq; // "other" requests are rare, so we give them precedence over reads/writes - - req = scheduler->get_head(queue->q); - - is_valid_req = (req != queue->q.end()); - - if(is_valid_req){ - cmd = get_first_cmd(req); - is_valid_req = is_ready(cmd, req->addr_vec); - } - } - - if (!is_valid_req) { - // we couldn't find a command to schedule -- let's try to be speculative - auto cmd = T::Command::PRE; - vector victim = rowpolicy->get_victim(cmd); - if (!victim.empty()){ - issue_cmd(cmd, victim); - } - return; // nothing more to be done this cycle - } - - if (req->is_first_command) { - req->is_first_command = false; - int coreid = req->coreid; - if (req->type == Request::Type::READ || req->type == Request::Type::WRITE) { - channel->update_serving_requests(req->addr_vec.data(), 1, clk); - } - int tx = (channel->spec->prefetch_size * channel->spec->channel_width / 8); - if (req->type == Request::Type::READ) { - if (is_row_hit(req)) { - ++read_row_hits[coreid]; - ++row_hits; - } else if (is_row_open(req)) { - ++read_row_conflicts[coreid]; - ++row_conflicts; - } else { - ++read_row_misses[coreid]; - ++row_misses; - } - read_transaction_bytes += tx; - } else if (req->type == Request::Type::WRITE) { - if (is_row_hit(req)) { - ++write_row_hits[coreid]; - ++row_hits; - } else if (is_row_open(req)) { - ++write_row_conflicts[coreid]; - ++row_conflicts; - } else { - ++write_row_misses[coreid]; - ++row_misses; - } - write_transaction_bytes += tx; - } - } - - // issue 
command on behalf of request - issue_cmd(cmd, get_addr_vec(cmd, req)); - - // check whether this is the last command (which finishes the request) - //if (cmd != channel->spec->translate[int(req->type)]){ - if (cmd != channel->spec->translate[int(req->type)]) { - if(channel->spec->is_opening(cmd)) { - // promote the request that caused issuing activation to actq - actq.q.push_back(*req); - queue->q.erase(req); - } - - return; - } - - // set a future completion time for read requests - if (req->type == Request::Type::READ) { - req->depart = clk + channel->spec->read_latency; - pending.push_back(*req); - } - - if (req->type == Request::Type::WRITE || req->type == Request::Type::PIM_WRITE) { - channel->update_serving_requests(req->addr_vec.data(), -1, clk); - req->callback(*req); - } - - // remove request from queue - queue->q.erase(req); - } - - bool is_ready(list::iterator req) - { - typename T::Command cmd = get_first_cmd(req); - return channel->check(cmd, req->addr_vec.data(), clk); - } - - bool is_ready(typename T::Command cmd, const vector& addr_vec) - { - return channel->check(cmd, addr_vec.data(), clk); - } - - bool is_row_hit(list::iterator req) - { - // cmd must be decided by the request type, not the first cmd - typename T::Command cmd = channel->spec->translate[int(req->type)]; - return channel->check_row_hit(cmd, req->addr_vec.data()); - } - - bool is_row_hit(typename T::Command cmd, const vector& addr_vec) - { - return channel->check_row_hit(cmd, addr_vec.data()); - } - - bool is_row_open(list::iterator req) - { - // cmd must be decided by the request type, not the first cmd - typename T::Command cmd = channel->spec->translate[int(req->type)]; - return channel->check_row_open(cmd, req->addr_vec.data()); - } - - bool is_row_open(typename T::Command cmd, const vector& addr_vec) - { - return channel->check_row_open(cmd, addr_vec.data()); - } - - // void update_temp(ALDRAM::Temp current_temperature) - // { - // } - - // For telling whether this channel is 
busying in processing read or write - bool is_active() { - return (channel->cur_serving_requests > 0); - } - - // For telling whether this channel is under refresh - bool is_refresh() { - return clk <= channel->end_of_refreshing; - } - - void set_high_writeq_watermark(const float watermark) { - wr_high_watermark = watermark; - } - - void set_low_writeq_watermark(const float watermark) { - wr_low_watermark = watermark; - } - - void record_core(int coreid) { - record_read_hits[coreid] = read_row_hits[coreid]; - record_read_misses[coreid] = read_row_misses[coreid]; - record_read_conflicts[coreid] = read_row_conflicts[coreid]; - record_write_hits[coreid] = write_row_hits[coreid]; - record_write_misses[coreid] = write_row_misses[coreid]; - record_write_conflicts[coreid] = write_row_conflicts[coreid]; - } - -private: - typename T::Command get_first_cmd(list::iterator req) - { - typename T::Command cmd = channel->spec->translate[int(req->type)]; - return channel->decode(cmd, req->addr_vec.data()); - } - - // upgrade to an autoprecharge command - void cmd_issue_autoprecharge(typename T::Command& cmd, - const vector& addr_vec) { - - // currently, autoprecharge is only used with closed row policy - if(channel->spec->is_accessing(cmd) && rowpolicy->type == RowPolicy::Type::ClosedAP) { - // check if it is the last request to the opened row - Queue* queue = write_mode ? 
&writeq : &readq; - - auto begin = addr_vec.begin(); - vector rowgroup(begin, begin + int(T::Level::Row) + 1); - - int num_row_hits = 0; - - for (auto itr = queue->q.begin(); itr != queue->q.end(); ++itr) { - if (is_row_hit(itr)) { - auto begin2 = itr->addr_vec.begin(); - vector rowgroup2(begin2, begin2 + int(T::Level::Row) + 1); - if(rowgroup == rowgroup2) - num_row_hits++; - } - } - - if(num_row_hits == 0) { - Queue* queue = &actq; - for (auto itr = queue->q.begin(); itr != queue->q.end(); ++itr) { - if (is_row_hit(itr)) { - auto begin2 = itr->addr_vec.begin(); - vector rowgroup2(begin2, begin2 + int(T::Level::Row) + 1); - if(rowgroup == rowgroup2) - num_row_hits++; - } - } - } - - assert(num_row_hits > 0); // The current request should be a hit, - // so there should be at least one request - // that hits in the current open row - if(num_row_hits == 1) { - if(cmd == T::Command::RD) - cmd = T::Command::RDA; - else if (cmd == T::Command::WR) - cmd = T::Command::WRA; - else - assert(false && "Unimplemented command type."); - } - } - - } - - void issue_cmd(typename T::Command cmd, const vector& addr_vec) - { - cmd_issue_autoprecharge(cmd, addr_vec); - assert(is_ready(cmd, addr_vec)); - channel->update(cmd, addr_vec.data(), clk); - - if(cmd == T::Command::PRE){ - if(rowtable->get_hits(addr_vec, true) == 0){ - useless_activates++; - } - } - - rowtable->update(cmd, addr_vec, clk); - if (record_cmd_trace){ - // select rank - auto& file = cmd_trace_files[addr_vec[1]]; - string& cmd_name = channel->spec->command_name[int(cmd)]; - file<spec->standard_name == "DDR4" || channel->spec->standard_name == "GDDR5") - bank_id += addr_vec[int(T::Level::Bank) - 1] * channel->spec->org_entry.count[int(T::Level::Bank)]; - file<<','<spec->command_name[int(cmd)].c_str(), clk); - for (int lev = 0; lev < int(T::Level::MAX); lev++) - printf(" %5d", addr_vec[lev]); - printf("\n"); - } - } - vector get_addr_vec(typename T::Command cmd, list::iterator req){ - return req->addr_vec; - } -}; - 
-// template <> -// vector Controller::get_addr_vec( -// SALP::Command cmd, list::iterator req); -// -// template <> -// bool Controller::is_ready(list::iterator req); -// -// template <> -// void Controller::update_temp(ALDRAM::Temp current_temperature); -// -// template <> -// void Controller::tick(); -// -// template <> -// void Controller::cmd_issue_autoprecharge(typename TLDRAM::Command& cmd, -// const vector& addr_vec); -// -} /*namespace ram*/ - -#endif /*__CONTROLLER_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp deleted file mode 100644 index 31064182..00000000 --- a/TOGSim/extern/ramulator_custom/src/DDR4.cpp +++ /dev/null @@ -1,418 +0,0 @@ -#include "DDR4.h" -#include "DRAM.h" - -using namespace std; -using namespace ram; - -string DDR4::standard_name = "DDR4"; -string DDR4::level_str [int(Level::MAX)] = {"Ch", "Ra", "Bg", "Ba", "Ro", "Co"}; - -map DDR4::org_map = { - {"DDR4_2Gb_x4", DDR4::Org::DDR4_2Gb_x4}, {"DDR4_2Gb_x8", DDR4::Org::DDR4_2Gb_x8}, {"DDR4_2Gb_x16", DDR4::Org::DDR4_2Gb_x16}, - {"DDR4_4Gb_x4", DDR4::Org::DDR4_4Gb_x4}, {"DDR4_4Gb_x8", DDR4::Org::DDR4_4Gb_x8}, {"DDR4_4Gb_x16", DDR4::Org::DDR4_4Gb_x16}, - {"DDR4_8Gb_x4", DDR4::Org::DDR4_8Gb_x4}, {"DDR4_8Gb_x8", DDR4::Org::DDR4_8Gb_x8}, {"DDR4_8Gb_x16", DDR4::Org::DDR4_8Gb_x16}, -}; - -map DDR4::speed_map = { - {"DDR4_1600K", DDR4::Speed::DDR4_1600K}, {"DDR4_1600L", DDR4::Speed::DDR4_1600L}, - {"DDR4_1866M", DDR4::Speed::DDR4_1866M}, {"DDR4_1866N", DDR4::Speed::DDR4_1866N}, - {"DDR4_2133P", DDR4::Speed::DDR4_2133P}, {"DDR4_2133R", DDR4::Speed::DDR4_2133R}, - {"DDR4_2400R", DDR4::Speed::DDR4_2400R}, {"DDR4_2400U", DDR4::Speed::DDR4_2400U}, - {"DDR4_3200", DDR4::Speed::DDR4_3200}, -}; - - -DDR4::DDR4(Org org, Speed speed) - : org_entry(org_table[int(org)]), - speed_entry(speed_table[int(speed)]), - read_latency(speed_entry.nCL + speed_entry.nBL) -{ - init_speed(); - init_prereq(); - init_rowhit(); // SAUGATA: added row hit function - 
init_rowopen(); - init_lambda(); - init_timing(); -} - -DDR4::DDR4(const string& org_str, const string& speed_str) : - DDR4(org_map[org_str], speed_map[speed_str]) -{ -} - -void DDR4::set_channel_number(int channel) { - org_entry.count[int(Level::Channel)] = channel; -} - -void DDR4::set_rank_number(int rank) { - org_entry.count[int(Level::Rank)] = rank; -} - -void DDR4::init_speed() -{ - const static int RRDS_TABLE[2][5] = { - {4, 4, 4, 4, 4}, - {5, 5, 6, 7, 9} - }; - const static int RRDL_TABLE[2][5] = { - {5, 5, 6, 6, 8}, - {6, 6, 7, 8, 11} - }; - const static int FAW_TABLE[3][5] = { - {16, 16, 16, 16, 16}, - {20, 22, 23, 26, 34}, - {28, 28, 32, 36, 48} - }; - const static int RFC_TABLE[int(RefreshMode::MAX)][3][5] = {{ - {128, 150, 171, 192, 256}, - {208, 243, 278, 312, 416}, - {280, 327, 374, 420, 560} - },{ - {88, 103, 118, 132, 176}, - {128, 150, 171, 192, 256}, - {208, 243, 278, 312, 416} - },{ - {72, 84, 96, 108, 144}, - {88, 103, 118, 132, 176}, - {128, 150, 171, 192, 256} - } - }; - const static int REFI_TABLE[5] = { - 6240, 7280, 8320, 9360, 12480 - }; - const static int XS_TABLE[3][5] = { - {136, 159, 182, 204, 272}, - {216, 252, 288, 324, 432}, - {288, 336, 384, 432, 576} - }; - - int speed = 0, density = 0; - switch (speed_entry.rate) { - case 1600: speed = 0; break; - case 1866: speed = 1; break; - case 2133: speed = 2; break; - case 2400: speed = 3; break; - case 3200: speed = 4; break; - default: assert(false); - }; - switch (org_entry.size >> 10){ - case 2: density = 0; break; - case 4: density = 1; break; - case 8: density = 2; break; - default: assert(false); - } - speed_entry.nRRDS = RRDS_TABLE[org_entry.dq == 16? 1: 0][speed]; - speed_entry.nRRDL = RRDL_TABLE[org_entry.dq == 16? 1: 0][speed]; - speed_entry.nFAW = FAW_TABLE[org_entry.dq == 4? 0: org_entry.dq == 8? 
1: 2][speed]; - speed_entry.nRFC = RFC_TABLE[(int)refresh_mode][density][speed]; - speed_entry.nREFI = (REFI_TABLE[speed] >> int(refresh_mode)); - speed_entry.nXS = XS_TABLE[density][speed]; -} - - -void DDR4::init_prereq() -{ - // RD - prereq[int(Level::Rank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return Command::MAX; - case int(State::ActPowerDown): return Command::PDX; - case int(State::PrePowerDown): return Command::PDX; - case int(State::SelfRefresh): return Command::SRX; - default: { - assert(false); - return Command::MAX; - } - }}; - prereq[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return Command::ACT; - case int(State::Opened): - if (node->row_state.find(id) != node->row_state.end()) - return cmd; - else return Command::PRE; - default: { - assert(false); - return Command::MAX; - } - }}; - - // WR - prereq[int(Level::Rank)][int(Command::WR)] = prereq[int(Level::Rank)][int(Command::RD)]; - prereq[int(Level::Bank)][int(Command::WR)] = prereq[int(Level::Bank)][int(Command::RD)]; - - // REF - prereq[int(Level::Rank)][int(Command::REF)] = [] (DRAM* node, Command cmd, int id) { - for (auto bg : node->children) - for (auto bank: bg->children) { - if (bank->state == State::Closed) - continue; - return Command::PREA; - } - return Command::REF;}; - - // PD - prereq[int(Level::Rank)][int(Command::PDE)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return Command::PDE; - case int(State::ActPowerDown): return Command::PDE; - case int(State::PrePowerDown): return Command::PDE; - case int(State::SelfRefresh): return Command::SRX; - default: { - assert(false); - return Command::MAX; - } - }}; - - // SR - prereq[int(Level::Rank)][int(Command::SRE)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return 
Command::SRE; - case int(State::ActPowerDown): return Command::PDX; - case int(State::PrePowerDown): return Command::PDX; - case int(State::SelfRefresh): return Command::SRE; - default: { - assert(false); - return Command::MAX; - } - }}; -} - -// SAUGATA: added row hit check functions to see if the desired location is currently open -void DDR4::init_rowhit() -{ - // RD - rowhit[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return false; - case int(State::Opened): - if (node->row_state.find(id) != node->row_state.end()) - return true; - return false; - default: { - assert(false); - return false; - } - }}; - - // WR - rowhit[int(Level::Bank)][int(Command::WR)] = rowhit[int(Level::Bank)][int(Command::RD)]; -} - -void DDR4::init_rowopen() -{ - // RD - rowopen[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return false; - case int(State::Opened): return true; - default: { - assert(false); - return false; - } - }}; - - // WR - rowopen[int(Level::Bank)][int(Command::WR)] = rowopen[int(Level::Bank)][int(Command::RD)]; -} - -void DDR4::init_lambda() -{ - lambda[int(Level::Bank)][int(Command::ACT)] = [] (DRAM* node, int id) { - node->state = State::Opened; - node->row_state[id] = State::Opened;}; - lambda[int(Level::Bank)][int(Command::PRE)] = [] (DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Rank)][int(Command::PREA)] = [] (DRAM* node, int id) { - for (auto bg : node->children) - for (auto bank : bg->children) { - bank->state = State::Closed; - bank->row_state.clear(); - }}; - lambda[int(Level::Rank)][int(Command::REF)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::WR)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::RDA)] = [] 
(DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Bank)][int(Command::WRA)] = [] (DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Rank)][int(Command::PDE)] = [] (DRAM* node, int id) { - for (auto bg : node->children) - for (auto bank : bg->children) { - if (bank->state == State::Closed) - continue; - node->state = State::ActPowerDown; - return; - } - node->state = State::PrePowerDown;}; - lambda[int(Level::Rank)][int(Command::PDX)] = [] (DRAM* node, int id) { - node->state = State::PowerUp;}; - lambda[int(Level::Rank)][int(Command::SRE)] = [] (DRAM* node, int id) { - node->state = State::SelfRefresh;}; - lambda[int(Level::Rank)][int(Command::SRX)] = [] (DRAM* node, int id) { - node->state = State::PowerUp;}; -} - - -void DDR4::init_timing() -{ - SpeedEntry& s = speed_entry; - vector *t; - - /*** Channel ***/ - t = timing[int(Level::Channel)]; - - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nBL}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL}); - t[int(Command::WR)].push_back({Command::WR, 1, s.nBL}); - t[int(Command::WR)].push_back({Command::WRA, 1, s.nBL}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nBL}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nBL}); - - - /*** Rank ***/ - t = timing[int(Level::Rank)]; - - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDS}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDS}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDS}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDS}); - t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDS}); - t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDS}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDS}); - t[int(Command::WRA)].push_back({Command::WRA, 
1, s.nCCDS}); - t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nBL + 2 - s.nCWL}); - t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nBL + 2 - s.nCWL}); - t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nBL + 2 - s.nCWL}); - t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nBL + 2 - s.nCWL}); - t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS}); - - // CAS <-> CAS (between sibling ranks) - t[int(Command::RD)].push_back({Command::RD, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RD)].push_back({Command::WR, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RD)].push_back({Command::WRA, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RDA)].push_back({Command::WR, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RDA)].push_back({Command::WRA, 1, s.nBL + s.nRTRS, true}); - t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true}); - t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true}); - t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true}); - t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true}); - t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true}); - t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true}); - t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true}); - t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + 
s.nRTRS - s.nCL, true}); - - t[int(Command::RD)].push_back({Command::PREA, 1, s.nRTP}); - t[int(Command::WR)].push_back({Command::PREA, 1, s.nCWL + s.nBL + s.nWR}); - - // CAS <-> PD - t[int(Command::RD)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1}); - t[int(Command::RDA)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1}); - t[int(Command::WR)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR}); - t[int(Command::WRA)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR + 1}); // +1 for pre - t[int(Command::PDX)].push_back({Command::RD, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::RDA, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::WR, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::WRA, 1, s.nXP}); - - // CAS <-> SR: none (all banks have to be precharged) - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDS}); - t[int(Command::ACT)].push_back({Command::ACT, 4, s.nFAW}); - t[int(Command::ACT)].push_back({Command::PREA, 1, s.nRAS}); - t[int(Command::PREA)].push_back({Command::ACT, 1, s.nRP}); - - // RAS <-> REF - t[int(Command::ACT)].push_back({Command::REF, 1, s.nRC}); - t[int(Command::PRE)].push_back({Command::REF, 1, s.nRP}); - t[int(Command::PREA)].push_back({Command::REF, 1, s.nRP}); - t[int(Command::RDA)].push_back({Command::REF, 1, s.nRTP + s.nRP}); - t[int(Command::WRA)].push_back({Command::REF, 1, s.nCWL + s.nBL + s.nWR + s.nRP}); - t[int(Command::REF)].push_back({Command::ACT, 1, s.nRFC}); - - // RAS <-> PD - t[int(Command::ACT)].push_back({Command::PDE, 1, 1}); - t[int(Command::PDX)].push_back({Command::ACT, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::PRE, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::PREA, 1, s.nXP}); - - // RAS <-> SR - t[int(Command::PRE)].push_back({Command::SRE, 1, s.nRP}); - t[int(Command::PREA)].push_back({Command::SRE, 1, s.nRP}); - t[int(Command::SRX)].push_back({Command::ACT, 1, s.nXS}); - - // REF <-> REF - t[int(Command::REF)].push_back({Command::REF, 1, 
s.nRFC}); - - // REF <-> PD - t[int(Command::REF)].push_back({Command::PDE, 1, 1}); - t[int(Command::PDX)].push_back({Command::REF, 1, s.nXP}); - - // REF <-> SR - t[int(Command::SRX)].push_back({Command::REF, 1, s.nXS}); - - // PD <-> PD - t[int(Command::PDE)].push_back({Command::PDX, 1, s.nPD}); - t[int(Command::PDX)].push_back({Command::PDE, 1, s.nXP}); - - // PD <-> SR - t[int(Command::PDX)].push_back({Command::SRE, 1, s.nXP}); - t[int(Command::SRX)].push_back({Command::PDE, 1, s.nXS}); - - // SR <-> SR - t[int(Command::SRE)].push_back({Command::SRX, 1, s.nCKESR}); - t[int(Command::SRX)].push_back({Command::SRE, 1, s.nXS}); - - /*** Bank Group ***/ - t = timing[int(Level::BankGroup)]; - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDL}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDL}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDL}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDL}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL}); - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDL}); - - /*** Bank ***/ - t = timing[int(Level::Bank)]; - - // CAS <-> RAS - t[int(Command::ACT)].push_back({Command::RD, 1, s.nRCD}); - t[int(Command::ACT)].push_back({Command::RDA, 1, s.nRCD}); - t[int(Command::ACT)].push_back({Command::WR, 1, s.nRCD}); - t[int(Command::ACT)].push_back({Command::WRA, 1, s.nRCD}); - - t[int(Command::RD)].push_back({Command::PRE, 1, s.nRTP}); - 
t[int(Command::WR)].push_back({Command::PRE, 1, s.nCWL + s.nBL + s.nWR}); - - t[int(Command::RDA)].push_back({Command::ACT, 1, s.nRTP + s.nRP}); - t[int(Command::WRA)].push_back({Command::ACT, 1, s.nCWL + s.nBL + s.nWR + s.nRP}); - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRC}); - t[int(Command::ACT)].push_back({Command::PRE, 1, s.nRAS}); - t[int(Command::PRE)].push_back({Command::ACT, 1, s.nRP}); -} diff --git a/TOGSim/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h deleted file mode 100644 index 0808dc80..00000000 --- a/TOGSim/extern/ramulator_custom/src/DDR4.h +++ /dev/null @@ -1,220 +0,0 @@ -#ifndef __DDR4_H -#define __DDR4_H - -#include -#include -#include -#include - -#include "Request.h" - -using namespace std; - -namespace ram -{ -template -class DRAM; - -class DDR4 -{ -public: - static string standard_name; - enum class Org; - enum class Speed; - DDR4(Org org, Speed speed); - DDR4(const string& org_str, const string& speed_str); - - static map org_map; - static map speed_map; - /* Level */ - enum class Level : int - { - Channel, Rank, BankGroup, Bank, Row, Column, MAX - }; - - static std::string level_str [int(Level::MAX)]; - - /* Command */ - enum class Command : int - { - ACT, PRE, PREA, - RD, WR, RDA, WRA, - REF, PDE, PDX, SRE, SRX, - MAX - }; - - string command_name[int(Command::MAX)] = { - "ACT", "PRE", "PREA", - "RD", "WR", "RDA", "WRA", - "REF", "PDE", "PDX", "SRE", "SRX" - }; - - Level scope[int(Command::MAX)] = { - Level::Row, Level::Bank, Level::Rank, - Level::Column, Level::Column, Level::Column, Level::Column, - Level::Rank, Level::Rank, Level::Rank, Level::Rank, Level::Rank - }; - - bool is_opening(Command cmd) - { - switch(int(cmd)) { - case int(Command::ACT): - return true; - default: - return false; - } - } - - bool is_accessing(Command cmd) - { - switch(int(cmd)) { - case int(Command::RD): - case int(Command::WR): - case int(Command::RDA): - case int(Command::WRA): - return true; 
- default: - return false; - } - } - - bool is_closing(Command cmd) - { - switch(int(cmd)) { - case int(Command::RDA): - case int(Command::WRA): - case int(Command::PRE): - case int(Command::PREA): - return true; - default: - return false; - } - } - - bool is_refreshing(Command cmd) - { - switch(int(cmd)) { - case int(Command::REF): - return true; - default: - return false; - } - } - - /* State */ - enum class State : int - { - Opened, Closed, PowerUp, ActPowerDown, PrePowerDown, SelfRefresh, MAX - } start[int(Level::MAX)] = { - State::MAX, State::PowerUp, State::MAX, State::Closed, State::Closed, State::MAX - }; - - /* Translate */ - Command translate[int(Request::Type::MAX)] = { - Command::RD, Command::WR, - Command::REF, Command::PDE, Command::SRE - }; - - /* Prereq */ - function*, Command cmd, int)> prereq[int(Level::MAX)][int(Command::MAX)]; - - // SAUGATA: added function object container for row hit status - /* Row hit */ - function*, Command cmd, int)> rowhit[int(Level::MAX)][int(Command::MAX)]; - function*, Command cmd, int)> rowopen[int(Level::MAX)][int(Command::MAX)]; - - /* Timing */ - struct TimingEntry - { - Command cmd; - int dist; - int val; - bool sibling; - }; - vector timing[int(Level::MAX)][int(Command::MAX)]; - - /* Lambda */ - function*, int)> lambda[int(Level::MAX)][int(Command::MAX)]; - - /* Organization */ - enum class Org : int - { - DDR4_2Gb_x4, DDR4_2Gb_x8, DDR4_2Gb_x16, - DDR4_4Gb_x4, DDR4_4Gb_x8, DDR4_4Gb_x16, - DDR4_8Gb_x4, DDR4_8Gb_x8, DDR4_8Gb_x16, - MAX - }; - - struct OrgEntry { - int size; - int dq; - int count[int(Level::MAX)]; - } org_table[int(Org::MAX)] = { - {2<<10, 4, {0, 0, 4, 4, 1<<15, 1<<10}}, {2<<10, 8, {0, 0, 4, 4, 1<<14, 1<<10}}, {2<<10, 16, {0, 0, 2, 4, 1<<14, 1<<10}}, - {4<<10, 4, {0, 0, 4, 4, 1<<16, 1<<10}}, {4<<10, 8, {0, 0, 4, 4, 1<<15, 1<<10}}, {4<<10, 16, {0, 0, 2, 4, 1<<15, 1<<10}}, - {8<<10, 4, {0, 0, 4, 4, 1<<17, 1<<10}}, {8<<10, 8, {0, 0, 4, 4, 1<<16, 1<<10}}, {8<<10, 16, {0, 0, 2, 4, 1<<16, 1<<10}} - }, 
org_entry; - - void set_channel_number(int channel); - void set_rank_number(int rank); - - /* Speed */ - enum class Speed : int - { - DDR4_1600K, DDR4_1600L, - DDR4_1866M, DDR4_1866N, - DDR4_2133P, DDR4_2133R, - DDR4_2400R, DDR4_2400U, - DDR4_3200, - MAX - }; - - enum class RefreshMode : int - { - Refresh_1X, - Refresh_2X, - Refresh_4X, - MAX - } refresh_mode = RefreshMode::Refresh_1X; - - int prefetch_size = 8; // 8n prefetch DDR - int channel_width = 64; - - struct SpeedEntry { - int rate; - double freq, tCK; - int nBL, nCCDS, nCCDL, nRTRS; - int nCL, nRCD, nRP, nCWL; - int nRAS, nRC; - int nRTP, nWTRS, nWTRL, nWR; - int nRRDS, nRRDL, nFAW; - int nRFC, nREFI; - int nPD, nXP, nXPDLL; // XPDLL not found in DDR4?? - int nCKESR, nXS, nXSDLL; // nXSDLL TBD (nDLLK), nXS = (tRFC+10ns)/tCK - } speed_table[int(Speed::MAX)] = { - {1600, (400.0/3)*6, (3/0.4)/6, 4, 4, 5, 2, 11, 11, 11, 9, 28, 39, 6, 2, 6, 12, 0, 0, 0, 0, 0, 4, 5, 0, 5, 0, 0}, - {1600, (400.0/3)*6, (3/0.4)/6, 4, 4, 5, 2, 12, 12, 12, 9, 28, 40, 6, 2, 6, 12, 0, 0, 0, 0, 0, 4, 5, 0, 5, 0, 0}, - {1866, (400.0/3)*7, (3/0.4)/7, 4, 4, 5, 2, 13, 13, 13, 10, 32, 45, 7, 3, 7, 14, 0, 0, 0, 0, 0, 5, 6, 0, 6, 0, 0}, - {1866, (400.0/3)*7, (3/0.4)/7, 4, 4, 5, 2, 14, 14, 14, 10, 32, 46, 7, 3, 7, 14, 0, 0, 0, 0, 0, 5, 6, 0, 6, 0, 0}, - {2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 15, 15, 15, 11, 36, 51, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0}, - {2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 16, 16, 16, 11, 36, 52, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0}, - {2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 16, 16, 16, 12, 39, 55, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0}, - {2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 18, 18, 18, 12, 39, 57, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0}, - {3200, 1600, 0.625, prefetch_size/2/*DDR*/, 4, 10, 2, 22, 22, 22, 16, 56, 78, 12, 4, 12, 24, 8, 10, 40, 0, 0, 8, 10, 0, 8, 0, 0} - //rate, freq, tCK, nBL, nCCDS nCCDL nRTRS nCL nRCD nRP nCWL nRAS nRC nRTP nWTRS nWTRL nWR nRRDS nRRDL nFAW 
nRFC nREFI nPD nXP nXPDLL nCKESR nXS nXSDLL - }, speed_entry; - - int read_latency; - -private: - void init_speed(); - void init_lambda(); - void init_prereq(); - void init_rowhit(); // SAUGATA: added function to check for row hits - void init_rowopen(); - void init_timing(); -}; - -} /*namespace ram*/ - -#endif /*__DDR4_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h deleted file mode 100644 index fe5405b6..00000000 --- a/TOGSim/extern/ramulator_custom/src/DRAM.h +++ /dev/null @@ -1,453 +0,0 @@ -#ifndef __DRAM_H -#define __DRAM_H - -#include "Statistics.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace std; - -namespace ram -{ - -template -class DRAM -{ -public: - ScalarStat active_cycles; - ScalarStat refresh_cycles; - ScalarStat busy_cycles; - ScalarStat active_refresh_overlap_cycles; - - ScalarStat serving_requests; - ScalarStat average_serving_requests; - - // Constructor - DRAM(T* spec, typename T::Level level); - ~DRAM(); - - // Specification (e.g., DDR3) - T* spec; - - // Tree Organization (e.g., Channel->Rank->Bank->Row->Column) - typename T::Level level; - int id; - long size; - DRAM* parent; - vector children; - - // State (e.g., Opened, Closed) - typename T::State state; - - // State of Rows: - // There are too many rows for them to be instantiated individually - // Instead, their bank (or an equivalent entity) tracks their state for them - robin_hood::unordered_flat_map row_state; - - // Insert a node as one of my child nodes - void insert(DRAM* child); - - // Decode a command into its "prerequisite" command (if any is needed) - typename T::Command decode(typename T::Command cmd, const int* addr); - - // Check whether a command is ready to be scheduled - bool check(typename T::Command cmd, const int* addr, long clk); - - // Check whether a command is a row hit - bool check_row_hit(typename T::Command cmd, const int* addr); - - // Check 
whether a row is open - bool check_row_open(typename T::Command cmd, const int* addr); - - // Return the earliest clock when a command is ready to be scheduled - long get_next(typename T::Command cmd, const int* addr); - - // Update the timing/state of the tree, signifying that a command has been issued - void update(typename T::Command cmd, const int* addr, long clk); - // Update statistics: - - // Update the number of requests it serves currently - void update_serving_requests(const int* addr, int delta, long clk); - - // TIANSHI: current serving requests count - int cur_serving_requests = 0; - long begin_of_serving = -1; - long end_of_serving = -1; - long begin_of_cur_reqcnt = -1; - long begin_of_refreshing = -1; - long end_of_refreshing = -1; - std::vector> refresh_intervals; - - // register statistics - void regStats(const std::string& identifier); - - void finish(long dram_cycles); - -private: - // Constructor - DRAM(){} - - // Timing - long cur_clk = 0; - long next[int(T::Command::MAX)]; // the earliest time in the future when a command could be ready - deque prev[int(T::Command::MAX)]; // the most recent history of when commands were issued - - // Lookup table for which commands must be preceded by which other commands (i.e., "prerequisite") - // E.g., a read command to a closed bank must be preceded by an activate command - function*, typename T::Command cmd, int)>* prereq; - - // SAUGATA: added table for row hits - // Lookup table for whether a command is a row hit - // E.g., a read command to a closed bank must be preceded by an activate command - function*, typename T::Command cmd, int)>* rowhit; - function*, typename T::Command cmd, int)>* rowopen; - - // Lookup table between commands and the state transitions they trigger - // E.g., an activate command to a closed bank opens both the bank and the row - function*, int)>* lambda; - - // Lookup table for timing parameters - // E.g., activate->precharge: tRAS@bank, activate->activate: tRC@bank - vector* 
timing; - - // Helper Functions - void update_state(typename T::Command cmd, const int* addr); - void update_timing(typename T::Command cmd, const int* addr, long clk); -}; /* class DRAM */ - - -// register statistics -template -void DRAM::regStats(const std::string& identifier) { - active_cycles - .name("active_cycles" + identifier + "_" + to_string(id)) - .desc("Total active cycles for level " + identifier + "_" + to_string(id)) - .precision(0) - ; - refresh_cycles - .name("refresh_cycles" + identifier + "_" + to_string(id)) - .desc("(All-bank refresh only, only valid for rank level) The sum of cycles that is under refresh per memory cycle for level " + identifier + "_" + to_string(id)) - .precision(0) - .flags(Stat::nozero) - ; - busy_cycles - .name("busy_cycles" + identifier + "_" + to_string(id)) - .desc("(All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level " + identifier + "_" + to_string(id)) - .precision(0) - ; - active_refresh_overlap_cycles - .name("active_refresh_overlap_cycles" + identifier + "_" + to_string(id)) - .desc("(All-bank refresh only, only valid for rank level) The sum of cycles that are both active and under refresh per memory cycle for level " + identifier + "_" + to_string(id)) - .precision(0) - .flags(Stat::nozero) - ; - serving_requests - .name("serving_requests" + identifier + "_" + to_string(id)) - .desc("The sum of read and write requests that are served in this DRAM element per memory cycle for level " + identifier + "_" + to_string(id)) - .precision(0) - ; - average_serving_requests - .name("average_serving_requests" + identifier + "_" + to_string(id)) - .desc("The average of read and write requests that are served in this DRAM element per memory cycle for level " + identifier + "_" + to_string(id)) - .precision(6) - ; - - if (!children.size()) { - return; - } - - // recursively register children statistics - for (auto child : children) 
{ - child->regStats(identifier + "_" + to_string(id)); - } -} - -template -void DRAM::finish(long dram_cycles) { - // finalize busy cycles - busy_cycles = active_cycles.value() + refresh_cycles.value() - active_refresh_overlap_cycles.value(); - - // finalize average serving requests - average_serving_requests = serving_requests.value() / dram_cycles; - - if (!children.size()) { - return; - } - - for (auto child : children) { - child->finish(dram_cycles); - } -} - -// Constructor -template -DRAM::DRAM(T* spec, typename T::Level level) : - spec(spec), level(level), id(0), parent(NULL) -{ - - state = spec->start[(int)level]; - prereq = spec->prereq[int(level)]; - rowhit = spec->rowhit[int(level)]; - rowopen = spec->rowopen[int(level)]; - lambda = spec->lambda[int(level)]; - timing = spec->timing[int(level)]; - - fill_n(next, int(T::Command::MAX), -1); // initialize future - for (int cmd = 0; cmd < int(T::Command::MAX); cmd++) { - int dist = 0; - for (auto& t : timing[cmd]) - dist = max(dist, t.dist); - - if (dist) - prev[cmd].resize(dist, -1); // initialize history - } - - // try to recursively construct my children - int child_level = int(level) + 1; - if (child_level == int(T::Level::Row)) - return; // stop recursion: rows are not instantiated as nodes - - int child_max = spec->org_entry.count[child_level]; - if (!child_max) - return; // stop recursion: the number of children is unspecified - - // recursively construct my children - for (int i = 0; i < child_max; i++) { - DRAM* child = new DRAM(spec, typename T::Level(child_level)); - child->parent = this; - child->id = i; - children.push_back(child); - } - -} - -template -DRAM::~DRAM() -{ - for (auto child: children) - delete child; -} - -// Insert -template -void DRAM::insert(DRAM* child) -{ - child->parent = this; - child->id = children.size(); - children.push_back(child); -} - -// Decode -template -typename T::Command DRAM::decode(typename T::Command cmd, const int* addr) -{ - int child_id = addr[int(level)+1]; 
- if (prereq[int(cmd)]) { - typename T::Command prereq_cmd = prereq[int(cmd)](this, cmd, child_id); - if (prereq_cmd != T::Command::MAX) - return prereq_cmd; // stop recursion: there is a prerequisite at this level - } - - if (child_id < 0 || !children.size()) - return cmd; // stop recursion: there were no prequisites at any level - - // recursively decode at my child - return children[child_id]->decode(cmd, addr); -} - - -// Check -template -bool DRAM::check(typename T::Command cmd, const int* addr, long clk) -{ - if (next[int(cmd)] != -1 && clk < next[int(cmd)]) - return false; // stop recursion: the check failed at this level - - int child_id = addr[int(level)+1]; - if (child_id < 0 || level == spec->scope[int(cmd)] || !children.size()) - return true; // stop recursion: the check passed at all levels - - // recursively check my child - return children[child_id]->check(cmd, addr, clk); -} - -// SAUGATA: added function to check whether a command is a row hit -// Check row hits -template -bool DRAM::check_row_hit(typename T::Command cmd, const int* addr) -{ - int child_id = addr[int(level)+1]; - if (rowhit[int(cmd)]) { - return rowhit[int(cmd)](this, cmd, child_id); // stop recursion: there is a row hit at this level - } - - if (child_id < 0 || !children.size()) - return false; // stop recursion: there were no row hits at any level - - // recursively check for row hits at my child - return children[child_id]->check_row_hit(cmd, addr); -} - -template -bool DRAM::check_row_open(typename T::Command cmd, const int* addr) -{ - int child_id = addr[int(level)+1]; - if (rowopen[int(cmd)]) { - return rowopen[int(cmd)](this, cmd, child_id); // stop recursion: there is a row hit at this level - } - - if (child_id < 0 || !children.size()) - return false; // stop recursion: there were no row hits at any level - - // recursively check for row hits at my child - return children[child_id]->check_row_open(cmd, addr); -} - -template -long DRAM::get_next(typename T::Command cmd, 
const int* addr) -{ - long next_clk = max(cur_clk, next[int(cmd)]); - auto node = this; - for (int l = int(level); l < int(spec->scope[int(cmd)]) && node->children.size() && addr[l + 1] >= 0; l++){ - node = node->children[addr[l + 1]]; - next_clk = max(next_clk, node->next[int(cmd)]); - } - return next_clk; -} - -// Update -template -void DRAM::update(typename T::Command cmd, const int* addr, long clk) -{ - cur_clk = clk; - update_state(cmd, addr); - update_timing(cmd, addr, clk); -} - - -// Update (State) -template -void DRAM::update_state(typename T::Command cmd, const int* addr) -{ - int child_id = addr[int(level)+1]; - if (lambda[int(cmd)]) - lambda[int(cmd)](this, child_id); // update this level - - if (level == spec->scope[int(cmd)] || !children.size()) - return; // stop recursion: updated all levels - - // recursively update my child - children[child_id]->update_state(cmd, addr); -} - - -// Update (Timing) -template -void DRAM::update_timing(typename T::Command cmd, const int* addr, long clk) -{ - // I am not a target node: I am merely one of its siblings - if (id != addr[int(level)]) { - for (auto& t : timing[int(cmd)]) { - if (!t.sibling) - continue; // not an applicable timing parameter - - assert (t.dist == 1); - - long future = clk + t.val; - next[int(t.cmd)] = max(next[int(t.cmd)], future); // update future - } - - return; // stop recursion: only target nodes should be recursed - } - - // I am a target node - if (prev[int(cmd)].size()) { - prev[int(cmd)].pop_back(); // FIXME TIANSHI why pop back? 
- prev[int(cmd)].push_front(clk); // update history - } - - for (auto& t : timing[int(cmd)]) { - if (t.sibling) - continue; // not an applicable timing parameter - - long past = prev[int(cmd)][t.dist-1]; - if (past < 0) - continue; // not enough history - - long future = past + t.val; - next[int(t.cmd)] = max(next[int(t.cmd)], future); // update future - // TIANSHI: for refresh statistics - if (spec->is_refreshing(cmd) && spec->is_opening(t.cmd)) { - assert(past == clk); - begin_of_refreshing = clk; - end_of_refreshing = max(end_of_refreshing, next[int(t.cmd)]); - refresh_cycles += end_of_refreshing - clk; - if (cur_serving_requests > 0) { - refresh_intervals.push_back(make_pair(begin_of_refreshing, end_of_refreshing)); - } - } - } - - // Some commands have timings that are higher that their scope levels, thus - // we do not stop at the cmd's scope level - if (!children.size()) - return; // stop recursion: updated all levels - - // recursively update *all* of my children - for (auto child : children) - child->update_timing(cmd, addr, clk); - -} - -template -void DRAM::update_serving_requests(const int* addr, int delta, long clk) { - assert(id == addr[int(level)]); - assert(delta == 1 || delta == -1); - // update total serving requests - if (begin_of_cur_reqcnt != -1 && cur_serving_requests > 0) { - serving_requests += (clk - begin_of_cur_reqcnt) * cur_serving_requests; - active_cycles += clk - begin_of_cur_reqcnt; - } - // update begin of current request number - begin_of_cur_reqcnt = clk; - cur_serving_requests += delta; - assert(cur_serving_requests >= 0); - - if (delta == 1 && cur_serving_requests == 1) { - // transform from inactive to active - begin_of_serving = clk; - if (end_of_refreshing > begin_of_serving) { - active_refresh_overlap_cycles += end_of_refreshing - begin_of_serving; - } - } else if (cur_serving_requests == 0) { - // transform from active to inactive - assert(begin_of_serving != -1); - assert(delta == -1); - active_cycles += clk - 
begin_of_cur_reqcnt; - end_of_serving = clk; - - for (const auto& ref: refresh_intervals) { - active_refresh_overlap_cycles += min(end_of_serving, ref.second) - ref.first; - } - refresh_intervals.clear(); - } - - int child_id = addr[int(level) + 1]; - // We only count the level bank or the level higher than bank - if (child_id < 0 || !children.size() || (int(level) > int(T::Level::Bank)) ) { - return; - } - children[child_id]->update_serving_requests(addr, delta, clk); -} - -} /* namespace ram */ - -#endif /* __DRAM_H */ diff --git a/TOGSim/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp deleted file mode 100644 index 00f8f704..00000000 --- a/TOGSim/extern/ramulator_custom/src/HBM.cpp +++ /dev/null @@ -1,413 +0,0 @@ -#include "HBM.h" -#include "DRAM.h" - -#include - -using namespace std; -using namespace ram; - -string HBM::standard_name = "HBM"; -string HBM::level_str [int(Level::MAX)] = {"Ch", "Ra", "Bg", "Ba", "Ro", "Co"}; - -map HBM::org_map = { - {"HBM_1Gb", HBM::Org::HBM_1Gb}, - {"HBM_2Gb", HBM::Org::HBM_2Gb}, - {"HBM_4Gb", HBM::Org::HBM_4Gb}, -}; - -map HBM::speed_map = { - {"HBM_1Gbps", HBM::Speed::HBM_1Gbps}, - {"HBM_2Gbps", HBM::Speed::HBM_2Gbps}, -}; - -HBM::HBM(Org org, Speed speed) - : org_entry(org_table[int(org)]), - speed_entry(speed_table[int(speed)]), - read_latency(speed_entry.nCL + speed_entry.nBL) -{ - init_speed(); - init_prereq(); - init_rowhit(); // SAUGATA: added row hit function - init_rowopen(); - init_lambda(); - init_timing(); -} - -HBM::HBM(const string& org_str, const string& speed_str) : - HBM(org_map[org_str], speed_map[speed_str]) -{ -} - -void HBM::set_channel_number(int channel) { - org_entry.count[int(Level::Channel)] = channel; -} - -void HBM::set_rank_number(int rank) { - org_entry.count[int(Level::Rank)] = rank; -} - - -void HBM::init_speed() -{ - const static int RFC_TABLE[int(Speed::MAX)][int(Org::MAX)] = { - {55, 80, 130}, - {110, 160, 260} - }; - const static int 
REFI1B_TABLE[int(Speed::MAX)][int(Org::MAX)] = { - {64, 128, 256}, - {128, 256, 512} - }; - const static int XS_TABLE[int(Speed::MAX)][int(Org::MAX)] = { - {60, 85, 135}, - {120, 170, 270} - }; - - int speed = 0, density = 0; - switch (speed_entry.rate) { - case 1000: speed = 0; break; - case 2000: speed = 1; break; - default: assert(false); - }; - switch (org_entry.size >> 10){ - case 1: density = 0; break; - case 2: density = 1; break; - case 4: density = 2; break; - default: assert(false); - } - speed_entry.nRFC = RFC_TABLE[speed][density]; - speed_entry.nREFI1B = REFI1B_TABLE[speed][density]; - speed_entry.nXS = XS_TABLE[speed][density]; -} - - -void HBM::init_prereq() -{ - // RD - prereq[int(Level::Rank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return Command::MAX; - case int(State::ActPowerDown): return Command::PDX; - case int(State::PrePowerDown): return Command::PDX; - case int(State::SelfRefresh): return Command::SRX; - default: { - assert(false); - return Command::MAX; - } - }}; - prereq[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return Command::ACT; - case int(State::Opened): - if (node->row_state.find(id) != node->row_state.end()) - return cmd; - else return Command::PRE; - default: { - assert(false); - return Command::MAX; - } - }}; - - // WR - prereq[int(Level::Rank)][int(Command::WR)] = prereq[int(Level::Rank)][int(Command::RD)]; - prereq[int(Level::Rank)][int(Command::PIM_WR)] = prereq[int(Level::Rank)][int(Command::RD)]; - - prereq[int(Level::Bank)][int(Command::WR)] = prereq[int(Level::Bank)][int(Command::RD)]; - - // REF - prereq[int(Level::Rank)][int(Command::REF)] = [] (DRAM* node, Command cmd, int id) { - for (auto bg : node->children) - for (auto bank: bg->children) { - if (bank->state == State::Closed) - continue; - return Command::PREA; - } - return Command::REF;}; - - // 
REFSB - prereq[int(Level::Bank)][int(Command::REFSB)] = [] (DRAM* node, Command cmd, int id) { - if (node->state == State::Closed) return Command::REFSB; - return Command::PRE;}; - - // PD - prereq[int(Level::Rank)][int(Command::PDE)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return Command::PDE; - case int(State::ActPowerDown): return Command::PDE; - case int(State::PrePowerDown): return Command::PDE; - case int(State::SelfRefresh): return Command::SRX; - default: { - assert(false); - return Command::MAX; - } - }}; - - // SR - prereq[int(Level::Rank)][int(Command::SRE)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::PowerUp): return Command::SRE; - case int(State::ActPowerDown): return Command::PDX; - case int(State::PrePowerDown): return Command::PDX; - case int(State::SelfRefresh): return Command::SRE; - default: { - assert(false); - return Command::MAX; - } - }}; -} - -// SAUGATA: added row hit check functions to see if the desired location is currently open -void HBM::init_rowhit() -{ - // RD - rowhit[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return false; - case int(State::Opened): - if (node->row_state.find(id) != node->row_state.end()) - return true; - return false; - default: { - assert(false); - return false; - } - }}; - - // WR - rowhit[int(Level::Bank)][int(Command::WR)] = rowhit[int(Level::Bank)][int(Command::RD)]; - rowhit[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM* node, Command cmd, int id) { - return true; - }; -} - -void HBM::init_rowopen() -{ - // RD - rowopen[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, Command cmd, int id) { - switch (int(node->state)) { - case int(State::Closed): return false; - case int(State::Opened): return true; - default: { - assert(false); - return false; - } - }}; - - // WR - 
rowopen[int(Level::Bank)][int(Command::WR)] = rowopen[int(Level::Bank)][int(Command::RD)]; - rowopen[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM* node, Command cmd, int id) { - return true; - }; -} - -void HBM::init_lambda() -{ - lambda[int(Level::Bank)][int(Command::ACT)] = [] (DRAM* node, int id) { - node->state = State::Opened; - node->row_state[id] = State::Opened;}; - lambda[int(Level::Bank)][int(Command::PRE)] = [] (DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Rank)][int(Command::PREA)] = [] (DRAM* node, int id) { - for (auto bg : node->children) - for (auto bank : bg->children) { - bank->state = State::Closed; - bank->row_state.clear(); - }}; - lambda[int(Level::Rank)][int(Command::REF)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::RD)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::WR)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM* node, int id) {}; - lambda[int(Level::Bank)][int(Command::RDA)] = [] (DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Bank)][int(Command::WRA)] = [] (DRAM* node, int id) { - node->state = State::Closed; - node->row_state.clear();}; - lambda[int(Level::Rank)][int(Command::PDE)] = [] (DRAM* node, int id) { - for (auto bg : node->children) - for (auto bank : bg->children) { - if (bank->state == State::Closed) - continue; - node->state = State::ActPowerDown; - return; - } - node->state = State::PrePowerDown;}; - lambda[int(Level::Rank)][int(Command::PDX)] = [] (DRAM* node, int id) { - node->state = State::PowerUp;}; - lambda[int(Level::Rank)][int(Command::SRE)] = [] (DRAM* node, int id) { - node->state = State::SelfRefresh;}; - lambda[int(Level::Rank)][int(Command::SRX)] = [] (DRAM* node, int id) { - node->state = State::PowerUp;}; -} - - -void HBM::init_timing() -{ - SpeedEntry& s = speed_entry; - vector *t; - - /*** Channel 
***/ - t = timing[int(Level::Channel)]; - - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nBL}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL}); - t[int(Command::WR)].push_back({Command::WR, 1, s.nBL}); - t[int(Command::WR)].push_back({Command::WRA, 1, s.nBL}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nBL}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nBL}); - - // PIM_WR - t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nBL}); - t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nBL}); - t[int(Command::PIM_WR)].push_back({Command::PIM_WR, 1, s.nBL}); - - /*** Rank ***/ - t = timing[int(Level::Rank)]; - - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDS}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDS}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDS}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDS}); - - t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDS}); - t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nCCDS}); - t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDS}); - t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDS}); - - t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDS}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDS}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDS}); - t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nCCDS + 2 - s.nCWL}); - t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nCCDS + 2 - s.nCWL}); - t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nCCDS + 2 - s.nCWL}); - t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nCCDS + 2 - s.nCWL}); - - t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS}); - - 
t[int(Command::PIM_WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::PIM_WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS}); - - t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS}); - t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS}); - - t[int(Command::RD)].push_back({Command::PREA, 1, s.nRTP}); - t[int(Command::WR)].push_back({Command::PREA, 1, s.nCWL + s.nBL + s.nWR}); - - // CAS <-> PD - t[int(Command::RD)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1}); - t[int(Command::RDA)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1}); - t[int(Command::WR)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR}); - t[int(Command::WRA)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR + 1}); // +1 for pre - t[int(Command::PDX)].push_back({Command::RD, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::RDA, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::WR, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::WRA, 1, s.nXP}); - - // CAS <-> SR: none (all banks have to be precharged) - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDS}); - t[int(Command::ACT)].push_back({Command::ACT, 4, s.nFAW}); - t[int(Command::ACT)].push_back({Command::PREA, 1, s.nRAS}); - t[int(Command::PREA)].push_back({Command::ACT, 1, s.nRP}); - - // RAS <-> REF - t[int(Command::PRE)].push_back({Command::REF, 1, s.nRP}); - t[int(Command::PREA)].push_back({Command::REF, 1, s.nRP}); - t[int(Command::REF)].push_back({Command::ACT, 1, s.nRFC}); - - // RAS <-> PD - t[int(Command::ACT)].push_back({Command::PDE, 1, 1}); - t[int(Command::PDX)].push_back({Command::ACT, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::PRE, 1, s.nXP}); - t[int(Command::PDX)].push_back({Command::PREA, 1, s.nXP}); - - // RAS <-> SR - t[int(Command::PRE)].push_back({Command::SRE, 1, s.nRP}); - t[int(Command::PREA)].push_back({Command::SRE, 1, s.nRP}); - t[int(Command::SRX)].push_back({Command::ACT, 1, 
s.nXS}); - - // REF <-> REF - t[int(Command::REF)].push_back({Command::REF, 1, s.nRFC}); - - // REF <-> PD - t[int(Command::REF)].push_back({Command::PDE, 1, 1}); - t[int(Command::PDX)].push_back({Command::REF, 1, s.nXP}); - - // REF <-> SR - t[int(Command::SRX)].push_back({Command::REF, 1, s.nXS}); - - // PD <-> PD - t[int(Command::PDE)].push_back({Command::PDX, 1, s.nPD}); - t[int(Command::PDX)].push_back({Command::PDE, 1, s.nXP}); - - // PD <-> SR - t[int(Command::PDX)].push_back({Command::SRE, 1, s.nXP}); - t[int(Command::SRX)].push_back({Command::PDE, 1, s.nXS}); - - // SR <-> SR - t[int(Command::SRE)].push_back({Command::SRX, 1, s.nCKESR}); - t[int(Command::SRX)].push_back({Command::SRE, 1, s.nXS}); - - /*** Bank Group ***/ - t = timing[int(Level::BankGroup)]; - // CAS <-> CAS - t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDL}); - t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDL}); - t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDL}); - t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDL}); - t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::PIM_WR)].push_back({Command::WRA, 1, s.nCCDL}); - - t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nCCDL}); - - t[int(Command::PIM_WR)].push_back({Command::PIM_WR, 1, s.nCCDL}); - - t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL}); - t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL}); - t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL}); - t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL}); - - t[int(Command::PIM_WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::PIM_WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL}); - - 
t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL}); - t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL}); - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDL}); - - /*** Bank ***/ - t = timing[int(Level::Bank)]; - - // CAS <-> RAS - t[int(Command::ACT)].push_back({Command::RD, 1, s.nRCDR}); - t[int(Command::ACT)].push_back({Command::RDA, 1, s.nRCDR}); - t[int(Command::ACT)].push_back({Command::WR, 1, s.nRCDW}); - t[int(Command::ACT)].push_back({Command::WRA, 1, s.nRCDW}); - - t[int(Command::RD)].push_back({Command::PRE, 1, s.nRTP}); - t[int(Command::WR)].push_back({Command::PRE, 1, s.nCWL + s.nBL + s.nWR}); - - t[int(Command::RDA)].push_back({Command::ACT, 1, s.nRTP + s.nRP}); - t[int(Command::WRA)].push_back({Command::ACT, 1, s.nCWL + s.nBL + s.nWR + s.nRP}); - - // RAS <-> RAS - t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRC}); - t[int(Command::ACT)].push_back({Command::PRE, 1, s.nRAS}); - t[int(Command::PRE)].push_back({Command::ACT, 1, s.nRP}); - - // REFSB - t[int(Command::PRE)].push_back({Command::REFSB, 1, s.nRP}); - t[int(Command::REFSB)].push_back({Command::REFSB, 1, s.nRFC}); - t[int(Command::REFSB)].push_back({Command::ACT, 1, s.nRFC}); -} diff --git a/TOGSim/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h deleted file mode 100644 index b52f0500..00000000 --- a/TOGSim/extern/ramulator_custom/src/HBM.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef __HBM_H -#define __HBM_H - -#include -#include -#include -#include - -#include "Request.h" - -using namespace std; - -namespace ram -{ -template -class DRAM; - -class HBM -{ -public: - static string standard_name; - enum class Org; - enum class Speed; - HBM(Org org, Speed speed); - HBM(const string& org_str, const string& speed_str); - - static map org_map; - static map speed_map; - - /* Level */ - enum class Level : int - { - Channel, Rank, BankGroup, Bank, Row, Column, MAX - }; - - static std::string 
level_str [int(Level::MAX)]; - - /* Command */ - enum class Command : int - { - ACT, PRE, PREA, - RD, WR, PIM_WR, RDA, WRA, - REF, REFSB, PDE, PDX, SRE, SRX, - MAX - }; - - // REFSB and REF is not compatible, choose one or the other. - // REFSB can be issued to banks in any order, as long as REFI1B - // is satisfied for all banks - - string command_name[int(Command::MAX)] = { - "ACT", "PRE", "PREA", - "RD", "WR", "PIM_WR", "RDA", "WRA", - "REF", "REFSB", "PDE", "PDX", "SRE", "SRX" - }; - - Level scope[int(Command::MAX)] = { - Level::Row, Level::Bank, Level::Rank, - Level::Column, Level::Column, Level::Column, Level::Column, Level::Column, - Level::Rank, Level::Bank, Level::Rank, Level::Rank, Level::Rank, Level::Rank - }; - - bool is_opening(Command cmd) - { - switch(int(cmd)) { - case int(Command::ACT): - return true; - default: - return false; - } - } - - bool is_accessing(Command cmd) - { - switch(int(cmd)) { - case int(Command::RD): - case int(Command::WR): - case int(Command::RDA): - case int(Command::WRA): - return true; - default: - return false; - } - } - - bool is_closing(Command cmd) - { - switch(int(cmd)) { - case int(Command::RDA): - case int(Command::WRA): - case int(Command::PRE): - case int(Command::PREA): - return true; - default: - return false; - } - } - - bool is_refreshing(Command cmd) - { - switch(int(cmd)) { - case int(Command::REF): - case int(Command::REFSB): - return true; - default: - return false; - } - } - - /* State */ - enum class State : int - { - Opened, Closed, PowerUp, ActPowerDown, PrePowerDown, SelfRefresh, MAX - } start[int(Level::MAX)] = { - State::MAX, State::PowerUp, State::MAX, State::Closed, State::Closed, State::MAX - }; - - /* Translate */ - Command translate[int(Request::Type::MAX)] = { - Command::RD, Command::WR, Command::PIM_WR, - Command::REF, Command::PDE, Command::SRE - }; - - /* Prereq */ - function*, Command cmd, int)> prereq[int(Level::MAX)][int(Command::MAX)]; - - // SAUGATA: added function object container for 
row hit status - /* Row hit */ - function*, Command cmd, int)> rowhit[int(Level::MAX)][int(Command::MAX)]; - function*, Command cmd, int)> rowopen[int(Level::MAX)][int(Command::MAX)]; - - /* Timing */ - struct TimingEntry - { - Command cmd; - int dist; - int val; - bool sibling; - }; - vector timing[int(Level::MAX)][int(Command::MAX)]; - - /* Lambda */ - function*, int)> lambda[int(Level::MAX)][int(Command::MAX)]; - - /* Organization */ - enum class Org : int - { // per channel density here. Each stack comes with 8 channels - HBM_1Gb, - HBM_2Gb, - HBM_4Gb, - MAX - }; - - struct OrgEntry { - int size; - int dq; - int count[int(Level::MAX)]; - } org_table[int(Org::MAX)] = { - {1<<10, 128, {0, 0, 4, 2, 1<<20, 1<<(6+1)}}, - {2<<10, 128, {0, 0, 4, 2, 1<<20, 1<<(6+1)}}, - {4<<10, 128, {0, 0, 4, 4, 1<<20, 1<<(6+1)}}, - }, org_entry; - - void set_channel_number(int channel); - void set_rank_number(int rank); - - /* Speed */ - enum class Speed : int - { - HBM_1Gbps, - HBM_2Gbps, - MAX - }; - - int prefetch_size = 2; // burst length could be 2 and 4 (choose 4 here), 2n prefetch - int channel_width = 128; - - struct SpeedEntry { - int rate; - double freq, tCK; - int nBL, nCCDS, nCCDL; - int nCL, nRCDR, nRCDW, nRP, nCWL; - int nRAS, nRC; - int nRTP, nWTRS, nWTRL, nWR; - int nRRDS, nRRDL, nFAW; - int nRFC, nREFI, nREFI1B; - int nPD, nXP; - int nCKESR, nXS; - } speed_table[int(Speed::MAX)] = { - {1000, // rate - 500, 2.0, // freq, tCK - // FIX: Why is nBL set to 2 instead of 1? - // FIX: It seems that this is because a single request corresponds to 64B, - // ,which means that `prefetch_size = 4`. 
- 1, 1, 2, // nBL, nCCDS, nCCDL - 7, 7, 6, 7, 4, // nCL, nRCDR, nRCDW, nRP, nCWL - 17, 24, // nRAS, nRC - 7, 2, 4, 8, // nRTP, nWTRS, nWTRL, nWR - 4, 5, 20, // nRRDS, nRRDL, nFAW - 0, 1950, 0, // nRFC, nREFI, nREFI1B - 5, 5, // nPD, nXP - 5, 0 }, // nCKESR, nXS - {2000, - 1000, 1.0, - 1, 1, 2, - 14, 14, 12, 14, 8, - 34, 48, - 14, 4, 8, 16, - 8, 10, 40, - 0, 3900, 0, - 10, 10, - 10, 0}, - }, speed_entry; - - int read_latency; - -private: - void init_speed(); - void init_lambda(); - void init_prereq(); - void init_rowhit(); // SAUGATA: added function to check for row hits - void init_rowopen(); - void init_timing(); -}; - -} /*namespace ram*/ - -#endif /*__HBM_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h deleted file mode 100644 index 45f7cc8b..00000000 --- a/TOGSim/extern/ramulator_custom/src/Memory.h +++ /dev/null @@ -1,684 +0,0 @@ -#ifndef __RAM_MEMORY_H -#define __RAM_MEMORY_H - -#include "DRAM.h" -#include "Request.h" -#include "Controller.h" -//#include "SpeedyController.h" -#include "Statistics.h" -// #include "GDDR5.h" -#include "HBM.h" -#include "Config.h" -// #include "LPDDR3.h" -// #include "LPDDR4.h" -// #include "WideIO2.h" -// #include "DSARP.h" -#include -#include -#include -#include -#include - -using namespace std; - -typedef vector MapSrcVector; -typedef map MapSchemeEntry; -typedef map MapScheme; - -namespace ram -{ -class MemoryBase{ -public: - MemoryBase() {} - virtual ~MemoryBase() {} - virtual double clk_ns() = 0; - virtual void tick() = 0; - virtual bool send(Request req) = 0; - virtual int pending_requests() = 0; - virtual void finish(void) = 0; - virtual long page_allocator(long addr, int coreid) = 0; - virtual void record_core(int coreid) = 0; - virtual void set_high_writeq_watermark(const float watermark) = 0; - virtual void set_low_writeq_watermark(const float watermark) = 0; - virtual bool done() const = 0; - virtual int get_transaction_bytes() const = 0; - virtual int 
get_num_channels() const = 0; - virtual bool is_full(int ch, bool is_write) const = 0; - virtual std::vector decode_mem_addr(uint64_t addr) = 0; -}; - -template class Controller = Controller > -class Memory : public MemoryBase -{ -protected: - ScalarStat dram_capacity; - ScalarStat num_dram_cycles; - ScalarStat num_incoming_requests; - VectorStat num_read_requests; - VectorStat num_write_requests; - ScalarStat ramulator_active_cycles; - VectorStat incoming_requests_per_channel; - VectorStat incoming_read_reqs_per_channel; - - ScalarStat physical_page_replacement; - ScalarStat maximum_bandwidth; - ScalarStat in_queue_req_num_sum; - ScalarStat in_queue_read_req_num_sum; - ScalarStat in_queue_write_req_num_sum; - ScalarStat in_queue_req_num_avg; - ScalarStat in_queue_read_req_num_avg; - ScalarStat in_queue_write_req_num_avg; - - VectorStat record_read_requests; - VectorStat record_write_requests; - - long max_address; - MapScheme mapping_scheme; - -public: - enum class Type { - ChRaBaRoCo, - RoBaRaCoCh, - RoCoBaRaCh, - MAX, - // } type = Type::ChRaBaRoCo; - } type = Type::RoBaRaCoCh; - - enum class Translation { - None, - Random, - MAX, - } translation = Translation::None; - - std::map name_to_translation = { - {"None", Translation::None}, - {"Random", Translation::Random}, - }; - - vector free_physical_pages; - long free_physical_pages_remaining; - map, long> page_translation; - - vector*> ctrls; - T * spec; - vector addr_bits; - string mapping_file; - bool use_mapping_file; - bool dump_mapping; - - int tx_bits; - - Memory(RamulatorConfig& configs, vector*> ctrls) - : ctrls(ctrls), - spec(ctrls[0]->channel->spec), - addr_bits(int(T::Level::MAX)) - { - // make sure 2^N channels/ranks - // TODO support channel number that is not powers of 2 - int *sz = spec->org_entry.count; - assert((sz[0] & (sz[0] - 1)) == 0); - assert((sz[1] & (sz[1] - 1)) == 0); - // validate size of one transaction - int tx = (spec->prefetch_size * spec->channel_width / 8); - tx_bits = 
calc_log2(tx); - assert((1<standard_name.substr(0, 4) == "DDR3"){ - if (configs["mapping"] != "defaultmapping"){ - init_mapping_with_file(configs["mapping"]); - // dump_mapping = true; - use_mapping_file = true; - } - } - // If hi address bits will not be assigned to Rows - // then the chips must not be LPDDRx 6Gb, 12Gb etc. - - if(configs["mapping"] == "RoBaRaCoCh") { - type = Type::RoBaRaCoCh; - } - else if(configs["mapping"] == "RoCoBaRaCh") { - type = Type::RoCoBaRaCh; - } - else if(configs["mapping"] == "ChRaBaRoCo") { - type = Type::ChRaBaRoCo; - } - else { - use_mapping_file = true; - init_mapping_with_file(configs["mapping"]); - } - - if (type != Type::RoBaRaCoCh && spec->standard_name.substr(0, 5) == "LPDDR") - assert((sz[int(T::Level::Row)] & (sz[int(T::Level::Row)] - 1)) == 0); - - max_address = spec->channel_width / 8; - - for (unsigned int lev = 0; lev < addr_bits.size(); lev++) { - addr_bits[lev] = calc_log2(sz[lev]); - max_address *= sz[lev]; - } - - addr_bits[int(T::Level::MAX) - 1] -= calc_log2(spec->prefetch_size); - - // Initiating translation - if (configs.contains("translation")) { - translation = name_to_translation[configs["translation"]]; - } - if (translation != Translation::None) { - // construct a list of available pages - // TODO: this should not assume a 4KB page! 
- free_physical_pages_remaining = max_address >> 12; - - free_physical_pages.resize(free_physical_pages_remaining, -1); - } - - dram_capacity - .name("dram_capacity") - .desc("Number of bytes in simulated DRAM") - .precision(0) - ; - dram_capacity = max_address; - - num_dram_cycles - .name("dram_cycles") - .desc("Number of DRAM cycles simulated") - .precision(0) - ; - num_incoming_requests - .name("incoming_requests") - .desc("Number of incoming requests to DRAM") - .precision(0) - ; - num_read_requests - .init(configs.get_core_num()) - .name("read_requests") - .desc("Number of incoming read requests to DRAM per core") - .precision(0) - ; - num_write_requests - .init(configs.get_core_num()) - .name("write_requests") - .desc("Number of incoming write requests to DRAM per core") - .precision(0) - ; - incoming_requests_per_channel - .init(sz[int(T::Level::Channel)]) - .name("incoming_requests_per_channel") - .desc("Number of incoming requests to each DRAM channel") - ; - incoming_read_reqs_per_channel - .init(sz[int(T::Level::Channel)]) - .name("incoming_read_reqs_per_channel") - .desc("Number of incoming read requests to each DRAM channel") - ; - - ramulator_active_cycles - .name("ramulator_active_cycles") - .desc("The total number of cycles that the DRAM part is active (serving R/W)") - .precision(0) - ; - physical_page_replacement - .name("physical_page_replacement") - .desc("The number of times that physical page replacement happens.") - .precision(0) - ; - maximum_bandwidth - .name("maximum_bandwidth") - .desc("The theoretical maximum bandwidth (Bps)") - .precision(0) - ; - in_queue_req_num_sum - .name("in_queue_req_num_sum") - .desc("Sum of read/write queue length") - .precision(0) - ; - in_queue_read_req_num_sum - .name("in_queue_read_req_num_sum") - .desc("Sum of read queue length") - .precision(0) - ; - in_queue_write_req_num_sum - .name("in_queue_write_req_num_sum") - .desc("Sum of write queue length") - .precision(0) - ; - in_queue_req_num_avg - 
.name("in_queue_req_num_avg") - .desc("Average of read/write queue length per memory cycle") - .precision(6) - ; - in_queue_read_req_num_avg - .name("in_queue_read_req_num_avg") - .desc("Average of read queue length per memory cycle") - .precision(6) - ; - in_queue_write_req_num_avg - .name("in_queue_write_req_num_avg") - .desc("Average of write queue length per memory cycle") - .precision(6) - ; - record_read_requests - .init(configs.get_core_num()) - .name("record_read_requests") - .desc("record read requests for this core when it reaches request limit or to the end") - ; - - record_write_requests - .init(configs.get_core_num()) - .name("record_write_requests") - .desc("record write requests for this core when it reaches request limit or to the end") - ; - - } - - ~Memory() - { - for (auto ctrl: ctrls) - delete ctrl; - delete spec; - } - - double clk_ns() - { - return spec->speed_entry.tCK; - } - - void record_core(int coreid) { - record_read_requests[coreid] = num_read_requests[coreid]; - record_write_requests[coreid] = num_write_requests[coreid]; - for (auto ctrl : ctrls) { - ctrl->record_core(coreid); - } - } - - void tick() - { - ++num_dram_cycles; - int cur_que_req_num = 0; - int cur_que_readreq_num = 0; - int cur_que_writereq_num = 0; - for (auto ctrl : ctrls) { - cur_que_req_num += ctrl->readq.size() + ctrl->writeq.size() + ctrl->pending.size(); - cur_que_readreq_num += ctrl->readq.size() + ctrl->pending.size(); - cur_que_writereq_num += ctrl->writeq.size(); - } - in_queue_req_num_sum += cur_que_req_num; - in_queue_read_req_num_sum += cur_que_readreq_num; - in_queue_write_req_num_sum += cur_que_writereq_num; - - bool is_active = false; - for (auto ctrl : ctrls) { - is_active = is_active || ctrl->is_active(); - ctrl->tick(); - } - if (is_active) { - ramulator_active_cycles++; - } - } - - bool is_full(int ch, bool is_write) const { - return ctrls[ch]->is_full(is_write); - } - - int get_num_channels() const { - return ctrls.size(); - } - - int 
get_transaction_bytes() const { - return (spec->prefetch_size * (spec->channel_width / 8)); - } - - std::vector decode_mem_addr(uint64_t target_addr) { - std::vector addr_vec(addr_bits.size(), 0); - uint64_t addr = target_addr; - // Each transaction size is 2^tx_bits, so first clear the lowest tx_bits bits - clear_lower_bits(addr, tx_bits); - if (use_mapping_file){ - apply_mapping(addr, addr_vec); - } - else { - switch(int(type)){ - case int(Type::ChRaBaRoCo): - for (int i = addr_bits.size() - 1; i >= 0; i--) - addr_vec[i] = slice_lower_bits(addr, addr_bits[i]); - break; - case int(Type::RoBaRaCoCh): - addr_vec[0] = slice_lower_bits(addr, addr_bits[0]); - addr_vec[addr_bits.size() - 1] = - slice_lower_bits(addr, addr_bits[addr_bits.size() - 1]); - for (int i = 1; i <= int(T::Level::Row); i++) - addr_vec[i] = slice_lower_bits(addr, addr_bits[i]); - break; - case int(Type::RoCoBaRaCh): - for (int i = 0; i <= int(T::Level::Bank); ++i) { - addr_vec[i] = slice_lower_bits(addr, addr_bits[i]); - } - addr_vec[int(T::Level::Column)] = - slice_lower_bits(addr, addr_bits[int(T::Level::Column)]); - addr_vec[int(T::Level::Row)] = - slice_lower_bits(addr, addr_bits[int(T::Level::Row)]); - break; - default: - assert(false); - } - } - return addr_vec; - } - - bool send(Request req) - { - // req.addr_vec.resize(addr_bits.size()); - // long addr = req.addr; - // int coreid = req.coreid; - // - // // Each transaction size is 2^tx_bits, so first clear the lowest tx_bits bits - // clear_lower_bits(addr, tx_bits); - // - // if (use_mapping_file){ - // apply_mapping(addr, req.addr_vec); - // } - // else { - // switch(int(type)){ - // case int(Type::ChRaBaRoCo): - // for (int i = addr_bits.size() - 1; i >= 0; i--) - // req.addr_vec[i] = slice_lower_bits(addr, addr_bits[i]); - // break; - // case int(Type::RoBaRaCoCh): - // req.addr_vec[0] = slice_lower_bits(addr, addr_bits[0]); - // req.addr_vec[addr_bits.size() - 1] = slice_lower_bits(addr, addr_bits[addr_bits.size() - 1]); - // for (int 
i = 1; i <= int(T::Level::Row); i++) - // req.addr_vec[i] = slice_lower_bits(addr, addr_bits[i]); - // break; - // default: - // assert(false); - // } - // } - - if(ctrls[req.getChannelID()]->enqueue(req)) { - // tally stats here to avoid double counting for requests that aren't enqueued - ++num_incoming_requests; - if (req.type == Request::Type::READ) { - ++num_read_requests[req.coreid]; - ++incoming_read_reqs_per_channel[req.addr_vec[int(T::Level::Channel)]]; - } - if (req.type == Request::Type::WRITE) { - ++num_write_requests[req.coreid]; - } - ++incoming_requests_per_channel[req.addr_vec[int(T::Level::Channel)]]; - return true; - } - - return false; - } - - void init_mapping_with_file(string filename){ - ifstream file(filename); - assert(file.good() && "Bad mapping file"); - // possible line types are: - // 0. Empty line - // 1. Direct bit assignment : component N = x - // 2. Direct range assignment : component N:M = x:y - // 3. XOR bit assignment : component N = x y z ... - // 4. Comment line : # comment here - string line; - char delim[] = " \t"; - while (getline(file, line)) { - short capture_flags = 0; - int level = -1; - int target_bit = -1, target_bit2 = -1; - int source_bit = -1, source_bit2 = -1; - // cout << "Processing: " << line << endl; - bool is_range = false; - while (true) { // process next word - size_t start = line.find_first_not_of(delim); - if (start == string::npos) // no more words - break; - size_t end = line.find_first_of(delim, start); - string word = line.substr(start, end - start); - - if (word.at(0) == '#')// starting a comment - break; - - size_t col_index; - int source_min, target_min, target_max; - switch (capture_flags){ - case 0: // capturing the component name - // fetch component level from channel spec - for (int i = 0; i < int(T::Level::MAX); i++) - if (word.find(T::level_str[i]) != string::npos) { - level = i; - capture_flags ++; - } - break; - - case 1: // capturing target bit(s) - col_index = word.find(":"); - if ( 
col_index != string::npos ){ - target_bit2 = stoi(word.substr(col_index+1)); - word = word.substr(0,col_index); - is_range = true; - } - target_bit = stoi(word); - capture_flags ++; - break; - - case 2: //this should be the delimiter - assert(word.find("=") != string::npos); - capture_flags ++; - break; - - case 3: - if (is_range){ - col_index = word.find(":"); - source_bit = stoi(word.substr(0,col_index)); - source_bit2 = stoi(word.substr(col_index+1)); - assert(source_bit2 - source_bit == target_bit2 - target_bit); - source_min = min(source_bit, source_bit2); - target_min = min(target_bit, target_bit2); - target_max = max(target_bit, target_bit2); - while (target_min <= target_max){ - mapping_scheme[level][target_min].push_back(source_min); - // cout << target_min << " <- " << source_min << endl; - source_min ++; - target_min ++; - } - } - else { - source_bit = stoi(word); - mapping_scheme[level][target_bit].push_back(source_bit); - } - } - if (end == string::npos) { // this is the last word - break; - } - line = line.substr(end); - } - } - if (dump_mapping) - dump_mapping_scheme(); - } - - void dump_mapping_scheme(){ - cout << "Mapping Scheme: " << endl; - for (MapScheme::iterator mapit = mapping_scheme.begin(); mapit != mapping_scheme.end(); mapit++) - { - int level = mapit->first; - for (MapSchemeEntry::iterator entit = mapit->second.begin(); entit != mapit->second.end(); entit++){ - cout << T::level_str[level] << "[" << entit->first << "] := "; - cout << "PhysicalAddress[" << *(entit->second.begin()) << "]"; - entit->second.erase(entit->second.begin()); - for (MapSrcVector::iterator it = entit->second.begin() ; it != entit->second.end(); it ++) - cout << " xor PhysicalAddress[" << *it << "]"; - cout << endl; - } - } - } - - void apply_mapping(long addr, std::vector& addr_vec){ - int *sz = spec->org_entry.count; - int addr_total_bits = sizeof(addr_vec)*8; - int addr_bits [int(T::Level::MAX)]; - for (int i = 0 ; i < int(T::Level::MAX) ; i ++) - { - if ( i != 
int(T::Level::Row)) - { - addr_bits[i] = calc_log2(sz[i]); - addr_total_bits -= addr_bits[i]; - } - } - // Row address is an integer. - addr_bits[int(T::Level::Row)] = min((int)sizeof(int)*8, max(addr_total_bits, calc_log2(sz[int(T::Level::Row)]))); - - // printf("Address: %lx => ",addr); - for (unsigned int lvl = 0; lvl < int(T::Level::MAX); lvl++) - { - unsigned int lvl_bits = addr_bits[lvl]; - addr_vec[lvl] = 0; - for (unsigned int bitindex = 0 ; bitindex < lvl_bits ; bitindex++){ - bool bitvalue = false; - for (MapSrcVector::iterator it = mapping_scheme[lvl][bitindex].begin() ; - it != mapping_scheme[lvl][bitindex].end(); it ++) - { - bitvalue = bitvalue xor get_bit_at(addr, *it); - } - addr_vec[lvl] |= (bitvalue << bitindex); - } - // printf("%s: %x, ",T::level_str[lvl].c_str(),addr_vec[lvl]); - } - // printf("\n"); - } - - int pending_requests() - { - int reqs = 0; - for (auto ctrl: ctrls) - reqs += ctrl->readq.size() + ctrl->writeq.size() + ctrl->otherq.size() + ctrl->actq.size() + ctrl->pending.size(); - return reqs; - } - - void set_high_writeq_watermark(const float watermark) { - for (auto ctrl: ctrls) - ctrl->set_high_writeq_watermark(watermark); - } - - void set_low_writeq_watermark(const float watermark) { - for (auto ctrl: ctrls) - ctrl->set_low_writeq_watermark(watermark); - } - - void finish(void) { - dram_capacity = max_address; - int *sz = spec->org_entry.count; - maximum_bandwidth = spec->speed_entry.rate * 1e6 * spec->channel_width * sz[int(T::Level::Channel)] / 8; - long dram_cycles = num_dram_cycles.value(); - for (auto ctrl : ctrls) { - long read_req = long(incoming_read_reqs_per_channel[ctrl->channel->id].value()); - ctrl->finish(read_req, dram_cycles); - } - - // finalize average queueing requests - in_queue_req_num_avg = in_queue_req_num_sum.value() / dram_cycles; - in_queue_read_req_num_avg = in_queue_read_req_num_sum.value() / dram_cycles; - in_queue_write_req_num_avg = in_queue_write_req_num_sum.value() / dram_cycles; - } - - bool 
done() const { - return std::all_of( - std::begin(ctrls), - std::end(ctrls), - [](const auto &ctrl) { - return ctrl->done(); - }); - } - - long page_allocator(long addr, int coreid) { - long virtual_page_number = addr >> 12; - - switch(int(translation)) { - case int(Translation::None): { - return addr; - } - case int(Translation::Random): { - auto target = make_pair(coreid, virtual_page_number); - if(page_translation.find(target) == page_translation.end()) { - // page doesn't exist, so assign a new page - // make sure there are physical pages left to be assigned - - // if physical page doesn't remain, replace a previous assigned - // physical page. - if (!free_physical_pages_remaining) { - physical_page_replacement++; - long phys_page_to_read = lrand() % free_physical_pages.size(); - assert(free_physical_pages[phys_page_to_read] != -1); - page_translation[target] = phys_page_to_read; - } else { - // assign a new page - long phys_page_to_read = lrand() % free_physical_pages.size(); - // if the randomly-selected page was already assigned - if(free_physical_pages[phys_page_to_read] != -1) { - long starting_page_of_search = phys_page_to_read; - - do { - // iterate through the list until we find a free page - // TODO: does this introduce serious non-randomness? 
- ++phys_page_to_read; - phys_page_to_read %= free_physical_pages.size(); - } - while((phys_page_to_read != starting_page_of_search) && free_physical_pages[phys_page_to_read] != -1); - } - - assert(free_physical_pages[phys_page_to_read] == -1); - - page_translation[target] = phys_page_to_read; - free_physical_pages[phys_page_to_read] = coreid; - --free_physical_pages_remaining; - } - } - - // SAUGATA TODO: page size should not always be fixed to 4KB - return (page_translation[target] << 12) | (addr & ((1 << 12) - 1)); - } - default: { - assert(false); - return -1; - } - } - - } - -private: - - int calc_log2(int val){ - int n = 0; - while ((val >>= 1)) - n ++; - return n; - } - int slice_lower_bits(uint64_t & addr, int bits) - { - int lbits = addr & ((1<>= bits; - return lbits; - } - bool get_bit_at(uint64_t addr, int bit) - { - return (((addr >> bit) & 1) == 1); - } - void clear_lower_bits(uint64_t & addr, int bits) - { - addr >>= bits; - } - long lrand(void) { - if(sizeof(int) < sizeof(long)) { - return static_cast(rand()) << (sizeof(int) * 8) | rand(); - } - - return rand(); - } -}; - -} /*namespace ram*/ - -#endif /*__MEMORY_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp deleted file mode 100644 index 9a15f3d1..00000000 --- a/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "MemoryFactory.h" -// #include "LPDDR4.h" -// #include "WideIO.h" -// #include "WideIO2.h" -#include "HBM.h" -//#include "SALP.h" - -using namespace ram; - -namespace ram -{ -// -// template <> -// void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { -// assert(channels >= 2 && "LPDDR4 requires 2, 4, 8 ... 
channels"); -// } -// -// template <> -// void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { -// assert(channels == 4 && "WideIO comes with 4 channels"); -// } -// -// template <> -// void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { -// assert((channels == 4 || channels == 8) && "WideIO2 comes with 4 or 8 channels"); -// assert((ranks == 1 || ranks == 2) && "WideIO2 comes with 1 or 2 ranks"); -// } - -template <> -void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { - assert(channels == 8 && "HBM comes with 8 channels"); -} - -// template <> -// MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { -// int channels = stoi(configs["channels"], NULL, 0); -// int ranks = stoi(configs["ranks"], NULL, 0); -// validate(channels, ranks, configs); -// -// const string& org_name = configs["org"]; -// const string& speed_name = configs["speed"]; -// -// WideIO2 *spec = new WideIO2(org_name, speed_name, channels); -// -// extend_channel_width(spec, cacheline); -// -// return (MemoryBase *)populate_memory(configs, spec, channels, ranks); -// } -// -// -// template <> -// MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { -// int channels = stoi(configs["channels"], NULL, 0); -// int ranks = stoi(configs["ranks"], NULL, 0); -// int subarrays = stoi(configs["subarrays"], NULL, 0); -// validate(channels, ranks, configs); -// -// const string& std_name = configs["standard"]; -// const string& org_name = configs["org"]; -// const string& speed_name = configs["speed"]; -// -// SALP *spec = new SALP(org_name, speed_name, std_name, subarrays); -// -// extend_channel_width(spec, cacheline); -// -// return (MemoryBase *)populate_memory(configs, spec, channels, ranks); -// } - -} - -// This function can be used by autoconf AC_CHECK_LIB since -// apparently it can't detect C++ functions. 
-// Basically just an entry in the symbol table -// extern "C" -// { -// void libramulator_is_present(void) -// { -// ; -// } -// } diff --git a/TOGSim/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h deleted file mode 100644 index be10213b..00000000 --- a/TOGSim/extern/ramulator_custom/src/MemoryFactory.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef __MEMORY_FACTORY_H -#define __MEMORY_FACTORY_H - -#include -#include -#include -#include - -#include "Memory.h" -#include "DRAM.h" -#include "Controller.h" -#include "Config.h" - -using namespace std; - -namespace ram -{ -template -class MemoryFactory { -public: - static void extend_channel_width(T* spec, int cacheline) - { - int channel_unit = spec->prefetch_size * spec->channel_width / 8; - int gang_number = cacheline / channel_unit; - - assert(gang_number >= 1 && - "cacheline size must be greater or equal to minimum channel width"); - - assert(cacheline == gang_number * channel_unit && - "cacheline size must be a multiple of minimum channel width"); - - spec->channel_width *= gang_number; - } - - static std::unique_ptr> populate_memory(RamulatorConfig& configs, - T *spec, - int channels, int ranks) { - int& default_ranks = spec->org_entry.count[int(T::Level::Rank)]; - int& default_channels = spec->org_entry.count[int(T::Level::Channel)]; - - if (default_channels == 0) default_channels = channels; - if (default_ranks == 0) default_ranks = ranks; - - vector *> ctrls; - for (int c = 0; c < channels; c++){ - DRAM* channel = new DRAM(spec, T::Level::Channel); - channel->id = c; - channel->regStats(""); - ctrls.push_back(new Controller(configs, channel)); - } - return std::make_unique>(configs, ctrls); - } - - static void validate(int channels, int ranks, RamulatorConfig& configs) { - assert(channels > 0 && ranks > 0); - } - - static std::unique_ptr create(RamulatorConfig& configs, - int cacheline) { - int channels = stoi(configs["channels"], NULL, 0); - int ranks = 
stoi(configs["ranks"], NULL, 0); - - validate(channels, ranks, configs); - - const string& org_name = configs["org"]; - const string& speed_name = configs["speed"]; - - T *spec = new T(org_name, speed_name); - - // Set channel width statically in the header file - //extend_channel_width(spec, cacheline); - - return populate_memory(configs, spec, channels, ranks); - } -}; - -// template <> -// MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); -// template <> -// MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); - -} /*namespace ram*/ - -#endif /*__MEMORY_FACTORY_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp deleted file mode 100644 index 6d37f8b1..00000000 --- a/TOGSim/extern/ramulator_custom/src/Ramulator.cpp +++ /dev/null @@ -1,171 +0,0 @@ -#include "Ramulator.hpp" -#include "Memory.h" -#include "MemoryFactory.h" -#include "DDR4.h" -#include "HBM.h" -#include "Request.h" - -namespace ram { - // TODO: init outputpendingqueue -Ramulator::Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim) - : MemBase(createMemory(ConfigFilePath, num_core)), is_pim(is_pim) { - for (int ch = 0; ch < MemBase->get_num_channels(); ++ch) { - OutputPendingQueues.push_back(OutputPendingQueue(64)); - } - Callbacks[false] = [&](const ram::Request& Req) { - int CtrlID = Req.getChannelID(); - // TODO: check pending queue reservation logic - OutputPendingQueues[CtrlID].push(Req.orignal_request); - }; - Callbacks[true] = [&](const ram::Request& Req) { - int CtrlID = Req.getChannelID(); - // // TODO: check pending queue reservation logic - OutputPendingQueues[CtrlID].push(Req.orignal_request); - }; - - if (is_pim) { - int hot_vid = -1; - int in_degrees = -1; - int total_vid = 0; - } - Stat::statlist.output("./ramulator.stats"); -} - -void Ramulator::tick() { - MemBase->tick(); -} - -bool Ramulator::isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) 
const { - std::vector MemAddr = MemBase->decode_mem_addr(Addr); - assert(CtrlID == MemAddr[0]); - return OutputPendingQueues[CtrlID].isAvailable(1) && !MemBase->is_full(CtrlID, IsWrite); -} - -bool Ramulator::isAvailable(uint64_t Addr, bool IsWrite) const { - // TODO: need to avoid decoding memory addr whenever `isAvailable` is called - std::vector MemAddr = MemBase->decode_mem_addr(Addr); - uint32_t CtrlID = MemAddr[0]; - - bool result = OutputPendingQueues[CtrlID].isAvailable(1) && !MemBase->is_full(CtrlID, IsWrite); - - return result; -} - - -void Ramulator::push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* orignal_req) { - std::vector MemAddr = MemBase->decode_mem_addr(Addr); - //Ensure CtrlID match with decoded address - assert(CtrlID == MemAddr[0]); - if (IsWrite) { - Request req(Request::Type::WRITE, Addr, MemAddr, Callbacks[IsWrite], orignal_req); - req.coreid = core_id; - bool isSent = MemBase->send(req); - assert(isSent); - } else { - Request req(Request::Type::READ, Addr, MemAddr, Callbacks[IsWrite], orignal_req); - req.coreid = core_id; - bool isSent = MemBase->send(req); - assert(isSent); - } - - OutputPendingQueues[CtrlID].reserve(); -} - -void Ramulator::push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req) { - std::vector MemAddr = MemBase->decode_mem_addr(Addr); - const int CtrlID = MemAddr[0]; - // TODO: vid check here - if (IsWrite) { - Request req(Request::Type::WRITE, Addr, MemAddr, Callbacks[IsWrite], original_req); - req.coreid = core_id; - bool isSent = MemBase->send(req); - assert(isSent); - } else { - Request req(Request::Type::READ, Addr, MemAddr, Callbacks[IsWrite], original_req); - req.coreid = core_id; - bool isSent = MemBase->send(req); - assert(isSent); - } - - OutputPendingQueues[CtrlID].reserve(); -} - -bool Ramulator::isEmpty(int CtrlID) const { - return OutputPendingQueues[CtrlID].isEmpty(); -} -const void* Ramulator::top(int CtrlID) const { - return OutputPendingQueues[CtrlID].top(); -} 
-void Ramulator::pop(int CtrlID) { - OutputPendingQueues[CtrlID].pop(); -} - -int Ramulator::getAtomicBytes() const { - return MemBase->get_transaction_bytes(); -} - -int Ramulator::getNumChannels() const { - return MemBase->get_num_channels(); -} - -int Ramulator::getChannel(uint64_t Addr) const { - std::vector MemAddr = MemBase->decode_mem_addr(Addr); - return MemAddr[0]; -} - -void Ramulator::print_stats() { - MemBase->finish(); - Stat::statlist.printall(); -} - -std::unique_ptr -Ramulator::createMemory(const std::string ConfigFilePath, uint32_t num_core) { - RamulatorConfig Config(ConfigFilePath); - Config.set_core_num(num_core); - std::string MemType = Config["standard"]; - if (MemType == "DDR4") { - return MemoryFactory::create(Config, 32); - } else if (MemType == "HBM") { - return MemoryFactory::create(Config, 32); - } else { - assert(false); - return nullptr; - } -} -Ramulator::OutputPendingQueue::OutputPendingQueue(int Size) - : Size(Size), - NumReserved(0) {} - -bool Ramulator::OutputPendingQueue::isAvailable() const { - return NumReserved + PendingQueue.size() < Size; -} - -bool Ramulator::OutputPendingQueue::isAvailable(uint32_t count) const { - return NumReserved + PendingQueue.size() + count - 1 < Size; -} - -void Ramulator::OutputPendingQueue::reserve() { - assert(NumReserved < Size); - NumReserved++; -} - -void Ramulator::OutputPendingQueue::push(void* Addr) { - PendingQueue.push(Addr); - assert(NumReserved > 0); - NumReserved--; -} - -bool Ramulator::OutputPendingQueue::isEmpty() const { - return PendingQueue.empty(); -} - -void Ramulator::OutputPendingQueue::pop() { - PendingQueue.pop(); -} -const void* Ramulator::OutputPendingQueue::top() const { - return PendingQueue.front(); -} - -Ramulator::~Ramulator() = default; - -} diff --git a/TOGSim/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp deleted file mode 100644 index 20281f64..00000000 --- a/TOGSim/extern/ramulator_custom/src/Refresh.cpp +++ /dev/null @@ 
-1,255 +0,0 @@ -/* - * Refresh.cpp - * - * Mainly DSARP specialization at the moment. - * - * Created on: Mar 17, 2015 - * Author: kevincha - */ - -#include - -#include "Refresh.h" -#include "Controller.h" -#include "DRAM.h" -// #include "DSARP.h" - -using namespace std; -using namespace ram; - -namespace ram { - -/**** DSARP specialization ****/ -// template<> -// Refresh::Refresh(Controller* ctrl) : ctrl(ctrl) { -// clk = refreshed = 0; -// max_rank_count = ctrl->channel->children.size(); -// max_bank_count = ctrl->channel->spec->org_entry.count[(int)DSARP::Level::Bank]; -// max_sa_count = ctrl->channel->spec->org_entry.count[(int)DSARP::Level::SubArray]; -// -// // Init refresh counters -// for (int r = 0; r < max_rank_count; r++) { -// bank_ref_counters.push_back(0); -// bank_refresh_backlog.push_back(new vector(max_bank_count, 0)); -// vector sa_counters(ctrl->channel->spec->org_entry.count[(int)DSARP::Level::SubArray], 0); -// subarray_ref_counters.push_back(sa_counters); -// } -// -// level_chan = (int)DSARP::Level::Channel; -// level_rank = (int)DSARP::Level::Rank; -// level_bank = (int)DSARP::Level::Bank; -// level_sa = (int)DSARP::Level::SubArray; -// } -// -// template<> -// void Refresh::early_inject_refresh() { -// // Only enabled during reads -// if (ctrl->write_mode) -// return; -// -// // OoO bank-level refresh -// vector is_bank_occupied(max_rank_count * max_bank_count, false); -// Controller::Queue& rdq = ctrl->readq; -// -// // Figure out which banks are idle in order to refresh one of them -// for (auto req: rdq.q) -// { -// assert(req.addr_vec[level_chan] == ctrl->channel->id); -// int ridx = req.addr_vec[level_rank] * max_bank_count; -// int bidx = req.addr_vec[level_bank]; -// is_bank_occupied[ridx+bidx] = true; -// } -// -// // Try to pick an idle bank to refresh per rank -// for (int r = 0; r < max_rank_count; r++) { -// // Randomly pick a bank to examine -// int bidx_start = rand() % max_bank_count; -// -// for (int b = 0; b < 
max_bank_count; b++) -// { -// int bidx = (bidx_start + b) % max_bank_count; -// // Idle cycle only -// if (is_bank_occupied[(r * max_bank_count) + bidx]) -// continue; -// -// // Pending refresh -// bool pending_ref = false; -// for (Request req : ctrl->otherq.q) -// if (req.type == Request::Type::REFRESH -// && req.addr_vec[level_chan] == ctrl->channel->id -// && req.addr_vec[level_rank] == r && req.addr_vec[level_bank] == bidx) -// pending_ref = true; -// if (pending_ref) -// continue; -// -// // Only pull in refreshes when we are almost running out of credits -// if ((*(bank_refresh_backlog[r]))[bidx] >= backlog_early_pull_threshold || -// ctrl->otherq.q.size() >= ctrl->otherq.max) -// continue; -// -// // Refresh now -// refresh_target(ctrl, r, bidx, subarray_ref_counters[r][bidx]); -// // One credit for delaying a future ref -// (*(bank_refresh_backlog[r]))[bidx]++; -// subarray_ref_counters[r][bidx] = (subarray_ref_counters[r][bidx]+1) % max_sa_count; -// break; -// } -// } -// } - -// template<> -// void Refresh::inject_refresh(bool b_ref_rank) { -// // Rank-level refresh -// if (b_ref_rank) -// for (auto rank : ctrl->channel->children) -// refresh_target(ctrl, rank->id, -1, -1); -// // Bank-level refresh. Simultaneously issue to all ranks (better performance than staggered refreshes). -// else { -// for (auto rank : ctrl->channel->children) { -// int rid = rank->id; -// int bid = bank_ref_counters[rid]; -// -// // Behind refresh schedule by 1 ref -// (*(bank_refresh_backlog[rid]))[bid]--; -// -// // Next time, refresh the next bank in the same bank -// bank_ref_counters[rid] = (bank_ref_counters[rid] + 1) % max_bank_count; -// -// // Check to see if we can skip a refresh -// if (ctrl->channel->spec->type == DSARP::Type::DARP || -// ctrl->channel->spec->type == DSARP::Type::DSARP) { -// -// bool ref_now = false; -// // 1. Any pending refrehes? 
-// bool pending_ref = false; -// for (Request req : ctrl->otherq.q) { -// if (req.type == Request::Type::REFRESH) { -// pending_ref = true; -// break; -// } -// } -// -// // 2. Track readq -// if (!pending_ref && ctrl->readq.size() == 0) -// ref_now = true; -// -// // 3. Track log status. If we are too behind the schedule, then we need to refresh now. -// if ((*(bank_refresh_backlog[rid]))[bid] <= backlog_min) -// ref_now = true; -// -// // Otherwise skip refresh -// if (!ref_now) -// continue; -// } -// -// refresh_target(ctrl, rid, bid, subarray_ref_counters[rid][bid]); -// // Get 1 ref credit -// (*(bank_refresh_backlog[rid]))[bid]++; -// // Next time, refresh the next sa in the same bank -// subarray_ref_counters[rid][bid] = (subarray_ref_counters[rid][bid]+1) % max_sa_count; -// } -// } -// refreshed = clk; -// } -// -// first = wrq.count; second = bank idx -typedef pair wrq_idx; -bool wrq_comp (wrq_idx l, wrq_idx r) -{ - return l.first < r.first; -} - -// WRP -// template<> -// void Refresh::wrp() { -// for (int ref_rid = 0; ref_rid < max_rank_count; ref_rid++) -// { -// // Pending refresh in the rank? -// bool pending_ref = false; -// for (Request req : ctrl->otherq.q) { -// if (req.type == Request::Type::REFRESH && req.addr_vec[level_rank] == ref_rid) { -// pending_ref = true; -// break; -// } -// } -// if (pending_ref) -// continue; -// -// // Find the bank with the lowest number of writes+reads -// vector sorted_bank_demand; -// for (int b = 0; b < max_bank_count; b++) -// sorted_bank_demand.push_back(wrq_idx(0,b)); -// // Filter out all the writes to this rank -// int total_wr = 0; -// for (auto req : ctrl->writeq.q) { -// if (req.addr_vec[level_rank] == ref_rid) { -// sorted_bank_demand[req.addr_vec[level_bank]].first++; -// total_wr++; -// } -// } -// // If there's no write, just skip. 
-// if (total_wr == 0) -// continue; -// -// // Add read -// for (auto req : ctrl->readq.q) -// if (req.addr_vec[level_rank] == ref_rid) -// sorted_bank_demand[req.addr_vec[level_bank]].first++; -// -// // Sort based on the entries -// std::sort(sorted_bank_demand.begin(), sorted_bank_demand.end(), wrq_comp); -// -// // Randomly select an idle bank to refresh -// int top_idle_idx = 0; -// for (int i = 0; i < max_bank_count; i++) { -// if (sorted_bank_demand[i].second != 0) { -// top_idle_idx = i; -// break; -// } -// } -// -// // Select a bank to ref -// int ref_bid_idx = (top_idle_idx == 0) ? 0 : rand() % top_idle_idx; -// int ref_bid = sorted_bank_demand[ref_bid_idx].second; -// -// // Make sure we don't exceed the credit -// if ((*(bank_refresh_backlog[ref_rid]))[ref_bid] < backlog_max -// && ctrl->otherq.q.size() < ctrl->otherq.max) { -// refresh_target(ctrl, ref_rid, ref_bid, subarray_ref_counters[ref_rid][ref_bid]); -// // Get 1 ref credit -// (*(bank_refresh_backlog[ref_rid]))[ref_bid]++; -// subarray_ref_counters[ref_rid][ref_bid] = (subarray_ref_counters[ref_rid][ref_bid]+1) % max_sa_count; -// } -// } -// } -// -// // OoO refresh of DSARP -// template<> -// void Refresh::tick_ref() { -// clk++; -// -// bool b_ref_rank = ctrl->channel->spec->b_ref_rank; -// int refresh_interval = -// (b_ref_rank) ? -// ctrl->channel->spec->speed_entry.nREFI : -// ctrl->channel->spec->speed_entry.nREFIpb; -// -// // DARP -// if (ctrl->channel->spec->type == DSARP::Type::DARP || -// ctrl->channel->spec->type == DSARP::Type::DSARP) { -// // Write-Refresh Parallelization. 
Issue refreshes when the controller enters writeback mode -// if (!ctrl_write_mode && ctrl->write_mode) -// wrp(); -// // Record write mode -// ctrl_write_mode = ctrl->write_mode; -// // Inject early to pull in some refreshes during read mode -// early_inject_refresh(); -// } -// -// // Time to schedule a refresh and also try to skip some refreshes -// if ((clk - refreshed) >= refresh_interval) -// inject_refresh(b_ref_rank); -// } -/**** End DSARP specialization ****/ - -} /* namespace ram */ diff --git a/TOGSim/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h deleted file mode 100644 index 36c08b55..00000000 --- a/TOGSim/extern/ramulator_custom/src/Refresh.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Refresh.h - * - * This is a refresh scheduler. A list of refresh policies implemented: - * - * 1. All-bank refresh - * 2. Per-bank refresh (only DSARP memory module has been completed to work with REFpb). - * The other modules (LPDDRx) have not been updated to pass a knob to turn on/off REFpb. - * 3. A re-implementation of DSARP from the refresh mechanisms proposed in Chang et al., - * "Improving DRAM Performance by Parallelizing Refreshes with Accesses", HPCA 2014. 
- * - * Created on: Mar 17, 2015 - * Author: kevincha - */ - -#ifndef __REFRESH_H_ -#define __REFRESH_H_ - -#include -#include -#include -#include - -#include "Request.h" -// #include "DSARP.h" -// #include "ALDRAM.h" - -using namespace std; - -namespace ram { - -template -class Controller; - -template -class Refresh { -public: - Controller* ctrl; - long clk, refreshed; - // Per-bank refresh counter to track the refresh progress for each rank - vector bank_ref_counters; - int max_rank_count, max_bank_count; - int level_chan, level_rank, level_bank, level_sa; - - // ctor - Refresh(Controller* ctrl) : ctrl(ctrl) { - clk = refreshed = 0; - max_rank_count = ctrl->channel->children.size(); - max_bank_count = ctrl->channel->spec->org_entry.count[(int)T::Level::Bank]; - - // Init refresh counters - for (int r = 0; r < max_rank_count; r++) { - bank_ref_counters.push_back(0); - bank_refresh_backlog.push_back(new vector(max_bank_count, 0)); - } - - level_chan = (int)T::Level::Channel; - level_rank = (int)T::Level::Rank; - level_bank = (int)T::Level::Bank; - level_sa = -1; // Most DRAM doesn't have subarray level - } - - // dtor - virtual ~Refresh() { - // Clean up backlog - for (unsigned int i = 0; i < bank_refresh_backlog.size(); i++) - delete bank_refresh_backlog[i]; - } - - // Basic refresh scheduling for all bank refresh that is applicable to all DRAM types - void tick_ref() { - clk++; - - int refresh_interval = ctrl->channel->spec->speed_entry.nREFI; - - // Time to schedule a refresh - if ((clk - refreshed) >= refresh_interval) { - inject_refresh(true); - // ALDRAM: update timing parameters based on temperatures - // ALDRAM::Temp current_temperature = ALDRAM::Temp::COLD; - // ctrl->update_temp(current_temperature); - } - } - -private: - // Keeping track of refresh status of every bank: + means ahead of schedule, - means behind schedule - vector*> bank_refresh_backlog; - // Keeping track of which subarray to refresh next - vector> subarray_ref_counters; - int 
max_sa_count = 0; - // As defined in the standards - int backlog_max = 8; - int backlog_min = -8; - int backlog_early_pull_threshold = -6; - bool ctrl_write_mode = false; - - // Refresh based on the specified address - void refresh_target(Controller* ctrl, int rank, int bank, int sa) - { - vector addr_vec(int(T::Level::MAX), -1); - addr_vec[0] = ctrl->channel->id; - addr_vec[1] = rank; - addr_vec[2] = bank; - addr_vec[3] = sa; - Request req(addr_vec, Request::Type::REFRESH, NULL); - bool res = ctrl->enqueue(req); - assert(res); - } - - // Inject refresh at either rank or bank level - void inject_refresh(bool b_ref_rank) { - // Rank-level refresh - if (b_ref_rank) { - for (auto rank : ctrl->channel->children) - refresh_target(ctrl, rank->id, -1, -1); - } - // Bank-level refresh. Simultaneously issue to all ranks (better performance than staggered refreshes). - else { - for (auto rank : ctrl->channel->children) - refresh_target(ctrl, rank->id, bank_ref_counters[rank->id], -1); - } - refreshed = clk; - } - - // DSARP - void early_inject_refresh(); - void wrp(); -}; - -// Declaration of specialized constructor and tick_ref, so the compiler knows -// where to look for these definitions when controller calls them! 
-// template<> Refresh::Refresh(Controller* ctrl); -// template<> void Refresh::tick_ref(); - -} /* namespace ram */ - -#endif /* SRC_REFRESH_H_ */ diff --git a/TOGSim/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp deleted file mode 100644 index 7bbd90fe..00000000 --- a/TOGSim/extern/ramulator_custom/src/Request.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "Request.h" - -namespace ram { - -Request::Request() {} - -Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, - function &cb) - : type(Type), - is_first_command(true), - addr(Addr), - addr_vec(AddrVec), - coreid(0), - arrive(0), - depart(0), - callback(cb) {} - -Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, - function &cb, void* original_req) - : type(Type), - is_first_command(true), - addr(Addr), - addr_vec(AddrVec), - coreid(0), - arrive(0), - depart(0), - callback(cb), - orignal_request(original_req) {} - -Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, - function &cb, int vid) - : type(Type), - is_first_command(true), - addr(Addr), - addr_vec(AddrVec), - coreid(0), - arrive(0), - depart(0), - vid(vid), - callback(cb) {} - -Request::Request(std::vector addr_vec, Type type, - function cb) - : type(type), - is_first_command(true), - addr(-1), - BaseAddr(-1), - addr_vec(addr_vec), - coreid(0), - arrive(0), - depart(0), - callback(cb) {} - -Request::Request(std::vector addr_vec, Type type, - function cb, void* original_req) - : type(type), - is_first_command(true), - addr(-1), - BaseAddr(-1), - addr_vec(addr_vec), - coreid(0), - arrive(0), - depart(0), - callback(cb), - orignal_request(original_req) {} - -Request::Request(Type Type, uint64_t BaseAddr, uint64_t Addr, - std::vector AddrVec, function &cb) - : type(Type), - is_first_command(true), - addr(Addr), - BaseAddr(BaseAddr), - addr_vec(AddrVec), - coreid(0), - arrive(0), - depart(0), - callback(cb) {} - -bool Request::isRead() const { - return type == Type::READ; -} -bool 
Request::isWrite() const { - return type == Type::WRITE; -} -int Request::getChannelID() const { - return addr_vec[0]; -} - -} // end namespace - diff --git a/TOGSim/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h deleted file mode 100644 index 8f70856e..00000000 --- a/TOGSim/extern/ramulator_custom/src/Request.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef __REQUEST_H -#define __REQUEST_H - -#include -#include -#include - -using namespace std; - -namespace ram { -class Request { -public: - enum class Type { - READ, WRITE, PIM_WRITE, REFRESH, POWERDOWN, SELFREFRESH, EXTENSION, MAX - }; - Type type; - bool is_first_command; - uint64_t addr; - uint64_t BaseAddr; - //int HandlerID; - - vector addr_vec; - // specify which node this request sent from - int coreid; // to remove compile errors - - uint64_t arrive; - uint64_t depart; - - int vid = -1; - void* orignal_request; - function callback; // call back with more info - - bool isRead() const; - bool isWrite() const; - int getChannelID() const; - - // Used to generate refresh request - Request(); - Request(std::vector addr_vec, Type type, function cb); - Request(std::vector addr_vec, Type type, function cb, void* original_req); - Request(Type type, uint64_t Addr, - std::vector AddrVec, function &cb); - Request(Type type, uint64_t Addr, - std::vector AddrVec, function &cb, void* orignal_req); - Request(Type type, uint64_t Addr, - std::vector AddrVec, function &cb, int vid); - Request(Type type, uint64_t BaseAddr, uint64_t Addr, - std::vector AddrVec, function &cb); -}; - -} /*namespace ram*/ - -#endif /*__REQUEST_H*/ - diff --git a/TOGSim/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h deleted file mode 100644 index 778bfcd4..00000000 --- a/TOGSim/extern/ramulator_custom/src/Scheduler.h +++ /dev/null @@ -1,377 +0,0 @@ -/***************************** SCHEDULER.H *********************************** -- SAFARI GROUP - -This file contains the different 
scheduling policies and row policies that the -memory controller can use to schedule requests. - -Current Memory Scheduling Policies: - -1) FCFS - First Come First Serve - This scheduling policy schedules memory requests chronologically - -2) FRFCFS - Frist Ready First Come First Serve - This scheduling policy first checks if a request is READY(meets all - timing parameters), if yes then it is prioritized. If multiple requests - are ready, they they are scheduled chronologically. Otherwise, it - behaves the same way as FCFS. - -3) FRFCFS_Cap - First Ready First Come First Serve Cap - This scheduling policy behaves the same way as FRFCS, except that it has - a cap on the number of hits you can get in a certain row. The CAP VALUE - can be altered by changing the number for the "cap" variable in - line number 76. - -4) FRFCFS_PriorHit - First Ready First Come First Serve Prioritize Hits - This scheduling policy behaves the same way as FRFCFS, except that it - prioritizes row hits more than readiness. - -You can select which scheduler you want to use by changing the value of -"type" variable on line number 74. - - _______________________________________ - -Current Row Policies: - -1) Closed - Precharges a row as soon as there are no pending references to - the active row. -2) ClosedAP - Closed Auto Precharge -3) Opened - Precharges a row only if there are pending references to - other rows. -4) Timeout - Precharges a row after X time if there are no pending references. 
- 'X' time can be changed by changing the variable timeout - on line number 221 - -*****************************************************************************/ - -#ifndef __SCHEDULER_H -#define __SCHEDULER_H - -#include "DRAM.h" -#include "Request.h" -//#include "Controller.h" -#include -#include -#include -#include -#include - -using namespace std; - -namespace ram -{ - -template -class Controller; - -template -class Scheduler -{ -public: - Controller* ctrl; - - enum class Type { - FCFS, FRFCFS, FRFCFS_Cap, FRFCFS_PriorHit, MAX - } type = Type::FRFCFS; //Change this line to change scheduling policy - - long cap = 16; //Change this line to change cap - - Scheduler(Controller* ctrl) : ctrl(ctrl) { - std::cout << "DRAM Contorller scheduler : " << ctrl->configs["scheduler"] << std::endl; - if(ctrl->configs["scheduler"] == "FCFS") { - type = Type::FCFS; - } - else if(ctrl->configs["scheduler"] == "FRFCFS") { - type = Type::FRFCFS; - } - else if(ctrl->configs["scheduler"] == "FRFCFS_Cap") { - type = Type::FRFCFS_Cap; - } - else if(ctrl->configs["scheduler"] == "FRFCFS_PriorHit") { - type = Type::FRFCFS_PriorHit; - } - } - - list::iterator get_head(list& q) - { - // TODO make the decision at compile time - if (type != Type::FRFCFS_PriorHit) { - //If queue is empty, return end of queue - if (!q.size()) - return q.end(); - - //Else return based on the policy - auto head = q.begin(); - for (auto itr = next(q.begin(), 1); itr != q.end(); itr++) - head = compare[int(type)](head, itr); - - return head; - } - else { //Code to get around edge cases for FRFCFS_PriorHit - - //If queue is empty, return end of queue - if (!q.size()) - return q.end(); - - //Else return based on FRFCFS_PriorHit Scheduling Policy - auto head = q.begin(); - for (auto itr = next(q.begin(), 1); itr != q.end(); itr++) { - head = compare[int(Type::FRFCFS_PriorHit)](head, itr); - } - - if (this->ctrl->is_ready(head) && this->ctrl->is_row_hit(head)) { - return head; - } - - // prepare a list of hit request 
- vector> hit_reqs; - for (auto itr = q.begin() ; itr != q.end() ; ++itr) { - if (this->ctrl->is_row_hit(itr)) { - auto begin = itr->addr_vec.begin(); - // TODO Here it assumes all DRAM standards use PRE to close a row - // It's better to make it more general. - auto end = begin + int(ctrl->channel->spec->scope[int(T::Command::PRE)]) + 1; - vector rowgroup(begin, end); // bank or subarray - hit_reqs.push_back(rowgroup); - } - } - // if we can't find proper request, we need to return q.end(), - // so that no command will be scheduled - head = q.end(); - for (auto itr = q.begin(); itr != q.end(); itr++) { - bool violate_hit = false; - if ((!this->ctrl->is_row_hit(itr)) && this->ctrl->is_row_open(itr)) { - // so the next instruction to be scheduled is PRE, might violate hit - auto begin = itr->addr_vec.begin(); - // TODO Here it assumes all DRAM standards use PRE to close a row - // It's better to make it more general. - auto end = begin + int(ctrl->channel->spec->scope[int(T::Command::PRE)]) + 1; - vector rowgroup(begin, end); // bank or subarray - for (const auto& hit_req_rowgroup : hit_reqs) { - if (rowgroup == hit_req_rowgroup) { - violate_hit = true; - break; - } - } - } - if (violate_hit) { - continue; - } - // If it comes here, that means it won't violate any hit request - if (head == q.end()) { - head = itr; - } else { - head = compare[int(Type::FRFCFS)](head, itr); - } - } - - return head; - } - } - -//Compare functions for each memory schedulers -private: - typedef list::iterator ReqIter; - function compare[int(Type::MAX)] = { - // FCFS - [this] (ReqIter req1, ReqIter req2) { - if (req1->arrive <= req2->arrive) return req1; - return req2;}, - - // FRFCFS - [this] (ReqIter req1, ReqIter req2) { - bool ready1 = this->ctrl->is_ready(req1); - bool ready2 = this->ctrl->is_ready(req2); - - if (ready1 ^ ready2) { - if (ready1) return req1; - return req2; - } - - if (req1->arrive <= req2->arrive) return req1; - return req2;}, - - // FRFCFS_CAP - [this] (ReqIter 
req1, ReqIter req2) { - bool ready1 = this->ctrl->is_ready(req1); - bool ready2 = this->ctrl->is_ready(req2); - - ready1 = ready1 && (this->ctrl->rowtable->get_hits(req1->addr_vec) <= this->cap); - ready2 = ready2 && (this->ctrl->rowtable->get_hits(req2->addr_vec) <= this->cap); - - if (ready1 ^ ready2) { - if (ready1) return req1; - return req2; - } - - if (req1->arrive <= req2->arrive) return req1; - return req2;}, - // FRFCFS_PriorHit - [this] (ReqIter req1, ReqIter req2) { - bool ready1 = this->ctrl->is_ready(req1) && this->ctrl->is_row_hit(req1); - bool ready2 = this->ctrl->is_ready(req2) && this->ctrl->is_row_hit(req2); - - if (ready1 ^ ready2) { - if (ready1) return req1; - return req2; - } - - if (req1->arrive <= req2->arrive) return req1; - return req2;} - }; -}; - - -// Row Precharge Policy -template -class RowPolicy -{ -public: - Controller* ctrl; - - enum class Type { - Closed, ClosedAP, Opened, Timeout, MAX - } type = Type::Opened; - - int timeout = 50; - - RowPolicy(Controller* ctrl) : ctrl(ctrl) {} - - vector get_victim(typename T::Command cmd) - { - return policy[int(type)](cmd); - } - -private: - function(typename T::Command)> policy[int(Type::MAX)] = { - // Closed - [this] (typename T::Command cmd) -> vector { - for (auto& kv : this->ctrl->rowtable->table) { - if (!this->ctrl->is_ready(cmd, kv.first)) - continue; - return kv.first; - } - return vector();}, - - // ClosedAP - [this] (typename T::Command cmd) -> vector { - for (auto& kv : this->ctrl->rowtable->table) { - if (!this->ctrl->is_ready(cmd, kv.first)) - continue; - return kv.first; - } - return vector();}, - - // Opened - [this] (typename T::Command cmd) { - return vector();}, - - // Timeout - [this] (typename T::Command cmd) -> vector { - for (auto& kv : this->ctrl->rowtable->table) { - auto& entry = kv.second; - if (this->ctrl->clk - entry.timestamp < timeout) - continue; - if (!this->ctrl->is_ready(cmd, kv.first)) - continue; - return kv.first; - } - return vector();} - }; - -}; - - 
-template -class RowTable -{ -public: - Controller* ctrl; - - struct Entry { - int row; - int hits; - long timestamp; - }; - - map, Entry> table; - - RowTable(Controller* ctrl) : ctrl(ctrl) {} - - void update(typename T::Command cmd, const vector& addr_vec, long clk) - { - auto begin = addr_vec.begin(); - auto end = begin + int(T::Level::Row); - vector rowgroup(begin, end); // bank or subarray - int row = *end; - - T* spec = ctrl->channel->spec; - - if (spec->is_opening(cmd)) - table.insert({rowgroup, {row, 0, clk}}); - - if (spec->is_accessing(cmd)) { - // we are accessing a row -- update its entry - auto match = table.find(rowgroup); - assert(match != table.end()); - assert(match->second.row == row); - match->second.hits++; - match->second.timestamp = clk; - } /* accessing */ - - if (spec->is_closing(cmd)) { - // we are closing one or more rows -- remove their entries - int n_rm = 0; - int scope; - if (spec->is_accessing(cmd)) - scope = int(T::Level::Row) - 1; //special condition for RDA and WRA - else - scope = int(spec->scope[int(cmd)]); - - for (auto it = table.begin(); it != table.end();) { - if (equal(begin, begin + scope + 1, it->first.begin())) { - n_rm++; - it = table.erase(it); - } - else - it++; - } - - assert(n_rm > 0); - } /* closing */ - } - - int get_hits(const vector& addr_vec, const bool to_opened_row = false) - { - auto begin = addr_vec.begin(); - auto end = begin + int(T::Level::Row); - - vector rowgroup(begin, end); - int row = *end; - - auto itr = table.find(rowgroup); - if (itr == table.end()) - return 0; - - if(!to_opened_row && (itr->second.row != row)) - return 0; - - return itr->second.hits; - } - - int get_open_row(const vector& addr_vec) { - auto begin = addr_vec.begin(); - auto end = begin + int(T::Level::Row); - - vector rowgroup(begin, end); - - auto itr = table.find(rowgroup); - if(itr == table.end()) - return -1; - - return itr->second.row; - } -}; - -} /*namespace ram*/ - -#endif /*__SCHEDULER_H*/ diff --git 
a/TOGSim/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h deleted file mode 100644 index 981ce900..00000000 --- a/TOGSim/extern/ramulator_custom/src/SpeedyController.h +++ /dev/null @@ -1,304 +0,0 @@ -#ifndef __SPEEDYCONTROLLER_H -#define __SPEEDYCONTROLLER_H - -#include "Config.h" -#include "DRAM.h" -#include "Request.h" -#include "Statistics.h" -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -namespace ram -{ - -template -class SpeedyController -// A FR-FCFS Open Row Controller, optimized for simulation speed. -// Not For SALP-2 -{ -protected: - ScalarStat row_hits; - ScalarStat row_misses; -private: - class compair_depart_clk{ - public: - bool operator()(const Request& lhs, const Request& rhs) { - return lhs.depart > rhs.depart; - } - }; -public: - /* Command trace for DRAMPower 3.1 */ - string cmd_trace_prefix = "cmd-trace-"; - vector cmd_trace_files; - bool record_cmd_trace = false; - /* Commands to stdout */ - bool print_cmd_trace = false; - /* Member Variables */ - const unsigned int queue_capacity = 32; - long clk = 0; - DRAM* channel; - - double write_hi = 0.875; - double write_low = 0.5; - - // request, first command, earliest clk - typedef tuple request_info; - typedef vector request_queue; - request_queue readq; // queue for read requests - request_queue writeq; // queue for write requests - request_queue otherq; // queue for all "other" requests (e.g., refresh) - - // read requests that are about to receive data from DRAM - priority_queue, compair_depart_clk> pending; - - bool write_mode = false; // whether write requests should be prioritized over reads - long refreshed = 0; // last time refresh requests were generated - - /* Constructor */ - SpeedyController(RamulatorConfig& configs, DRAM* channel) : - channel(channel) - { - record_cmd_trace = configs.record_cmd_trace(); - print_cmd_trace = configs.print_cmd_trace(); - if 
(record_cmd_trace){ - string prefix = cmd_trace_prefix + "chan-" + to_string(channel->id) + "-rank-"; - string suffix = ".cmdtrace"; - for (unsigned int i = 0; i < channel->children.size(); i++) - cmd_trace_files.emplace_back(prefix + to_string(i) + suffix); - } - readq.reserve(queue_capacity); - writeq.reserve(queue_capacity); - otherq.reserve(queue_capacity); - - // regStats - - row_hits - .name("row_hits_channel_"+to_string(channel->id)) - .desc("Number of row hits") - .precision(0) - ; - row_misses - .name("row_misses_channel_"+to_string(channel->id)) - .desc("Number of row misses") - .precision(0) - ; - } - - ~SpeedyController(){ - delete channel; - for (auto& file : cmd_trace_files) - file.close(); - } - - /* Member Functions */ - - void finish(int read_req, int write_req, int dram_cycles) { - // call finish function of each channel - channel->finish(dram_cycles); - } - - bool enqueue(Request& req) - { - request_queue& q = - req.type == Request::Type::READ? readq: - req.type == Request::Type::WRITE? writeq: - otherq; - if (queue_capacity == q.size()) - return false; - - req.arrive = clk; - if (req.type == Request::Type::READ){ - for (auto& info : writeq) - if (req.addr == get<0>(info).addr){ - req.depart = clk + 1; - pending.push(req); - return true; - } - } - typename T::Command first_cmd = get_first_cmd(req); - long first_clk = channel->get_next(first_cmd, req.addr_vec.data()); - q.emplace_back(req, first_cmd, first_clk); - push_heap(q.begin(), q.end(), compair_first_clk);; - return true; - } - - void tick() - { - clk++; - - /*** 1. Serve completed reads ***/ - if (pending.size()) { - Request req = pending.top(); - if (req.depart <= clk) { - req.depart = clk; // actual depart clk - req.callback(req); - pending.pop(); - } - } - - /*** 2. Should we schedule refreshes? 
***/ - int refresh_interval = channel->spec->speed_entry.nREFI; - if (clk - refreshed >= refresh_interval) { - auto req_type = Request::Type::REFRESH; - vector addr_vec(int(T::Level::MAX), -1); - addr_vec[0] = channel->id; - for (auto child : channel->children) { - addr_vec[1] = child->id; - Request req(addr_vec, req_type, NULL); - bool res = enqueue(req); - assert(res); - } - - refreshed = clk; - } - - /*** 3. Should we schedule writes? ***/ - if (!write_mode) { - // yes -- write queue is almost full or read queue is empty - if (writeq.size() >= (unsigned int)(write_hi * queue_capacity) || readq.size() == 0) - write_mode = true; - } - else { - // no -- write queue is almost empty and read queue is not empty - if (writeq.size() <= (unsigned int)(write_low * queue_capacity) && readq.size() != 0) - write_mode = false; - } - - /*** 4. Find the best command to schedule, if any ***/ - request_queue& q = otherq.size()? otherq: write_mode ? writeq : readq; - - schedule(q); - } - - bool is_row_hit(Request& req) - { - typename T::Command cmd = get_first_cmd(req); - return channel->check_row_hit(cmd, req.addr_vec.data()); - } - -private: - - static bool compair_first_clk(const request_info& lhs, const request_info& rhs) { - return (get<2>(lhs) > get<2>(rhs)); - } - - typename T::Command get_first_cmd(Request& req) - { - typename T::Command cmd = channel->spec->translate[int(req.type)]; - switch (int(req.type)){ - case int(Request::Type::READ): - case int(Request::Type::WRITE):{ - auto node = channel; - for (int i = 1; i < int(T::Level::Row); i++) - node = node->children[req.addr_vec[i]]; - assert(int(node->level) == int(T::Level::Row) - 1); - if (node->state == T::State::Closed) return T::Command::ACT; - else if (node->row_state.find(req.addr_vec[int(T::Level::Row)]) != node->row_state.end()) return cmd; - else return T::Command::PRE; - } - case int(Request::Type::REFRESH): - return channel->decode(cmd, req.addr_vec.data()); - default: - assert(false); - } - // return 
channel->decode(cmd, req.addr_vec.data()); - } - void update(typename T::Command cmd, bool state_change, vector::iterator& begin, vector::iterator& end, request_queue& q){ - if (q.empty()) return; - - for (auto& info : q) { - bool addr_eq = equal(begin, end, get<0>(info).addr_vec.begin()); - if (state_change && addr_eq) - get<1>(info) = get_first_cmd(get<0>(info)); - if ((cmd == T::Command::RD || cmd == T::Command::WR) - && get<1>(info) == T::Command::ACT) - continue; - get<2>(info) = channel->get_next(get<1>(info), get<0>(info).addr_vec.data()); - } - make_heap(q.begin(), q.end(), compair_first_clk); - } - - void schedule(request_queue& q){ - if (q.empty()) return; - - Request& req = get<0>(q[0]); - typename T::Command& first_cmd = get<1>(q[0]); - long first_clk = get<2>(q[0]); - - if (first_clk > clk) return; - - if (req.is_first_command) { - req.is_first_command = false; - if (req.type == Request::Type::READ || req.type == Request::Type::WRITE) { - if (is_row_hit(req)) - ++row_hits; - else - ++row_misses; - } - } - - issue_cmd(first_cmd, req.addr_vec.data()); - - if (first_cmd == channel->spec->translate[int(req.type)]){ - if (req.type == Request::Type::READ) { - req.depart = clk + channel->spec->read_latency; - pending.push(req); - } - pop_heap(q.begin(), q.end(), compair_first_clk); - q.pop_back(); - } - - bool state_change = channel->spec->is_opening(first_cmd) - || channel->spec->is_closing(first_cmd) - || channel->spec->is_refreshing(first_cmd); - - auto begin = req.addr_vec.begin(); - auto end = begin + 1; - for (; end < begin + int(T::Level::Row) && *end >= 0; end++); - - update(first_cmd, state_change, begin, end, readq); - update(first_cmd, state_change, begin, end, writeq); - update(first_cmd, state_change, begin, end, otherq); - } - - void issue_cmd(typename T::Command cmd, int* addr_vec) - { - // assert(channel->check(cmd, addr_vec, clk)); - channel->update(cmd, addr_vec, clk); - - if (record_cmd_trace){ - // select rank - auto& file = 
cmd_trace_files[addr_vec[1]]; - string& cmd_name = channel->spec->command_name[int(cmd)]; - file<spec->standard_name == "DDR4" || channel->spec->standard_name == "GDDR5") - bank_id += addr_vec[int(T::Level::Bank) - 1] * - channel->spec->org_entry.count[int(T::Level::Bank)]; - file<<','<spec->command_name[int(cmd)].c_str(), clk); - for (int lev = 0; lev < int(T::Level::MAX); lev++) - printf(" %5d", addr_vec[lev]); - printf("\n"); - } - } -}; - -} /*namespace ram*/ - -#endif /*__SPEEDYCONTROLLER_H*/ diff --git a/TOGSim/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp deleted file mode 100644 index 843f76c8..00000000 --- a/TOGSim/extern/ramulator_custom/src/StatType.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include "StatType.h" - -namespace Stat { - -// Statistics list -StatList statlist; - -// The smallest timing granularity. -Tick curTick = 0; - -std::vector all_stats; -void reset_stats() { - for(auto s : all_stats) - s->reset(); -} - -void -Histogram::grow_out() -{ - int size = cvec.size(); - int zero = size / 2; // round down! - int top_half = zero + (size - zero + 1) / 2; // round up! - int bottom_half = (size - zero) / 2; // round down! 
- - // grow down - int low_pair = zero - 1; - for (int i = zero - 1; i >= bottom_half; i--) { - cvec[i] = cvec[low_pair]; - if (low_pair - 1 >= 0) - cvec[i] += cvec[low_pair - 1]; - low_pair -= 2; - } - assert(low_pair == 0 || low_pair == -1 || low_pair == -2); - - for (int i = bottom_half - 1; i >= 0; i--) - cvec[i] = Counter(); - - // grow up - int high_pair = zero; - for (int i = zero; i < top_half; i++) { - cvec[i] = cvec[high_pair]; - if (high_pair + 1 < size) - cvec[i] += cvec[high_pair + 1]; - high_pair += 2; - } - assert(high_pair == size || high_pair == size + 1); - - for (int i = top_half; i < size; i++) - cvec[i] = Counter(); - - max_bucket *= 2; - min_bucket *= 2; - bucket_size *= 2; -} - -void -Histogram::grow_convert() -{ - int size = cvec.size(); - int half = (size + 1) / 2; // round up! - //bool even = (size & 1) == 0; - - int pair = size - 1; - for (int i = size - 1; i >= half; --i) { - cvec[i] = cvec[pair]; - if (pair - 1 >= 0) - cvec[i] += cvec[pair - 1]; - pair -= 2; - } - - for (int i = half - 1; i >= 0; i--) - cvec[i] = Counter(); - - min_bucket = -max_bucket;// - (even ? bucket_size : 0); - bucket_size *= 2; -} - -void -Histogram::grow_up() -{ - int size = cvec.size(); - int half = (size + 1) / 2; // round up! 
- - int pair = 0; - for (int i = 0; i < half; i++) { - cvec[i] = cvec[pair]; - if (pair + 1 < size) - cvec[i] += cvec[pair + 1]; - pair += 2; - } - assert(pair == size || pair == size + 1); - - for (int i = half; i < size; i++) - cvec[i] = Counter(); - - max_bucket *= 2; - bucket_size *= 2; -} - -void -Histogram::add(Histogram &hs) -{ - size_type b_size = hs.size(); - assert(size() == b_size); - assert(min_bucket == hs.min_bucket); - - sum += hs.sum; - logs += hs.logs; - squares += hs.squares; - samples += hs.samples; - - while(bucket_size > hs.bucket_size) - hs.grow_up(); - while(bucket_size < hs.bucket_size) - grow_up(); - - for (uint32_t i = 0; i < b_size; i++) - cvec[i] += hs.cvec[i]; -} - -void -Histogram::sample(Counter val, int number) -{ - assert(min_bucket < max_bucket); - if (val < min_bucket) { - if (min_bucket == 0) - grow_convert(); - - while (val < min_bucket) - grow_out(); - } else if (val >= max_bucket + bucket_size) { - if (min_bucket == 0) { - while (val >= max_bucket + bucket_size) - grow_up(); - } else { - while (val >= max_bucket + bucket_size) - grow_out(); - } - } - - size_type index = - (int64_t)std::floor((val - min_bucket) / bucket_size); - - assert(index >= 0 && index < size()); - cvec[index] += number; - - sum += val * number; - squares += val * val * number; - logs += log(val) * number; - samples += number; -} - -} /* namespace Stats */ diff --git a/TOGSim/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h deleted file mode 100644 index 1a7d5ca9..00000000 --- a/TOGSim/extern/ramulator_custom/src/StatType.h +++ /dev/null @@ -1,669 +0,0 @@ -#ifndef __STATTYPE_H -#define __STATTYPE_H - -#include -#include -#include -#include - -#include -#include -#include - -namespace ram { - -class ScalarStat; -class AverageStat; -class VectorStat; -class AverageVectorStat; -} // namespace ram - -namespace Stat { - -const double eps = 1e-8; - -typedef unsigned int size_type; -typedef unsigned int off_type; -typedef 
double Counter; -typedef double Result; -typedef uint64_t Tick; -typedef std::vector VCounter; -typedef std::vector VResult; -typedef std::numeric_limits CounterLimits; - -class StatBase; -extern std::vector all_stats; -void reset_stats(); - -// Flags -const uint16_t init = 0x00000001; -const uint16_t display = 0x00000002; -const uint16_t total = 0x00000010; -const uint16_t pdf = 0x00000020; -const uint16_t cdf = 0x00000040; -const uint16_t dist = 0x00000080; -const uint16_t nozero = 0x00000100; -const uint16_t nonan = 0x00000200; - -class Flags { - protected: - uint16_t flags; - public: - Flags(){} - Flags(uint16_t flags):flags(flags){} - void operator=(uint16_t _flags){flags = _flags;} - bool is_total() const {return flags & total;} - bool is_pdf() const {return flags & pdf;} - bool is_nozero() const {return flags & nozero;} - bool is_nonan() const {return flags & nonan;} - bool is_cdf() const {return flags & cdf;} - bool is_display() const {return flags & display;} -}; - -class StatBase { - public: - StatBase() { - all_stats.push_back(this); - } - - - // TODO implement print for Distribution, Histogram, - // AverageDeviation, StandardDeviation - virtual void print(std::ofstream& file) = 0; - - virtual size_type size() const = 0; - virtual bool zero() const = 0; - virtual void prepare() = 0; - virtual void reset() = 0; - - virtual VResult vresult() const { return VResult(); }; - virtual Result total() const { return Result(); }; - - virtual bool is_display() const = 0; - virtual bool is_nozero() const = 0; -}; - -class StatList { - protected: - std::vector list; - std::ofstream stat_output; - public: - void add(StatBase* stat) { - list.push_back(stat); - } - void output(std::string filename) { - stat_output.open(filename.c_str(), std::ios_base::out); - if (!stat_output.good()) { - assert(false && "!stat_output.good()"); - } - } - void printall() { - for(off_type i = 0 ; i < list.size() ; ++i) { - if (!list[i]) { - continue; - } - if (list[i]->is_nozero() && 
list[i]->zero()) { - continue; - } - if (list[i]->is_display()) { - list[i]->prepare(); - list[i]->print(stat_output); - } - } - } - ~StatList() { - stat_output.close(); - } -}; - -extern StatList statlist; - -template -class Stat : public StatBase { - protected: - std::string _name; - std::string _desc; - int _precision = 1; - Flags _flags = display; - std::string separatorString; - public: - Stat() { - statlist.add(selfptr()); - } - Derived &self() {return *static_cast(this);} - Derived *selfptr() {return static_cast(this);} - Derived &name(const std::string &__name) { - _name = __name; - return self(); - }; - Derived &desc(const std::string &__desc) { - _desc = __desc; - return self(); - }; - Derived &precision(int __precision) { - _precision = __precision; - return self(); - }; - Derived &flags(Flags __flags) { - _flags = __flags; - return self(); - }; - - template - Derived &prereq(const GenericStat & prereq) { - // TODO deal with prereq; - // only print the stat if the prereq is not zero. 
- return self(); - } - - Derived &setSeparator(std::string str) { - separatorString = str; - return self(); - } - const std::string& setSeparator() const {return separatorString;} - - size_type size() const { return 0; } - - virtual void print(std::ofstream& file) {}; - virtual void printname(std::ofstream& file) { - file.width(40); - file << _name; - } - - virtual void printdesc(std::ofstream& file) { - file.width(40); - file << "# " << _desc << std::endl; - } - - virtual bool is_display() const { - return _flags.is_display(); - } - - virtual bool is_nozero() const { - return _flags.is_nozero(); - } -}; - -template -class ScalarBase: public Stat { - public: - virtual Counter value() const = 0; - virtual Result result() const = 0; - virtual Result total() const = 0; - - size_type size() const {return 1;} - VResult vresult() const {return VResult(1, result());} - - virtual void print(std::ofstream& file) { - Stat::printname(file); - // TODO deal with flag - file.precision(Stat::_precision); - file.width(20); - Result res = Stat::self().result(); - file << std::fixed << res; - Stat::printdesc(file); - } -}; - -class ConstValue: public ScalarBase { - private: - Counter _value; - public: - ConstValue(Counter __value):_value(__value){} - - void operator ++ () { ++_value; } - void operator -- () { --_value; } - void operator ++ (int) { _value++; } - void operator -- (int) { _value--; } - - template - void operator = (const U &v) { _value = v; } - - template - void operator += (const U &v) { _value += v;} - - template - void operator -= (const U &v) { _value -= v;} - - - Counter value() const {return _value;} - Result result() const {return (Result)_value;} - Result total() const {return result();} - bool zero() const {return (fabs(_value) < eps);} - void prepare() {} - void reset() {} -}; - -class Scalar: public ScalarBase { - private: - Counter _value; - public: - Scalar():_value(0) {} - Counter value() const {return _value;} - Result result() const {return 
(Result)_value;} - Result total() const {return (Result)_value;} - - void operator ++ () { ++_value; } - void operator -- () { --_value; } - void operator ++ (int) { _value++; } - void operator -- (int) { _value--; } - - template - void operator = (const U &v) { _value = v; } - - template - void operator += (const U &v) { _value += v;} - - template - void operator -= (const U &v) { _value -= v;} - - - virtual bool zero() const {return (fabs(_value) < eps);} - void prepare() {} - void reset() {_value = Counter();} - -}; - -extern Tick curTick; - -class Average: public ScalarBase { - private: - Counter current; - Tick lastReset; - Result total_val; - Tick last; - public: - Average():current(0), lastReset(0), total_val(0), last(0){} - - void set(Counter val) { - total_val += current * (curTick - last); - last = curTick; - current = val; - } - void inc(Counter val) { - set(current + val); - } - void dec(Counter val) { - set(current - val); - } - void operator ++ () { inc(1); } - void operator -- () { dec(1); } - void operator ++ (int) { inc(1); } - void operator -- (int) { dec(1); } - - template - void operator = (const U &v) { set(v); } - - template - void operator += (const U &v) { inc(v);} - - template - void operator -= (const U &v) { dec(v);} - - - bool zero() const { return (fabs(total_val) < eps); } - void prepare() { - total_val += current * (curTick - last); - last = curTick; - } - void reset() { - total_val = 0.0; - last = curTick; - lastReset = curTick; - } - - Counter value() const { return current; } - Result result() const { - assert(last == curTick); - return (Result)(total_val + current)/ (Result)(curTick - lastReset + 1); - } - Result total() const {return result();} -}; - -template -class VectorBase: public Stat { - private: - size_type _size = 0; - std::vector data; - - public: - void init(size_type __size) { - _size = __size; - data.resize(size()); - for (off_type i = 0 ; i < size() ; ++i) { - data[i].flags(0) - .name("[" + std::string(1, char(i + 
'0')) + "]"); - } - } - size_type size() const {return _size;} - // Copy the values to a local vector and return a reference to it. - void value(VCounter& vec) const { - vec.resize(size()); - for (off_type i = 0 ; i < size() ; ++i) { - vec[i] = data[i].value(); - } - } - // Copy the results to a local vector and return a reference to it. - void result(VResult& vec) const { - vec.resize(size()); - for (off_type i = 0 ; i < size() ; ++i) { - vec[i] = data[i].result(); - } - } - - Result total() const { - Result sum = 0.0; - for (off_type i = 0 ; i < size() ; ++i) { - sum += data[i].result(); - } - return sum; - } - - VResult vresult() const { - VResult vres; - for (off_type i = 0 ; i < size() ; ++i) { - vres[i] = data[i].result(); - } - return vres; - } - - bool check() const { - // We don't separate storage and access as gem5 does. - // So here is always true. - return true; - } - - Element &operator[](off_type index) { - assert(index >= 0 && index < size()); - return data[index]; - } - - bool zero() const { - return (fabs(total()) < eps); - } - - void prepare() { - for (off_type i = 0 ; i < size() ; ++i) { - data[i].prepare(); - } - } - void reset() { - for (off_type i = 0 ; i < size() ; ++i) { - data[i].reset(); - } - } - void print(std::ofstream& file) { - Stat::printname(file); - file.precision(Stat::_precision); - file.width(20); - file << std::fixed << total(); - Stat::printdesc(file); - for (off_type i = 0 ; i < size() ; ++i) { - data[i].print(file); - } - } -}; - -class Vector: public VectorBase { -}; - -class AverageVector: public VectorBase { -}; - -class Distribution: public Stat { - private: - // Parameter part: - Counter param_min; - Counter param_max; - Counter param_bucket_size; - Counter param_buckets; - - // The minimum value to track - Counter min_track; - // The maximum value to track - Counter max_track; - // The number of entries in each bucket - Counter bucket_size; - - Counter min_val; - Counter max_val; - // The number of values sampled less 
than min - Counter underflow; - // The number of values sampled more than max - Counter overflow; - // The current sum - Counter sum; - // The sum of squares - Counter squares; - // The number of samples - Counter samples; - // Counter for each bucket - VCounter cvec; - - public: - Distribution():param_min(Counter()), param_max(Counter()), - param_bucket_size(Counter()) { reset(); } - void init(Counter min, Counter max, Counter bkt) { - param_min = min; - param_max = max; - param_bucket_size = bkt; - param_buckets = (size_type)ceil((max - min + 1.0) / bkt); - cvec.resize(param_buckets); - - reset(); - } - void sample(Counter val, int number) { - if (val < min_track) - underflow += number; - else if (val > max_track) - overflow += number; - else { - size_type index = - (size_type)std::floor((val - min_track) / bucket_size); - assert(index < size()); - cvec[index] += number; - } - - if (val < min_val) - min_val = val; - - if (val > max_val) - max_val = val; - - sum += val * number; - squares += val * val * number; - samples += number; - } - - size_type size() const {return cvec.size();} - bool zero() const { - return (fabs(samples) < eps); - } - void prepare() {}; - void reset() { - min_track = param_min; - max_track = param_max; - bucket_size = param_bucket_size; - - min_val = CounterLimits::max(); - max_val = CounterLimits::min(); - underflow = Counter(); - overflow = Counter(); - - size_type _size = cvec.size(); - for (off_type i = 0 ; i < _size ; ++i) { - cvec[i] = Counter(); - } - - sum = Counter(); - squares = Counter(); - samples = Counter(); - }; - void add(Distribution &d) { - size_type d_size = d.size(); - assert(size() == d_size); - assert(min_track == d.min_track); - assert(max_track == d.max_track); - - underflow += d.underflow; - overflow += d.overflow; - - sum += d.sum; - squares += d.squares; - samples += d.samples; - - if (d.min_val < min_val) { - min_val = d.min_val; - } - - if (d.max_val > max_val) { - max_val = d.max_val; - } - - for (off_type i = 
0 ; i < d_size ; ++i) { - cvec[i] += d.cvec[i]; - } - } -}; - -class Histogram: public Stat { - private: - size_type param_buckets; - - Counter min_bucket; - Counter max_bucket; - Counter bucket_size; - - Counter sum; - Counter logs; - Counter squares; - Counter samples; - VCounter cvec; - - public: - Histogram():param_buckets(0) { reset(); } - Histogram(size_type __buckets):cvec(__buckets) { - init(__buckets); - } - void init(size_type __buckets) { - cvec.resize(__buckets); - param_buckets = __buckets; - reset(); - } - - void grow_up(); - void grow_out(); - void grow_convert(); - void add(Histogram& hs); - void sample(Counter val, int number); - - bool zero() const { - return (fabs(samples) < eps); - } - void prepare() {} - void reset() { - min_bucket = 0; - max_bucket = param_buckets - 1; - bucket_size = 1; - - size_type size = param_buckets; - for (off_type i = 0 ; i < size ; ++i) { - cvec[i] = Counter(); - } - - sum = Counter(); - squares = Counter(); - samples = Counter(); - logs = Counter(); - } - - size_type size() const {return param_buckets;} -}; - -class StandardDeviation: public Stat { - private: - Counter sum; - Counter squares; - Counter samples; - - public: - StandardDeviation():sum(Counter()), squares(Counter()), - samples(Counter()) {} - void sample(Counter val, int number) { - Counter value = val * number; - sum += value; - squares += value * value; - samples += number; - } - size_type size() const {return 1;} - bool zero() const {return (fabs(samples) < eps);} - void prepare() {} - void reset() { - sum = Counter(); - squares = Counter(); - samples = Counter(); - } - void add(StandardDeviation& sd) { - sum += sd.sum; - squares += sd.squares; - samples += sd.samples; - } -}; - -class AverageDeviation: public Stat { - private: - Counter sum; - Counter squares; - - public: - AverageDeviation():sum(Counter()), squares(Counter()) {} - void sample(Counter val, int number) { - Counter value = val * number; - sum += value; - squares += value * value; - } - 
size_type size() const {return 1;} - bool zero() const {return (fabs(sum) < eps);} - void prepare() {} - void reset() { - sum = Counter(); - squares = Counter(); - } - void add(AverageDeviation& ad) { - sum += ad.sum; - squares += ad.squares; - } -}; - -class Op { - private: - std::string opstring; - public: - Op() {} - Op(std::string __opstring):opstring(__opstring){} - Result operator() (Result r) const { - if (opstring == "-") { - return -r; - } else { - assert("Unary operation can only be unary negation." && false); - } - } - Result operator() (Result l, Result r) const { - if (opstring == "+") { - return l + r; - } else if (opstring == "-") { - return l - r; - } else if (opstring == "*") { - return l * r; - } else if (opstring == "/") { - assert(fabs(r) > 1e-8 || "divide zero error"); - return l / r; - } else { - assert("invalid binary opstring " && false); - } - } -}; - -} // namespace Stats - -#endif diff --git a/TOGSim/extern/ramulator_custom/src/Statistics.h b/TOGSim/extern/ramulator_custom/src/Statistics.h deleted file mode 100644 index 8cf555f7..00000000 --- a/TOGSim/extern/ramulator_custom/src/Statistics.h +++ /dev/null @@ -1,236 +0,0 @@ -#ifndef __STATISTICS_H -#define __STATISTICS_H - -#include - -// FIXME Find better way to decide where does it come from -#include "StatType.h" - -/* - IMPORTANT NOTE - Read this first! - - This version of the file provides wrappers to the gem5 statistics classes. - Feel free to go through this file, though it can be difficult to follow - with the degree of abstraction going on. In short, this file currently - provides the following mapping of stat classes. In almost all cases, the - wrapper provides identical and complete functionality to the gem5 stat - classes. All of our classes are defined in the ramulator namespace. 
- - GEM5 CLASS --> RAMULATOR CLASS - ============================== - Stat::Scalar --> ScalarStat - Stat::Average --> AverageStat - Stat::Vector --> VectorStat - Stat::AverageVector --> AverageVectorStat - Stat::Distribution --> DistributionStat - Stat::Histogram --> HistogramStat - Stat::StandardDeviation --> StandardDeviationStat - Stat::AverageDeviation --> AverageDeviationStat - - All of the stats that you create will be named "ramulator." - automatically, and will be dumped at the end of simulation into the gem5 - stats file. -*/ - -namespace ram { - -template -class StatBase { // wrapper for Stat::DataWrap - protected: - StatType stat; - std::string statName; - - StatBase & self() { return *this; } - public: - StatBase() {} - - StatBase(std::string _name) { - name(_name); - } - - StatBase(std::string _name, std::string _desc) { - name(_name); - desc(_desc); - } - - StatBase & name(std::string _name) { - statName = _name; - stat.name("ramulator." + _name); - - return self(); - } - - const std::string &name(void) const { return statName; } - - StatBase & setSeparator(const std::string & _sep) { - stat.setSeparator(_sep); - return self(); - } - - const std::string &setSeparator() const { return stat.setSeparator(); } - - StatBase & desc(std::string _desc) { - stat.desc(_desc); - return self(); - } - - StatBase & precision(int _precision) { - stat.precision(_precision); - return self(); - } - - StatBase & flags(Stat::Flags _flags) { - stat.flags(_flags); - return self(); - } - - template - StatBase & prereq(const Stat & _prereq) { - stat.prereq(_prereq); - return self(); - } - - Stat::size_type size(void) const { return stat.size(); } - bool zero(void) const { return stat.zero(); } - void prepare(void) { stat.prepare(); } - void reset(void) { stat.reset(); } -}; - -template -class StatBaseVec : public StatBase { // wrapper for Stat::DataWrapVec - protected: - StatBaseVec & self() { return *this; } - - public: - StatBaseVec & subname(Stat::off_type index, const 
std::string & name) { - StatBase::stat.subname(index, name); - return self(); - } - - StatBaseVec & subdesc(Stat::off_type index, const std::string & desc) { - StatBase::stat.subdesc(index, desc); - return self(); - } -}; - -template -class ScalarStatBase : public StatBase { // wrapper for Stat::ScalarBase - public: - Stat::Counter value() const { return StatBase::stat.value(); }; - void operator++() { ++StatBase::stat; } - void operator--() { --StatBase::stat; } - - void operator++(int) { StatBase::stat++; } - void operator--(int) { StatBase::stat--; } - - template - void operator=(const U &v) { StatBase::stat = v; } - - template - void operator+=(const U &v) { StatBase::stat += v; } - - template - void operator-=(const U &v) { StatBase::stat -= v; } -}; - -template -class VectorStatBase : public StatBaseVec { // wrapper for Stat::VectorBase - protected: - VectorStatBase & self() { return *this; } - - public: - void value(Stat::VCounter & vec) const { StatBase::stat.value(vec); } - void result(Stat::VResult & vec) const { StatBase::stat.result(vec); } - Stat::Result total(void) const { return StatBase::stat.total(); } - - bool check(void) const { return StatBase::stat.check(); } - - VectorStatBase & init(Stat::size_type size) { - StatBase::stat.init(size); - return self(); - } - - Element &operator[](Stat::off_type index) { return StatBase::stat[index]; } -}; - - -template -class DistStatBase : public StatBase { // wrapper for Stat::DistBase - public: - template - void sample(const U &v, int n = 1) { StatBase::stat.sample(v, n); } - - void add(DistStatBase & d) { StatBase::stat.add(d.StatBase::stat); } -}; - - -/* - nice wrappers for the gem5 stats classes used throughout the rest of the code -*/ - -class ScalarStat : public ScalarStatBase { - public: - using ScalarStatBase::operator=; -}; - -class IntervalScalarStat : public ScalarStatBase { - public: - using ScalarStatBase::operator=; -}; - -class AverageStat : public ScalarStatBase { - public: - using 
ScalarStatBase::operator=; -}; - -class VectorStat : public VectorStatBase { -}; - -class IntervalVectorStat : public VectorStatBase { -}; - -class AverageVectorStat : public VectorStatBase { -}; - -class DistributionStat : public DistStatBase { - protected: - DistributionStat & self() { return *this; } - - public: - DistributionStat & init(Stat::Counter min, Stat::Counter max, Stat::Counter bkt) { - StatBase::stat.init(min, max, bkt); - return self(); - } - -}; - -class HistogramStat : public DistStatBase { - protected: - HistogramStat & self() { return *this; } - - public: - HistogramStat & init(Stat::size_type size) { - StatBase::stat.init(size); - return self(); - } -}; - -class StandardDeviationStat : public DistStatBase { -}; - -class AverageDeviationStat : public DistStatBase { -}; - -/* - Stats TODO - * Formula - * VectorDistribution - * VectorStandardDeviation - * VectorAverageDeviation - * Vector2d - * SparseHistogram -*/ - -} /* namespace ram */ - -#endif diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h index 2fd62681..b228fe45 100644 --- a/TOGSim/include/Common.h +++ b/TOGSim/include/Common.h @@ -28,4 +28,5 @@ typedef uint64_t addr_type; typedef uint64_t cycle_type; bool loadConfig(const std::string& config_path, YAML::Node& config_yaml); -SimulationConfig initialize_config(YAML::Node config); \ No newline at end of file +SimulationConfig initialize_config(const YAML::Node& config, + const std::string& config_file_path = {}); \ No newline at end of file diff --git a/TOGSim/include/Dram.h b/TOGSim/include/Dram.h index 978bcdf9..4a897559 100644 --- a/TOGSim/include/Dram.h +++ b/TOGSim/include/Dram.h @@ -1,5 +1,6 @@ #ifndef DRAM_H #define DRAM_H +#include #include #include #include @@ -35,7 +36,6 @@ class Dram { SimulationConfig _config; CacheConfig _m_cache_config; uint32_t _n_ch; - uint32_t _n_bl; uint32_t _n_partitions; uint32_t _n_ch_per_partition; uint32_t _req_size; @@ -51,6 +51,10 @@ class Dram { class DramRamulator2 : public Dram { 
public: + static void apply_ramulator_config_to_simulation_config( + SimulationConfig& cfg, const std::string& ramulator_config_path, + std::optional dram_freq_mhz_stated = std::nullopt); + DramRamulator2(SimulationConfig config, cycle_type *core_cycle); virtual bool running() override; @@ -72,6 +76,8 @@ class DramRamulator2 : public Dram { class SimpleDRAM: public Dram { public: + static void apply_yaml_to_simulation_config(const YAML::Node& config, SimulationConfig& cfg); + SimpleDRAM(SimulationConfig config, cycle_type *core_cycle); virtual bool running() override; @@ -87,6 +93,8 @@ class SimpleDRAM: public Dram { private: int _latency = 1; std::vector>> _mem; + std::vector _bw_credit_bytes; + double _bytes_per_dram_cycle = 0.; }; #endif \ No newline at end of file diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h index 090f5520..2ef08618 100644 --- a/TOGSim/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -1,5 +1,8 @@ #pragma once +#include +#include +#include #include #include @@ -12,6 +15,9 @@ enum class IcntType { SIMPLE, BOOKSIM2 }; enum class L2CacheType { NOCACHE, DATACACHE }; struct SimulationConfig { + /* Path to the top-level hardware YAML passed to the simulator (empty if not from a file). 
*/ + std::string config_file_path; + /* Core config */ std::vector core_type; std::string stonne_config_path; @@ -30,7 +36,7 @@ struct SimulationConfig { uint32_t dram_channels; uint32_t dram_req_size; uint32_t dram_latency; - uint32_t dram_nbl = 1; + float dram_bandwidth_gbps_per_channel = 0.f; uint32_t dram_print_interval; std::string dram_config_path; @@ -61,7 +67,24 @@ struct SimulationConfig { return addr - (addr % dram_req_size); } - float max_dram_bandwidth() { - return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s + float max_dram_bandwidth() const { + if (dram_bandwidth_gbps_per_channel > 0.f) + return dram_bandwidth_gbps_per_channel * static_cast(dram_channels); + return 0.f; + } + + /** Resolve `path` for opening on disk: absolute paths as-is; relative paths against top-level config dir. */ + std::string resolve_against_simulation_config(const std::string& path) const { + namespace fs = std::filesystem; + if (path.empty()) + return path; + fs::path p(path); + fs::path abs = p.is_absolute() ? fs::absolute(p) + : !config_file_path.empty() + ? fs::absolute(fs::path(config_file_path).parent_path() / p) + : fs::absolute(p); + std::error_code ec; + fs::path canon = fs::weakly_canonical(abs, ec); + return (ec ? abs : canon).string(); } }; \ No newline at end of file diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h index a0b8b9c5..e3542d51 100644 --- a/TOGSim/include/Simulator.h +++ b/TOGSim/include/Simulator.h @@ -23,7 +23,7 @@ namespace fs = std::filesystem; class Simulator { public: - Simulator(SimulationConfig config); + Simulator(SimulationConfig config, YAML::Node hardware_config_yaml); void enqueue_graph(int partion_id, std::unique_ptr tile_graph) { if (partion_id < 0 || static_cast(partion_id) >= _config.num_partition) { spdlog::error("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). 
" @@ -41,6 +41,8 @@ class Simulator { std::unique_ptr& get_partition_scheduler(int core_id) { return _partition_scheduler.at(get_partition_id(core_id)); } void print_core_stat(); void cycle(); + const SimulationConfig& get_config() const { return _config; } + const YAML::Node& get_hardware_config_yaml() const { return _hardware_config_yaml; } private: void core_cycle(); void dram_cycle(); @@ -49,6 +51,7 @@ class Simulator { void set_cycle_mask(); uint32_t get_dest_node(mem_fetch *access); SimulationConfig _config; + YAML::Node _hardware_config_yaml; uint32_t _n_cores; uint32_t _n_sp_cores; uint32_t _noc_node_per_core; diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index b15381a6..ede991c8 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -1,5 +1,9 @@ #include "Common.h" +#include "Dram.h" + +#include + bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) { try { config_yaml = YAML::LoadFile(config_path); @@ -26,8 +30,10 @@ T get_config_value(const YAML::Node& config, std::string key) { } } -SimulationConfig initialize_config(YAML::Node config) { +SimulationConfig initialize_config(const YAML::Node& config, + const std::string& config_file_path) { SimulationConfig parsed_config; + parsed_config.config_file_path = config_file_path; YAML::Emitter emitter; emitter << config; spdlog::info("PyTorchSim config:\n{}", emitter.c_str()); @@ -73,18 +79,25 @@ SimulationConfig initialize_config(YAML::Node config) { if (dram_type_str == "simple") { parsed_config.dram_type = DramType::SIMPLE; - parsed_config.dram_latency = get_config_value(config, "dram_latency"); } else if (dram_type_str == "ramulator2") { parsed_config.dram_type = DramType::RAMULATOR2; - parsed_config.dram_config_path = get_config_value(config, "ramulator_config_path"); + const std::string ramulator_config_rel = + get_config_value(config, "ramulator_config_path"); + parsed_config.dram_config_path = + 
parsed_config.resolve_against_simulation_config(ramulator_config_rel); } else { throw std::runtime_error(fmt::format("Not implemented dram type {} ", dram_type_str)); } - parsed_config.dram_freq_mhz = get_config_value(config, "dram_freq_mhz"); parsed_config.dram_channels = get_config_value(config, "dram_channels"); - parsed_config.dram_req_size = get_config_value(config, "dram_req_size_byte"); - parsed_config.dram_nbl = get_config_value(config, "dram_num_burst_length"); + + if (parsed_config.dram_type == DramType::RAMULATOR2) { + DramRamulator2::apply_ramulator_config_to_simulation_config( + parsed_config, parsed_config.dram_config_path, + config["dram_freq_mhz"] ? std::optional(config["dram_freq_mhz"].as()) : std::nullopt); + } else { + SimpleDRAM::apply_yaml_to_simulation_config(config, parsed_config); + } if (config["dram_stats_print_period_cycles"]) parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"].as(); diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc index 95a55ca3..798acb7b 100644 --- a/TOGSim/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -1,6 +1,18 @@ #include "Dram.h" +#include +#include #include +#include +#include +#include + +#include + +#include "ramulator/base/config.h" +#include "ramulator/base/factory.h" +#include "ramulator/frontend/i_frontend.h" +#include "ramulator/memory_system/i_memory_system.h" namespace { @@ -26,15 +38,15 @@ static uint32_t next_power_of_2_u32(uint32_t n) { return n + 1; } -/** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. */ +/** Bytes/s effective GB/s and utilization % vs `peak_gbps_per_channel` (x n_ch aggregate peak). 
*/ struct DramBwSnapshot { double bandwidth_gbs = 0; double util_avg_ch_pct = 0; }; DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t window_cycles, - uint32_t n_ch, uint32_t req_size, uint32_t n_bl, - double dram_freq_mhz) { + uint32_t n_ch, uint32_t req_size, double dram_freq_mhz, + float peak_gbps_per_channel) { DramBwSnapshot out; if (window_cycles == 0 || n_ch == 0) return out; @@ -42,13 +54,108 @@ DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t w const double w = static_cast(window_cycles); const double bytes_per_cycle = tx * static_cast(req_size) / w; out.bandwidth_gbs = bytes_per_cycle * dram_freq_mhz / 1000.0; - const double avg_per_ch = tx / static_cast(n_ch); - out.util_avg_ch_pct = avg_per_ch * 100.0 * static_cast(n_bl) / (2.0 * w); + const double peak_total_gbs = + static_cast(peak_gbps_per_channel) * static_cast(n_ch); + if (peak_gbps_per_channel > 0.f && peak_total_gbs > 0.0) + out.util_avg_ch_pct = 100.0 * out.bandwidth_gbs / peak_total_gbs; return out; } +static float peak_gbps_per_channel_from_ramulator_yaml(const Ramulator::ConfigNode& cfg) { + const Ramulator::ConfigNode controllers = cfg["memory_system"]["controllers"]; + const auto& ctrls = controllers.seq(); + if (ctrls.empty()) + throw std::runtime_error("memory_system.controllers is empty"); + const Ramulator::ConfigNode dram = ctrls[0]["dram"]; + const int ch_width = dram["channel_width"].as(); + if (ch_width <= 0) + throw std::runtime_error("invalid channel_width"); + const Ramulator::ConfigNode timing_node = dram["timing"]; + const auto& timing = timing_node.seq(); + if (timing.empty()) + throw std::runtime_error("dram.timing is empty"); + const int rate = timing[0].as(); + if (rate <= 0) + throw std::runtime_error("invalid dram.timing[0] (rate / MT/s)"); + + int pseudo_ch = 1; + const std::string impl = dram["impl"].as(""); + if (impl == "HBM2" || impl == "HBM3") { + const Ramulator::ConfigNode org = dram["org"]; + const 
Ramulator::ConfigNode org_count = org["count"]; + const auto& counts = org_count.seq(); + if (counts.size() > 1) + pseudo_ch = std::max(1, counts[1].as()); + } + + return static_cast(static_cast(rate) * static_cast(pseudo_ch) * + static_cast(ch_width) / 8.0 / 1000.0); +} + } // namespace +void DramRamulator2::apply_ramulator_config_to_simulation_config( + SimulationConfig& cfg, const std::string& ramulator_config_path, + std::optional dram_freq_mhz_stated) { + Ramulator::ConfigNode config = Ramulator::Config::parse_config_file(ramulator_config_path); + Ramulator::ConfigNode frontend_config; + frontend_config.set("impl", std::string("External")); + frontend_config.set("clock_ratio", 1u); + config.set("frontend", frontend_config); + + float peak_gbps = 0.f; + try { + peak_gbps = peak_gbps_per_channel_from_ramulator_yaml(config); + } catch (const std::exception& e) { + throw std::runtime_error(std::string("[Config/DRAM] Ramulator peak GB/s from yaml: ") + e.what() + " (" + + ramulator_config_path + ")"); + } + + Ramulator::IFrontEnd* fe = Ramulator::Factory::create_frontend(config); + Ramulator::IMemorySystem* mem = Ramulator::Factory::create_memory_system(config); + fe->connect_memory_system(mem); + mem->connect_frontend(fe); + + const float tck_ns = mem->get_tCK(); + if (tck_ns <= 0.f) { + fe->finalize(); + mem->finalize(); + delete fe; + delete mem; + throw std::runtime_error("[Config/DRAM] Ramulator probe: invalid get_tCK() for " + ramulator_config_path); + } + + const int tx_bytes = mem->get_tx_bytes(); + if (tx_bytes <= 0) { + fe->finalize(); + mem->finalize(); + delete fe; + delete mem; + throw std::runtime_error("[Config/DRAM] Ramulator probe: invalid get_tx_bytes() for " + ramulator_config_path); + } + + fe->finalize(); + mem->finalize(); + delete fe; + delete mem; + + cfg.dram_req_size = static_cast(tx_bytes); + cfg.dram_freq_mhz = static_cast(std::lround(1000.0f / tck_ns)); + cfg.dram_bandwidth_gbps_per_channel = peak_gbps; + + if 
(dram_freq_mhz_stated.has_value()) { + if (*dram_freq_mhz_stated != cfg.dram_freq_mhz) { + throw std::runtime_error(fmt::format( + "[Config/DRAM] ramulator2: top-level dram_freq_mhz {} does not match Ramulator timing " + "(DRAM clock {} MHz from tCK={:.6g} ns, i.e. round(1000/tCK)); remove dram_freq_mhz to use the derived " + "value, or align the Ramulator YAML with the top-level yml. ramulator_config_path={}", + *dram_freq_mhz_stated, cfg.dram_freq_mhz, static_cast(tck_ns), ramulator_config_path)); + } + spdlog::info("[Config/DRAM] ramulator2: dram_freq_mhz {} matches Ramulator-derived DRAM clock (tCK={:.6g} ns)", + *dram_freq_mhz_stated, static_cast(tck_ns)); + } +} + new_addr_type Dram::partition_dram_address(new_addr_type raw_addr) const { if (_req_size == 0 || _n_ch_per_partition == 0) return raw_addr; @@ -87,7 +194,6 @@ uint32_t Dram::get_channel_id(mem_fetch* access) { Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _core_cycles = core_cycle; _n_ch = config.dram_channels; - _n_bl = config.dram_nbl; _req_size = config.dram_req_size; _n_partitions = config.dram_num_partitions; _n_ch_per_partition = config.dram_channels_per_partitions; @@ -127,9 +233,8 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle) /* Initialize DRAM Channels */ _mem.resize(_n_ch); for (int ch = 0; ch < _n_ch; ch++) { - _mem[ch] = std::make_unique( - ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl, - _req_size, config.dram_freq_mhz); + _mem[ch] = std::make_unique(ch, _n_ch, config.dram_config_path, "Ramulator2", + _config.dram_print_interval, _req_size, config.dram_freq_mhz); } _tx_log2 = log2(_req_size); _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2; @@ -180,14 +285,14 @@ void DramRamulator2::cycle() { const long long wtxn = _mem[ch]->interval_writes(); r_all += r; w_all += wtxn; - const DramBwSnapshot bw = - make_dram_bw_snapshot(r + wtxn, w, 1u, _req_size, _n_bl, f_mhz); + const DramBwSnapshot bw = 
make_dram_bw_snapshot( + r + wtxn, w, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::trace( "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)", ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w); } - const DramBwSnapshot bw_all = - make_dram_bw_snapshot(r_all + w_all, w, _n_ch, _req_size, _n_bl, f_mhz); + const DramBwSnapshot bw_all = make_dram_bw_snapshot( + r_all + w_all, w, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)", _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w); @@ -247,7 +352,7 @@ void DramRamulator2::print_stat() { if (cycles == 0) return; const double f_mhz = static_cast(_config.dram_freq_mhz); - spdlog::info("[DRAM] per-channel avg BW ({} sim cycles):", cycles); + spdlog::info("[DRAM] per-channel avg BW"); long long tr_all = 0; long long tw_all = 0; for (int ch = 0; ch < _n_ch; ch++) { @@ -255,14 +360,14 @@ void DramRamulator2::print_stat() { const long long tw = _mem[ch]->total_writes(); tr_all += tr; tw_all += tw; - const DramBwSnapshot bw = - make_dram_bw_snapshot(tr + tw, cycles, 1u, _req_size, _n_bl, f_mhz); + const DramBwSnapshot bw = make_dram_bw_snapshot( + tr + tw, cycles, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes", ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw); } const DramBwSnapshot bw_all = make_dram_bw_snapshot( - tr_all + tw_all, cycles, _n_ch, _req_size, _n_bl, f_mhz); + tr_all + tw_all, cycles, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes", _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all); @@ -274,13 +379,78 @@ void DramRamulator2::print_cache_stats() { } } +void 
SimpleDRAM::apply_yaml_to_simulation_config(const YAML::Node& config, SimulationConfig& cfg) { + if (!config["dram_latency"]) + throw std::runtime_error("[Config/DRAM] simple: dram_latency is required"); + cfg.dram_latency = config["dram_latency"].as(); + + auto yaml_get_u32 = [](const YAML::Node& n, const char* key, uint32_t def) -> uint32_t { + if (n[key]) + return n[key].as(); + return def; + }; + + cfg.dram_req_size = yaml_get_u32(config, "dram_req_size_byte", 32u); + if (cfg.dram_req_size == 0) + throw std::runtime_error("[Config/DRAM] simple: dram_req_size_byte must be > 0"); + + const bool has_per_ch_bw = static_cast(config["dram_bandwidth_gbps_per_channel"]); + const bool has_total_bw = static_cast(config["dram_bandwidth_gbps_total"]); + if (has_per_ch_bw && has_total_bw) + throw std::runtime_error( + "[Config/DRAM] simple: set only one of dram_bandwidth_gbps_per_channel or dram_bandwidth_gbps_total"); + + const bool has_bw_cap = has_per_ch_bw || has_total_bw; + if (has_bw_cap) { + float per_ch = 0.f; + if (has_total_bw) { + const float tot = config["dram_bandwidth_gbps_total"].as(); + if (cfg.dram_channels == 0) + throw std::runtime_error("[Config/DRAM] dram_channels must be > 0 for dram_bandwidth_gbps_total"); + per_ch = tot / static_cast(cfg.dram_channels); + } else { + per_ch = config["dram_bandwidth_gbps_per_channel"].as(); + } + if (per_ch <= 0.f) + throw std::runtime_error("[Config/DRAM] simple: dram_bandwidth_gbps_* must be > 0"); + cfg.dram_bandwidth_gbps_per_channel = per_ch; + } else { + cfg.dram_bandwidth_gbps_per_channel = 0.f; + } + + if (has_bw_cap && !config["dram_freq_mhz"]) + throw std::runtime_error( + "[Config/DRAM] simple: dram_freq_mhz is required when dram_bandwidth_gbps_per_channel or " + "dram_bandwidth_gbps_total is set (credit refill is per simulated DRAM cycle)"); + cfg.dram_freq_mhz = yaml_get_u32(config, "dram_freq_mhz", cfg.core_freq_mhz); + + if (cfg.dram_freq_mhz == 0) { + throw std::runtime_error("[Config/DRAM] simple: 
dram_freq_mhz must be > 0"); + } +} + SimpleDRAM::SimpleDRAM(SimulationConfig config, cycle_type* core_cycle) : Dram(config, core_cycle) { - /* Initialize DRAM Channels */ - spdlog::info("[SimpleDRAM] DRAM latecny: {}", config.dram_latency); + spdlog::info("[SimpleDRAM] DRAM latency: {}", config.dram_latency); for (int ch = 0; ch < _n_ch; ch++) { _mem.push_back(std::make_unique>("SimpleDRAM", true, -1)); } - _latency = config.dram_latency; + _latency = config.dram_latency; + _bw_credit_bytes.assign(static_cast(_n_ch), static_cast(_req_size) * 2.0); + if (config.dram_freq_mhz > 0 && config.dram_bandwidth_gbps_per_channel > 0.f) { + _bytes_per_dram_cycle = + static_cast(config.dram_bandwidth_gbps_per_channel) * 1000.0 / + static_cast(config.dram_freq_mhz); + } else { + _bytes_per_dram_cycle = 0.; + } + if (config.dram_bandwidth_gbps_per_channel > 0.f) + spdlog::info("[SimpleDRAM] peak {:.2f} GB/s total, {:.2f} GB/s per channel, {:.4f} B/cycle per channel", + config.max_dram_bandwidth(), config.dram_bandwidth_gbps_per_channel, _bytes_per_dram_cycle); + else + spdlog::info( + "[SimpleDRAM] no bandwidth cap (latency-only); dram_latency {} cycles, dram_freq_mhz {} for tick " + "alignment", + config.dram_latency, config.dram_freq_mhz); } bool SimpleDRAM::running() { @@ -297,20 +467,30 @@ void SimpleDRAM::cycle() { for (int ch = 0; ch < _n_ch; ch++) { _mem[ch]->cycle(); + if (_bytes_per_dram_cycle > 0.0) + _bw_credit_bytes[static_cast(ch)] += _bytes_per_dram_cycle; + // From Cache to DRAM if (mem_fetch* req = _m_caches[ch]->top()) { - //spdlog::info("[Cache->DRAM] mem_fetch: addr={:#x}", req->get_addr()); - - _mem[ch]->push(req, _latency); - _m_caches[ch]->pop(); + const double need = static_cast(_req_size); + bool admit = true; + if (_bytes_per_dram_cycle > 0.0) { + if (_bw_credit_bytes[static_cast(ch)] < need) + admit = false; + else + _bw_credit_bytes[static_cast(ch)] -= need; + } + if (admit) { + _mem[ch]->push(req, _latency); + _m_caches[ch]->pop(); + } } // From DRAM 
to Cache if (_mem[ch]->arrived()) { mem_fetch* req = _mem[ch]->top(); req->set_reply(); - //spdlog::info("[DRAM->Cache] mem_fetch: addr={:#x}", req->get_addr()); - if(_m_caches[ch]->push(req)) + if (_m_caches[ch]->push(req)) _mem[ch]->pop(); } } diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index d7fe9f1b..9bd3407f 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -1,7 +1,9 @@ #include "Simulator.h" -Simulator::Simulator(SimulationConfig config) - : _config(config), _core_cycles(0) { +Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml) + : _config(config), + _hardware_config_yaml(std::move(hardware_config_yaml)), + _core_cycles(0) { // Create dram object _core_period = 1000000 / (config.core_freq_mhz); _icnt_period = 1000000 / (config.icnt_freq_mhz); diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 57e0e696..f985bdf4 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -84,11 +84,13 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const simulator->cycle(); } -Simulator* create_simulator(const YAML::Node& config_yaml) { - SimulationConfig config = initialize_config(config_yaml); - - auto simulator = new Simulator(config); - return simulator; +Simulator* create_simulator(const std::string& config_path) { + YAML::Node config_yaml; + if (!loadConfig(config_path, config_yaml)) { + return nullptr; + } + SimulationConfig config = initialize_config(config_yaml, config_path); + return new Simulator(config, std::move(config_yaml)); } int main(int argc, char** argv) { @@ -138,21 +140,19 @@ int main(int argc, char** argv) { /* Create simulator */ cmd_parser.set_if_defined("config", &config_path); - // Load config once for reuse - YAML::Node config_yaml; - if (!loadConfig(config_path, config_yaml)) { + auto simulator = create_simulator(config_path); + if (!simulator) { spdlog::error("[TOGSim] Failed to load config file: {}", config_path); exit(1); } - auto simulator = 
create_simulator(config_yaml); - // Get trace file path cmd_parser.set_if_defined("models_list", &trace_file_path); if (!trace_file_path.empty()) { // Process trace file (unified mode: supports both FIFO and regular file) - process_trace_file(simulator, trace_file_path, config_yaml); + process_trace_file(simulator, trace_file_path, + simulator->get_hardware_config_yaml()); spdlog::info("Simulation finished"); simulator->print_core_stat(); } else { diff --git a/configs/heterogeneous_c2_simple_noc.yml b/configs/heterogeneous_c2_simple_noc.yml index 9c596d85..8a3401fe 100644 --- a/configs/heterogeneous_c2_simple_noc.yml +++ b/configs/heterogeneous_c2_simple_noc.yml @@ -16,8 +16,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/ramulator2_configs/HBM2_TPUv2.yaml b/configs/ramulator2_configs/HBM2_TPUv2.yaml new file mode 100644 index 00000000..88c1adf3 --- /dev/null +++ b/configs/ramulator2_configs/HBM2_TPUv2.yaml @@ -0,0 +1,476 @@ +{ + "frontend": { + "impl": "External", + "clock_ratio": 1 + }, + "memory_system": { + "impl": "GenericDRAM", + "clock_ratio": 1, + "channel_mapper": { + "impl": "PassThroughChannelMapper" + }, + "controllers": [ + { + "impl": "HBM", + "wr_low_watermark": 0.2, + "wr_high_watermark": 0.8, + "read_buffer_size": 64, + "write_buffer_size": 64, + "priority_buffer_size": 1568, + "scheduler": { + "impl": "FRFCFS" + }, + "refresh_manager": { + "impl": "AllBank", + "scope": "PseudoChannel" + }, + "row_policy": { + "impl": "Open" + }, + "addr_mapper": { + "impl": "RoBaRaCoCh" + }, + "dram": { + "impl": "HBM2", + "org": { + "dq": 64, + "count": [ + 1, + 2, + 4, + 4, + 65536, + 32 + ] + }, + "timing": [ + 1400, + 2, + 9, + 9, + 7, + 9, + 22, + 31, + 11, + 4, + 4, + 2, + 4, + 4, + 4, + 5, + 6, + 11, + 245, + 112, + 6, + 2730, + 86, + 1429 
+ ], + "channel_width": 64, + "read_latency": 11, + "timing_constraints": [ + [ + 0, + [ + 0 + ], + [ + 0, + 1, + 2, + 7, + 8 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 2 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 2 + ], + [ + 1, + [ + 3, + 5 + ], + [ + 4, + 6 + ], + 9 + ], + [ + 1, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 11 + ], + [ + 1, + [ + 3 + ], + [ + 2 + ], + 4 + ], + [ + 1, + [ + 4 + ], + [ + 2 + ], + 17 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 1, + [ + 0 + ], + [ + 0 + ], + 11, + 4 + ], + [ + 1, + [ + 0 + ], + [ + 2 + ], + 23 + ], + [ + 1, + [ + 2 + ], + [ + 0 + ], + 8 + ], + [ + 1, + [ + 0 + ], + [ + 7 + ], + 32 + ], + [ + 1, + [ + 1, + 2 + ], + [ + 7 + ], + 9 + ], + [ + 1, + [ + 5 + ], + [ + 7 + ], + 13 + ], + [ + 1, + [ + 6 + ], + [ + 7 + ], + 26 + ], + [ + 1, + [ + 7 + ], + [ + 0 + ], + 244 + ], + [ + 1, + [ + 7 + ], + [ + 2 + ], + 245 + ], + [ + 1, + [ + 8 + ], + [ + 0 + ], + 5 + ], + [ + 1, + [ + 0 + ], + [ + 8 + ], + 5 + ], + [ + 2, + [ + 3, + 5 + ], + [ + 3, + 5 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 4, + 6 + ], + 4 + ], + [ + 2, + [ + 4, + 6 + ], + [ + 3, + 5 + ], + 12 + ], + [ + 2, + [ + 0 + ], + [ + 0 + ], + 4 + ], + [ + 3, + [ + 0 + ], + [ + 0 + ], + 31 + ], + [ + 3, + [ + 0 + ], + [ + 3, + 5 + ], + 10 + ], + [ + 3, + [ + 0 + ], + [ + 4, + 6 + ], + 8 + ], + [ + 3, + [ + 0 + ], + [ + 1 + ], + 23 + ], + [ + 3, + [ + 1 + ], + [ + 0 + ], + 8 + ], + [ + 3, + [ + 3 + ], + [ + 1 + ], + 4 + ], + [ + 3, + [ + 4 + ], + [ + 1 + ], + 17 + ], + [ + 3, + [ + 5 + ], + [ + 0 + ], + 12 + ], + [ + 3, + [ + 6 + ], + [ + 0 + ], + 25 + ], + [ + 3, + [ + 8 + ], + [ + 0 + ], + 111 + ], + [ + 3, + [ + 0 + ], + [ + 8 + ], + 32 + ], + [ + 3, + [ + 1 + ], + [ + 8 + ], + 9 + ] + ] + } + } + ] + } +} \ No newline at end of file diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml 
b/configs/ramulator2_configs/HBM2_TPUv3.yaml index 01cab613..50a3ea3b 100644 --- a/configs/ramulator2_configs/HBM2_TPUv3.yaml +++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml @@ -44,15 +44,15 @@ ] }, "timing": [ - 2000, + 1880, 2, - 14, - 14, - 12, - 14, - 34, - 48, - 16, + 13, + 13, + 11, + 13, + 31, + 44, + 15, 5, 5, 2, @@ -62,15 +62,15 @@ 6, 8, 15, - 350, - 160, + 329, + 151, 8, - 3900, - 122, - 1000 + 3666, + 115, + 1064 ], "channel_width": 64, - "read_latency": 16, + "read_latency": 15, "timing_constraints": [ [ 0, @@ -144,7 +144,7 @@ 4, 6 ], - 13 + 12 ], [ 1, @@ -176,7 +176,7 @@ [ 2 ], - 23 + 22 ], [ 1, @@ -207,7 +207,7 @@ [ 2 ], - 35 + 32 ], [ 1, @@ -217,7 +217,7 @@ [ 0 ], - 13 + 12 ], [ 1, @@ -227,7 +227,7 @@ [ 7 ], - 49 + 45 ], [ 1, @@ -238,7 +238,7 @@ [ 7 ], - 14 + 13 ], [ 1, @@ -248,7 +248,7 @@ [ 7 ], - 19 + 18 ], [ 1, @@ -258,7 +258,7 @@ [ 7 ], - 37 + 35 ], [ 1, @@ -268,7 +268,7 @@ [ 0 ], - 349 + 328 ], [ 1, @@ -278,7 +278,7 @@ [ 2 ], - 350 + 329 ], [ 1, @@ -354,7 +354,7 @@ [ 0 ], - 48 + 44 ], [ 3, @@ -365,7 +365,7 @@ 3, 5 ], - 15 + 14 ], [ 3, @@ -376,7 +376,7 @@ 4, 6 ], - 13 + 12 ], [ 3, @@ -386,7 +386,7 @@ [ 1 ], - 35 + 32 ], [ 3, @@ -396,7 +396,7 @@ [ 0 ], - 13 + 12 ], [ 3, @@ -416,7 +416,7 @@ [ 1 ], - 23 + 22 ], [ 3, @@ -426,7 +426,7 @@ [ 0 ], - 18 + 17 ], [ 3, @@ -436,7 +436,7 @@ [ 0 ], - 36 + 34 ], [ 3, @@ -446,7 +446,7 @@ [ 0 ], - 159 + 150 ], [ 3, @@ -456,7 +456,7 @@ [ 8 ], - 49 + 45 ], [ 3, @@ -466,7 +466,7 @@ [ 8 ], - 14 + 13 ] ] } diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py index d27cd6de..1c630e5c 100644 --- a/configs/ramulator2_configs/gen_configs.py +++ b/configs/ramulator2_configs/gen_configs.py @@ -83,8 +83,11 @@ def gen_hbm2(): def gen_hbm2_tpuv3(): - # TPUv3 HBM2: 900MHz → ~1.8 Gbps. 
Closest available preset: HBM2_2000Mbps - dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps") + dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_1880Mbps") + return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel") + +def gen_hbm2_tpuv2(): + dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_1400Mbps") return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel") def gen_ddr4(): @@ -107,6 +110,7 @@ def gen_lpddr5x(): CONFIGS = { "HBM2.yaml": gen_hbm2, "HBM2_TPUv3.yaml": gen_hbm2_tpuv3, + "HBM2_TPUv2.yaml": gen_hbm2_tpuv2, "DDR4.yaml": gen_ddr4, "LPDDR5.yaml": gen_lpddr5, "LPDDR5X.yaml": gen_lpddr5x, diff --git a/configs/ramulator_configs/ALDRAM-config.cfg b/configs/ramulator_configs/ALDRAM-config.cfg deleted file mode 100644 index 91cef49c..00000000 --- a/configs/ramulator_configs/ALDRAM-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = ALDRAM - channels = 1 - ranks = 1 - speed = ALDRAM_1600K - org = ALDRAM_4Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/DDR3-config.cfg b/configs/ramulator_configs/DDR3-config.cfg deleted file mode 100644 index 777f6b58..00000000 --- a/configs/ramulator_configs/DDR3-config.cfg +++ /dev/null @@ -1,31 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = DDR3 - channels = 1 - ranks = 1 - speed = DDR3_1600K - org = DDR3_2Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 -# warmup_insts = 100000000 - warmup_insts = 0 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/DDR4-config.cfg b/configs/ramulator_configs/DDR4-config.cfg deleted file mode 100644 index 3f2cd4fd..00000000 --- a/configs/ramulator_configs/DDR4-config.cfg +++ /dev/null @@ -1,31 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = DDR4 - channels = 2 - ranks = 1 - speed = DDR4_3200 - org = DDR4_4Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 8 - mem_tick = 3 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 -# warmup_insts = 100000000 - warmup_insts = 0 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/DSARP-config.cfg b/configs/ramulator_configs/DSARP-config.cfg deleted file mode 100644 index b67c067c..00000000 --- a/configs/ramulator_configs/DSARP-config.cfg +++ /dev/null @@ -1,31 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = DSARP - subarrays = 8 - channels = 1 - ranks = 1 - speed = DSARP_1333 - org = DSARP_8Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/GDDR5-config.cfg b/configs/ramulator_configs/GDDR5-config.cfg deleted file mode 100644 index 96006841..00000000 --- a/configs/ramulator_configs/GDDR5-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = GDDR5 - channels = 1 - ranks = 1 - speed = GDDR5_6000 - org = GDDR5_8Gb_x16 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 2 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config.cfg b/configs/ramulator_configs/HBM-config.cfg deleted file mode 100644 index 9e1dcb9e..00000000 --- a/configs/ramulator_configs/HBM-config.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 32 - ranks = 1 - speed = HBM_2Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FRFCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg deleted file mode 100644 index b8318c23..00000000 --- a/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = ChRaBaRoCo - scheduler = FRFCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_FCFS.cfg b/configs/ramulator_configs/HBM-config_FCFS.cfg deleted file mode 100644 index cd9aa1e5..00000000 --- a/configs/ramulator_configs/HBM-config_FCFS.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/configs/ramulator_configs/HBM-config_FRFCFS.cfg deleted file mode 100644 index f08d705f..00000000 --- a/configs/ramulator_configs/HBM-config_FRFCFS.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FRFCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg deleted file mode 100644 index 52a68486..00000000 --- a/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FRFCFS_Cap -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg deleted file mode 100644 index 55d9f4e7..00000000 --- a/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FRFCFS_PriorHit -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg deleted file mode 100644 index f08d705f..00000000 --- a/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoBaRaCoCh - scheduler = FRFCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg deleted file mode 100644 index 648e9ab4..00000000 --- a/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg +++ /dev/null @@ -1,32 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 8 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None - mapping = RoCoBaRaCh - scheduler = FRFCFS -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBMx0.5ch-config.cfg b/configs/ramulator_configs/HBMx0.5ch-config.cfg deleted file mode 100644 index 064c8291..00000000 --- a/configs/ramulator_configs/HBMx0.5ch-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 4 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/HBMx2ch-config.cfg b/configs/ramulator_configs/HBMx2ch-config.cfg deleted file mode 100644 index 17635ad0..00000000 --- a/configs/ramulator_configs/HBMx2ch-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = HBM - channels = 16 - ranks = 1 - speed = HBM_1Gbps - org = HBM_4Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 32 - mem_tick = 5 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/LPDDR3-config.cfg b/configs/ramulator_configs/LPDDR3-config.cfg deleted file mode 100644 index b5618bc3..00000000 --- a/configs/ramulator_configs/LPDDR3-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = LPDDR3 - channels = 1 - ranks = 1 - speed = LPDDR3_1600 - org = LPDDR3_8Gb_x16 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/LPDDR4-config.cfg b/configs/ramulator_configs/LPDDR4-config.cfg deleted file mode 100644 index b74512c9..00000000 --- a/configs/ramulator_configs/LPDDR4-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = LPDDR4 - channels = 2 - ranks = 1 - speed = LPDDR4_2400 - org = LPDDR4_8Gb_x16 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 8 - mem_tick = 3 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/PCM-config.cfg b/configs/ramulator_configs/PCM-config.cfg deleted file mode 100644 index 1bd7fcce..00000000 --- a/configs/ramulator_configs/PCM-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = PCM - channels = 1 - ranks = 1 - speed = PCM_800D - org = PCM_2Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/SALP-config.cfg b/configs/ramulator_configs/SALP-config.cfg deleted file mode 100644 index 0e5a809a..00000000 --- a/configs/ramulator_configs/SALP-config.cfg +++ /dev/null @@ -1,31 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = SALP-MASA - subarrays = 8 - channels = 1 - ranks = 1 - speed = SALP_1600K - org = SALP_4Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/STTMRAM-config.cfg b/configs/ramulator_configs/STTMRAM-config.cfg deleted file mode 100644 index b689e514..00000000 --- a/configs/ramulator_configs/STTMRAM-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = STTMRAM - channels = 4 - ranks = 1 - speed = STT_1600_1_2 - org = STTMRAM_2Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/TLDRAM-config.cfg b/configs/ramulator_configs/TLDRAM-config.cfg deleted file mode 100644 index 0f7e06e9..00000000 --- a/configs/ramulator_configs/TLDRAM-config.cfg +++ /dev/null @@ -1,31 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = TLDRAM - subarrays = 16 - channels = 1 - ranks = 1 - speed = TLDRAM_1600K - org = TLDRAM_4Gb_x8 -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/WideIO-config.cfg b/configs/ramulator_configs/WideIO-config.cfg deleted file mode 100644 index 5270d3cb..00000000 --- a/configs/ramulator_configs/WideIO-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = WideIO - channels = 4 - ranks = 1 - speed = WideIO_266 - org = WideIO_8Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 4 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translation = None, Random (default value is None) -# -######################## diff --git a/configs/ramulator_configs/WideIO2-config.cfg b/configs/ramulator_configs/WideIO2-config.cfg deleted file mode 100644 index 324b78fe..00000000 --- a/configs/ramulator_configs/WideIO2-config.cfg +++ /dev/null @@ -1,30 +0,0 @@ -######################## -# Example config file -# Comments start with # -# There are restrictions for valid channel/rank numbers - standard = WideIO2 - channels = 8 - ranks = 1 - speed = WideIO2_1066 - org = WideIO2_8Gb -# record_cmd_trace: (default is off): on, off - record_cmd_trace = off -# print_cmd_trace: (default is off): on, off - print_cmd_trace = off - -### Below are parameters only for CPU trace - cpu_tick = 6 - mem_tick = 1 -### Below are parameters only for multicore mode -# When early_exit is on, all cores will be terminated when the earliest one finishes. - early_exit = on -# early_exit = on, off (default value is on) -# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
- expected_limit_insts = 200000000 - warmup_insts = 100000000 - cache = no -# cache = no, L1L2, L3, all (default value is no) - translation = None -# translatino = None, Random (default value is None) -# -######################## diff --git a/configs/stonne_big_c1_simple_noc.yml b/configs/stonne_big_c1_simple_noc.yml index b14838c8..9bbfd6df 100644 --- a/configs/stonne_big_c1_simple_noc.yml +++ b/configs/stonne_big_c1_simple_noc.yml @@ -10,8 +10,6 @@ num_stonne_port: 64 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 8 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycless: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/stonne_single_c1_simple_noc.yml b/configs/stonne_single_c1_simple_noc.yml index 0ed7962c..d1087301 100644 --- a/configs/stonne_single_c1_simple_noc.yml +++ b/configs/stonne_single_c1_simple_noc.yml @@ -10,8 +10,6 @@ num_stonne_port: 8 dram_type: ramulator2 dram_freq_mhz: 700 dram_channels: 8 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml index 08149005..fb07eb6a 100644 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml @@ -9,8 +9,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 700 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml index 12304ce2..f830419b 100644 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 
dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml index aec29ff8..6277cc39 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml @@ -9,8 +9,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 700 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycless: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml index 72873f1c..ff976784 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml index c2e962e3..2ed1bb12 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 8 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml index a7607108..1bcc9bb3 100644 --- 
a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml index 0415876d..3328cf77 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 1200 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml l2d_type: datacache diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml index e411c0f3..bf01913b 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml index f164b108..8c71c528 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml @@ -9,13 +9,10 @@ vpu_spad_size_kb_per_lane: 128 vpu_vector_length_bits: 256 dram_type: ramulator2 -dram_freq: 940 +dram_freq_mhz: 940 dram_channels: 8 -dram_req_size: 32 -dram_latency: 10 -dram_nbl: 2 -dram_print_interval: 10000 
-dram_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +dram_stats_print_period_cycles: 10000 +ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: booksim2 icnt_latency_cycles: 10 diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml index e38f091f..d058f188 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 dram_num_partitions: 2 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml index 57696243..019a0f0f 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 dram_num_partitions: 1 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml index f0686055..918510d8 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml @@ -9,8 +9,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 700 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml index 511a5a09..a0985aec 100644 --- 
a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml index ce2d932d..166e2e25 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml @@ -13,8 +13,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml index 499ad823..6119e83d 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml index da40f01e..9e87511f 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 1200 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: 
../configs/ramulator2_configs/HBM2.yaml l2d_type: datacache diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml index 6fd305f9..f46d380e 100644 --- a/configs/systolic_ws_8x8_c1_booksim.yml +++ b/configs/systolic_ws_8x8_c1_booksim.yml @@ -9,8 +9,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 800 dram_channels: 1 -dram_req_size_byte: 64 -dram_num_burst_length: 4 dram_stats_print_period_cycles: 100000 ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml index 274f633c..1be24b85 100644 --- a/configs/systolic_ws_8x8_c1_simple_noc.yml +++ b/configs/systolic_ws_8x8_c1_simple_noc.yml @@ -9,8 +9,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 800 dram_channels: 1 -dram_req_size_byte: 64 -dram_num_burst_length: 4 dram_stats_print_period_cycles: 100000 ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml index 72873f1c..ff976784 100644 --- a/tutorial/session1/togsim_configs/togsim_config.yml +++ b/tutorial/session1/togsim_configs/togsim_config.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml index 3b9b8fc8..a3a4ab93 100644 --- a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml +++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 -dram_req_size_byte: 32 -dram_num_burst_length: 2 
dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml index 2726736a..1ec99521 100644 --- a/tutorial/session1/togsim_configs/togsim_config_autotune.yml +++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml index 468a0b44..58c8165d 100644 --- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml +++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml index a1f1b432..b53ca4e0 100644 --- a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml +++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml 
b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml index 62d627a6..e47b63eb 100644 --- a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml +++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml index 0024c073..24017861 100644 --- a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml +++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml @@ -10,8 +10,6 @@ vpu_vector_length_bits: 256 dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 -dram_req_size_byte: 32 -dram_num_burst_length: 2 dram_stats_print_period_cycles: 10000 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml From 67d87ce3e87825164521823e4228058bc55b75da Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 21 Apr 2026 23:30:34 +0900 Subject: [PATCH 178/194] [TOGSim/Log] Improve simulator log clarity and wording --- TOGSim/src/Common.cc | 8 ++++---- TOGSim/src/Core.cc | 24 ++++++++++++------------ TOGSim/src/Dram.cc | 20 +++++++++++--------- TOGSim/src/Simulator.cc | 15 +++++++-------- TOGSim/src/main.cc | 11 ++++++----- 5 files changed, 40 insertions(+), 38 deletions(-) diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index ede991c8..ccb30760 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -7,7 +7,7 @@ bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) { try { config_yaml = YAML::LoadFile(config_path); - spdlog::info("[LoadConfig] Success to open \"{}\"", config_path); + spdlog::info("[LoadConfig] Loaded configuration file \"{}\"", 
config_path); return true; } catch (const YAML::BadFile& e) { spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path); @@ -159,16 +159,16 @@ SimulationConfig initialize_config(const YAML::Node& config, if (config["partition"][core_partition]) { uint32_t partition_id = config["partition"][core_partition].as(); parsed_config.partiton_map[i] = partition_id; - spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id); + spdlog::info("[Config/Core] core_id: {}, partition_id: {}", i, partition_id); } else { - spdlog::warn("[Config/Core] CPU {}: Partition key not found, defaulting to 0", i); + spdlog::warn("[Config/Core] core_id: {}, partition: missing in config, using partition_id 0", i); parsed_config.partiton_map[i] = 0; } } } else { for (int i=0; i(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id, i, sa_utilization.at(i), _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i)); float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle - spdlog::info("Core [{}] : DMA active_cycles, {} DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); - spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id, + spdlog::info("Core [{}] : DMA active_cycles: {}, DMA idle_cycles: {}, DRAM BW: {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response); + spdlog::info("Core [{}] : Vector unit utilization(%): {:.2f}, active cycle: {}, idle_cycle: {}", _id, 
static_cast(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle); spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access); - spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles: {}", _id, _core_cycle); } void Core::print_current_stats() { @@ -485,12 +485,12 @@ void Core::print_current_stats() { spdlog::info("========= Core stat ========="); for (int i=0; i<_num_systolic_array_per_core; i++) - spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i), + spdlog::info("Core [{}] : Systolic array [{}] utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id, i, sa_utilization.at(i), _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i)); - spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); - spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, + spdlog::info("Core [{}] : DMA active_cycles: {}, DMA idle_cycles: {}, DRAM BW: {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response); + spdlog::info("Core [{}] : Vector unit Utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id, static_cast(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle); - spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle); + spdlog::info("Core [{}] : Total_cycles: {}", _id, _core_cycle); update_stats(); } diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc index 798acb7b..5211ef47 100644 --- a/TOGSim/src/Dram.cc +++ b/TOGSim/src/Dram.cc @@ -151,8 +151,6 @@ void 
DramRamulator2::apply_ramulator_config_to_simulation_config( "value, or align the Ramulator YAML with the top-level yml. ramulator_config_path={}", *dram_freq_mhz_stated, cfg.dram_freq_mhz, static_cast(tck_ns), ramulator_config_path)); } - spdlog::info("[Config/DRAM] ramulator2: dram_freq_mhz {} matches Ramulator-derived DRAM clock (tCK={:.6g} ns)", - *dram_freq_mhz_stated, static_cast(tck_ns)); } } @@ -200,7 +198,8 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) { _config = config; _tx_log2 = static_cast(std::log2(_req_size)); - spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size); + spdlog::info("[Config/DRAM] Total bandwidth {:.2f} GB/s, {} MHz, {} channels, {} bytes per request", + static_cast(config.max_dram_bandwidth()), config.dram_freq_mhz, _n_ch, _req_size); /* Initialize DRAM Channels */ for (int ch = 0; ch < _n_ch; ch++) { m_to_crossbar_queue.push_back(std::queue()); @@ -288,13 +287,15 @@ void DramRamulator2::cycle() { const DramBwSnapshot bw = make_dram_bw_snapshot( r + wtxn, w, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::trace( - "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)", + "[DRAM] channel {} | {:.2f} GB/s avg., {:.2f}% of utilization | {} reads, {} writes " + "(interval {} cycles)", ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w); } const DramBwSnapshot bw_all = make_dram_bw_snapshot( r_all + w_all, w, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( - "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)", + "[DRAM] all {} channels combined | {:.2f} GB/s aggregate, {:.2f}% of utilization (avg. 
per channel) | " + "{} reads, {} writes (interval {} cycles)", _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w); for (int ch = 0; ch < _n_ch; ch++) { _mem[ch]->reset_interval_bw_counters(); @@ -333,7 +334,7 @@ void DramRamulator2::pop(uint32_t cid) { } void DramRamulator2::print_stat() { - spdlog::info("========= DRAM stat ========="); + spdlog::info("=== DRAM statistics ==="); if (_n_ch == 0) return; @@ -352,7 +353,7 @@ void DramRamulator2::print_stat() { if (cycles == 0) return; const double f_mhz = static_cast(_config.dram_freq_mhz); - spdlog::info("[DRAM] per-channel avg BW"); + spdlog::info("[DRAM] Per-channel average bandwidth"); long long tr_all = 0; long long tw_all = 0; for (int ch = 0; ch < _n_ch; ch++) { @@ -363,13 +364,14 @@ void DramRamulator2::print_stat() { const DramBwSnapshot bw = make_dram_bw_snapshot( tr + tw, cycles, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( - "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes", + "[DRAM] channel {} | {:.2f} GB/s avg., {:.2f}% of utilization | {} reads, {} writes", ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw); } const DramBwSnapshot bw_all = make_dram_bw_snapshot( tr_all + tw_all, cycles, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel); spdlog::info( - "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes", + "[DRAM] channels 0..{} combined | {:.2f} GB/s aggregate, {:.2f}% of utilization (avg. 
per channel) | " + "{} reads, {} writes", _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all); } diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index 9bd3407f..eb3b8670 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -25,11 +25,11 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml) _cores.resize(_n_cores); for (int core_index = 0; core_index < _n_cores; core_index++) { if (config.core_type[core_index] == CoreType::WS_MESH) { - spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}", - core_index, config.core_freq_mhz, config.num_systolic_array_per_core); + spdlog::info("[Config/Core] Core {}: core_freq_mhz: {}, systolic_arrays_per_core: {}", + core_index, config.core_freq_mhz, config.num_systolic_array_per_core); _cores.at(core_index) = std::make_unique(core_index, _config); } else if(config.core_type[core_index] == CoreType::STONNE) { - spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz); + spdlog::info("[Config/Core] Core {}: core_freq_mhz: {}, core_type: Stonne", core_index, config.core_freq_mhz); _cores.at(core_index) = std::make_unique(core_index, _config); } else { throw std::runtime_error(fmt::format("Not implemented Core type {} ", @@ -46,8 +46,7 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml) .string(); spdlog::info("[Config/DRAM] Ramulator2 config path: {}", ramulator_config); YAML::Node dram_config = YAML::LoadFile(ramulator_config); - spdlog::info("Ramulator2 config: "); - std::cout << dram_config << std::endl; + spdlog::info("[Config/DRAM] Ramulator2 configuration:\n{}", YAML::Dump(dram_config)); config.dram_config_path = ramulator_config; _dram = std::make_unique(config, &_core_cycles); } else { @@ -56,12 +55,12 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml) } // Create interconnect object - spdlog::info("[Config/Interconnect] 
Interconnect freq: {} MHz", config.icnt_freq_mhz); + spdlog::info("[Config/Interconnect] interconnect_freq_mhz: {}", config.icnt_freq_mhz); if (config.icnt_type == IcntType::SIMPLE) { - spdlog::info("[Config/Interconnect] SimpleInerconnect selected"); + spdlog::info("[Config/Interconnect] Simple interconnect selected"); _icnt = std::make_unique(config); } else if (config.icnt_type == IcntType::BOOKSIM2) { - spdlog::info("[Config/Interconnect] BookSim2 selected"); + spdlog::info("[Config/Interconnect] BookSim2 interconnect selected"); _icnt = std::make_unique(config); } else { spdlog::error("[Configuration] Invalid interconnect type...!"); diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index f985bdf4..010826ef 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -13,13 +13,14 @@ namespace fs = std::filesystem; namespace po = boost::program_options; -void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partiton_id=0, int device_id=0) { +void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partition_id=0, int device_id=0) { auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml); std::unique_ptr& tile_graph = graph_praser.get_tile_graph(); tile_graph->set_arrival_time(request_time ? 
request_time : simulator->get_core_cycle()); tile_graph->set_kernel_id(kernel_id); - spdlog::info("[Scheduler {}] Enqueued kernel id: {}, tog_path: {}, operation: {}, request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); - simulator->enqueue_graph(partiton_id, std::move(tile_graph)); + spdlog::info("[Scheduler {}] Enqueued kernel_id: {}, tog_path: {}, operation: {}, request_time_cycles: {}", + partition_id, kernel_id, onnx_path, tile_graph->get_name(), request_time); + simulator->enqueue_graph(partition_id, std::move(tile_graph)); } void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) { @@ -30,7 +31,7 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const spdlog::error("[TOGSim] Failed to open trace file: {}", trace_file_path); return; } - spdlog::info("[TOGSim] Reading from trace file: {}", trace_file_path); + spdlog::info("[TOGSim] Reading trace file: {}", trace_file_path); // Read all available commands and process them std::string line; @@ -123,7 +124,7 @@ int main(int argc, char** argv) { if (i > 0) cmd_oss << " "; cmd_oss << argv[i]; } - spdlog::info("[TOGSim] Run command: {}", cmd_oss.str()); + spdlog::info("[TOGSim] Command line: {}", cmd_oss.str()); std::string level = "info"; cmd_parser.set_if_defined("log_level", &level); From 28745d641c55a1ed9c72991d9c1241712f9e68d8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 21 Apr 2026 23:41:12 +0900 Subject: [PATCH 179/194] [Tutorial] Update session2 jupyter notebook --- tutorial/session2/Hands_on.ipynb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb index 9a7c35e3..a2e6899f 100644 --- a/tutorial/session2/Hands_on.ipynb +++ b/tutorial/session2/Hands_on.ipynb @@ -37,18 +37,20 @@ "\n", "device = torch.device(\"npu:0\")\n", "\n", - "def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):\n", 
- " if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", + "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", " message = f\"|{name} Test Passed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu().reshape(-1)[:5])\n", + " print(\"cpu out: \", cpu_out.reshape(-1)[:5])\n", " else:\n", " message = f\"|{name} Test Failed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", - " print(\"custom out: \", out.cpu())\n", + " print(\"npu out: \", npu_out.cpu())\n", " print(\"cpu out: \", cpu_out)\n", " exit(1)\n", "\n", @@ -91,6 +93,8 @@ } ], "source": [ + "# os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\" \n", + "\n", "input = torch.randn(16, 16)\n", "npu_x = input.to(device=device)\n", "cpu_x = input.to(\"cpu\")\n", From 3cdfb7c352b4d53ba99efe872daaf0dca39dbf0c Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Tue, 21 Apr 2026 23:56:02 +0900 Subject: [PATCH 180/194] [Tutorial] Fix ramulator config path --- tutorial/session1/togsim_configs/togsim_config.yml | 2 +- tutorial/session1/togsim_configs/togsim_config_2_cores.yml | 2 +- tutorial/session1/togsim_configs/togsim_config_autotune.yml | 2 +- .../session1/togsim_configs/togsim_config_external_mapping.yml | 2 +- .../session1/togsim_configs/togsim_config_functional_only.yml | 2 +- .../togsim_configs/togsim_config_no_compiler_optimization.yml | 2 +- tutorial/session1/togsim_configs/togsim_config_timing_only.yml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml index ff976784..eb23c833 100644 --- a/tutorial/session1/togsim_configs/togsim_config.yml +++ b/tutorial/session1/togsim_configs/togsim_config.yml @@ -11,7 +11,7 
@@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml index a3a4ab93..09be00fe 100644 --- a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml +++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 32 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml index 1ec99521..669c592f 100644 --- a/tutorial/session1/togsim_configs/togsim_config_autotune.yml +++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml index 58c8165d..485956bb 100644 --- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml +++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: 
../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml index b53ca4e0..990b955c 100644 --- a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml +++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml index e47b63eb..f56ab6f1 100644 --- a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml +++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml +ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml index 24017861..ad4fb90e 100644 --- a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml +++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml @@ -11,7 +11,7 @@ dram_type: ramulator2 dram_freq_mhz: 940 dram_channels: 16 dram_stats_print_period_cycles: 10000 -ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml 
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml icnt_type: simple icnt_latency_cycles: 10 From 46b8e3df8a96cd7b9dca6169814925f4b3d27c32 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 22 Apr 2026 03:13:48 +0000 Subject: [PATCH 181/194] [Tutorial] Clean up session1 notebooks - Remove cell execution timestamps from metadata - Simplify path setup: remove base_dir/sys.path.append, use absolute paths - Replace extension_config.CONFIG_TOGSIM_CONFIG with direct config paths - Update log file paths to latest run timestamps - Adjust tensor sizes and minor wording fixes --- tutorial/session1/CompilerOptimization.ipynb | 68 ++------ tutorial/session1/DNNServing.ipynb | 39 +---- tutorial/session1/ExecutionMode.ipynb | 158 ++----------------- tutorial/session1/Inference.ipynb | 35 ++-- tutorial/session1/LogAnalysis.ipynb | 35 +--- tutorial/session1/Mapping.ipynb | 94 +++-------- tutorial/session1/TOGSimConfig.ipynb | 97 ++++++++++++ tutorial/session1/Training.ipynb | 87 +++------- 8 files changed, 183 insertions(+), 430 deletions(-) create mode 100644 tutorial/session1/TOGSimConfig.ipynb diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb index 6c23bfec..f8eea728 100644 --- a/tutorial/session1/CompilerOptimization.ipynb +++ b/tutorial/session1/CompilerOptimization.ipynb @@ -10,23 +10,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:34:23.862488Z", - "iopub.status.busy": "2026-04-16T10:34:23.862221Z", - "iopub.status.idle": "2026-04-16T10:34:26.839597Z", - "shell.execute_reply": "2026-04-16T10:34:26.838615Z", - "shell.execute_reply.started": "2026-04-16T10:34:23.862467Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)\n", - 
"os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"" + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"" ] }, { @@ -39,15 +28,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:34:26.840859Z", - "iopub.status.busy": "2026-04-16T10:34:26.840581Z", - "iopub.status.idle": "2026-04-16T10:34:46.109858Z", - "shell.execute_reply": "2026-04-16T10:34:46.108862Z", - "shell.execute_reply.started": "2026-04-16T10:34:26.840841Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n", @@ -66,43 +47,28 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:41:01.000313Z", - "iopub.status.busy": "2026-04-16T10:41:00.999980Z", - "iopub.status.idle": "2026-04-16T10:41:01.273172Z", - "shell.execute_reply": "2026-04-16T10:41:01.272081Z", - "shell.execute_reply.started": "2026-04-16T10:41:01.000290Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_103442_5281e75b.log | grep \"Total execution cycle\"" + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Disable fusion" + "### Disabling fusion" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:44:29.448759Z", - "iopub.status.busy": "2026-04-16T10:44:29.448400Z", - "iopub.status.idle": "2026-04-16T10:44:41.303261Z", - "shell.execute_reply": "2026-04-16T10:44:41.302462Z", - "shell.execute_reply.started": "2026-04-16T10:44:29.448732Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n", - 
"os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n", + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -117,19 +83,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:46:37.996794Z", - "iopub.status.busy": "2026-04-16T10:46:37.996476Z", - "iopub.status.idle": "2026-04-16T10:46:38.497173Z", - "shell.execute_reply": "2026-04-16T10:46:38.496104Z", - "shell.execute_reply.started": "2026-04-16T10:46:37.996776Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_104436_000cb9bc.log | grep \"Total execution cycle\"\n", - "!cat /workspace/PyTorchSim/togsim_results/20260416_104440_e50cdae1.log | grep \"Total execution cycle\"" + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"\n", + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" ] }, { diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb index 0b4e0837..f7f2ea4d 100644 --- a/tutorial/session1/DNNServing.ipynb +++ b/tutorial/session1/DNNServing.ipynb @@ -10,22 +10,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T11:17:18.018872Z", - "iopub.status.busy": "2026-04-16T11:17:18.018643Z", - "iopub.status.idle": "2026-04-16T11:17:20.890421Z", - "shell.execute_reply": "2026-04-16T11:17:20.889693Z", - "shell.execute_reply.started": "2026-04-16T11:17:18.018853Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", - "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)" 
+ "import os" ] }, { @@ -38,15 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T11:17:20.891167Z", - "iopub.status.busy": "2026-04-16T11:17:20.890953Z", - "iopub.status.idle": "2026-04-16T11:19:42.197046Z", - "shell.execute_reply": "2026-04-16T11:19:42.196023Z", - "shell.execute_reply.started": "2026-04-16T11:17:20.891152Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", @@ -55,7 +36,7 @@ "from PyTorchSimFrontend import extension_config\n", "\n", "device = torch.device(\"npu:0\")\n", - "config = extension_config.CONFIG_TOGSIM_CONFIG\n", + "config = \"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "\n", "model = resnet18().eval()\n", "input = torch.randn(1, 3, 224, 224).to(device=device)\n", @@ -81,14 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import torch\n", - "from torchvision.models import resnet18\n", - "from Simulator.simulator import TOGSimulator\n", - "from PyTorchSimFrontend import extension_config\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "from Scheduler.scheduler import poisson_request_generator\n", - "TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", "\n", "model0_lambda = 5.0\n", "max_time_msec = 1000.0\n", @@ -96,7 +70,7 @@ "target_model1 = resnet18().eval()\n", "\n", "device = torch.device(\"npu:0\")\n", - "config = extension_config.CONFIG_TOGSIM_CONFIG\n", + "config = \"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "opt_model0 = torch.compile(target_model1.to(device=device, memory_format=torch.channels_last), dynamic=False)\n", "\n", "events = []\n", @@ -104,9 +78,6 @@ "for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec):\n", " events.append((t, 0, opt_model0, (x,))) # stream_index 0 → queue / 
partition 0\n", "\n", - "events.sort(key=lambda e: e[0])\n", - "\n", - "\n", "with TOGSimulator(config_path=config):\n", " for t_msec, stream_index, model, args in events:\n", " torch.npu.launch_model(\n", diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb index bd7d7d73..9d0b051f 100644 --- a/tutorial/session1/ExecutionMode.ipynb +++ b/tutorial/session1/ExecutionMode.ipynb @@ -10,22 +10,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:56:08.883802Z", - "iopub.status.busy": "2026-04-16T05:56:08.883406Z", - "iopub.status.idle": "2026-04-16T05:56:11.858647Z", - "shell.execute_reply": "2026-04-16T05:56:11.857788Z", - "shell.execute_reply.started": "2026-04-16T05:56:08.883784Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", - "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)" + "import os" ] }, { @@ -38,21 +27,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:56:11.859394Z", - "iopub.status.busy": "2026-04-16T05:56:11.859139Z", - "iopub.status.idle": "2026-04-16T05:56:31.283787Z", - "shell.execute_reply": "2026-04-16T05:56:31.282907Z", - "shell.execute_reply.started": "2026-04-16T05:56:11.859372Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", + "input = torch.randn(512, 512).to(device=device)\n", + "weight = torch.randn(512, 512).to(device=device)\n", "\n", "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", "npu_out = opt_fn(input, weight)" @@ -62,57 +43,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Functional only mode" + "### Functional-only mode" ] }, { "cell_type": 
"code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:56:37.980561Z", - "iopub.status.busy": "2026-04-16T05:56:37.980194Z", - "iopub.status.idle": "2026-04-16T05:56:46.194881Z", - "shell.execute_reply": "2026-04-16T05:56:46.194059Z", - "shell.execute_reply.started": "2026-04-16T05:56:37.980534Z" - } - }, - "outputs": [], - "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n", - "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Timing only mode" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:56:46.195666Z", - "iopub.status.busy": "2026-04-16T05:56:46.195511Z", - "iopub.status.idle": "2026-04-16T05:56:49.736201Z", - "shell.execute_reply": "2026-04-16T05:56:49.735438Z", - "shell.execute_reply.started": "2026-04-16T05:56:46.195650Z" - } - }, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", + "os.environ['TOGSIM_CONFIG']=f\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n", "\n", - "input = torch.randn(1024, 1024).to(device=device)\n", - "weight = torch.randn(1024, 1024).to(device=device)\n", + "input = torch.randn(512, 512).to(device=device)\n", + "weight = torch.randn(512, 512).to(device=device)\n", "\n", "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", "npu_out = opt_fn(input, weight)" @@ -122,97 +65,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## TOGSim Configuration\n", - "### Single Core" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:59:18.661437Z", - "iopub.status.busy": "2026-04-16T05:59:18.661188Z", - "iopub.status.idle": "2026-04-16T05:59:53.388013Z", - "shell.execute_reply": "2026-04-16T05:59:53.387130Z", - "shell.execute_reply.started": "2026-04-16T05:59:18.661408Z" - } - }, - "outputs": [], - "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", - "\n", - "input = torch.randn(2048, 2048).to(device=device)\n", - "weight = torch.randn(2048, 2048).to(device=device)\n", - "\n", - "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", - "npu_out = opt_fn(input, weight)" + "### Timing-only mode" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T06:00:06.720227Z", - "iopub.status.busy": "2026-04-16T06:00:06.719962Z", - "iopub.status.idle": "2026-04-16T06:00:06.979872Z", - "shell.execute_reply": "2026-04-16T06:00:06.978988Z", - "shell.execute_reply.started": "2026-04-16T06:00:06.720210Z" - } - }, - "outputs": [], - "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_055926_3c61ae14.log | grep \"Total execution cycle\"" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Multi-Core" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T06:01:00.604737Z", - "iopub.status.busy": "2026-04-16T06:01:00.604494Z", - "iopub.status.idle": "2026-04-16T06:01:34.826968Z", - "shell.execute_reply": "2026-04-16T06:01:34.826043Z", - "shell.execute_reply.started": "2026-04-16T06:01:00.604717Z" - } - }, "outputs": [], "source": [ - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n", + "os.environ['TOGSIM_CONFIG']=f\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "\n", - "input = 
torch.randn(2048, 2048).to(device=device)\n", - "weight = torch.randn(2048, 2048).to(device=device)\n", + "input = torch.randn(512, 512).to(device=device)\n", + "weight = torch.randn(512, 512).to(device=device)\n", "\n", "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", "npu_out = opt_fn(input, weight)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T06:01:53.294075Z", - "iopub.status.busy": "2026-04-16T06:01:53.293728Z", - "iopub.status.idle": "2026-04-16T06:01:53.549156Z", - "shell.execute_reply": "2026-04-16T06:01:53.548315Z", - "shell.execute_reply.started": "2026-04-16T06:01:53.294047Z" - } - }, - "outputs": [], - "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_060100_05df9481.log | grep \"Total execution cycle\"" - ] - }, { "cell_type": "code", "execution_count": null, @@ -223,7 +93,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb index caa5924e..18325d80 100644 --- a/tutorial/session1/Inference.ipynb +++ b/tutorial/session1/Inference.ipynb @@ -11,22 +11,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:42:44.479626Z", - "iopub.status.busy": "2026-04-16T05:42:44.479480Z", - "iopub.status.idle": "2026-04-16T05:42:47.646477Z", - "shell.execute_reply": "2026-04-16T05:42:47.645578Z", - "shell.execute_reply.started": "2026-04-16T05:42:44.479609Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "import torch\n", - "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)" + "import torch" ] }, { @@ -39,15 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - 
"iopub.execute_input": "2026-04-16T05:42:47.968708Z", - "iopub.status.busy": "2026-04-16T05:42:47.968420Z", - "iopub.status.idle": "2026-04-16T05:42:49.772696Z", - "shell.execute_reply": "2026-04-16T05:42:49.771704Z", - "shell.execute_reply.started": "2026-04-16T05:42:47.968688Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", @@ -90,13 +70,16 @@ "outputs": [], "source": [ "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", + " torch.set_printoptions(edgeitems=3)\n", " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", - " message = f\"|{name} Test Passed|\"\n", + " message = f\"|{name} Functionality Test Passed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu()[0, :5])\n", + " print(\"cpu out: \", cpu_out[0, :5])\n", " else:\n", - " message = f\"|{name} Test Failed|\"\n", + " message = f\"|{name} Functionality Test Failed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", @@ -124,7 +107,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb index 5cd14f41..9b393384 100644 --- a/tutorial/session1/LogAnalysis.ipynb +++ b/tutorial/session1/LogAnalysis.ipynb @@ -10,23 +10,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:00:05.422374Z", - "iopub.status.busy": "2026-04-16T10:00:05.422205Z", - "iopub.status.idle": "2026-04-16T10:00:08.512084Z", - "shell.execute_reply": "2026-04-16T10:00:08.511285Z", - "shell.execute_reply.started": "2026-04-16T10:00:05.422359Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import torch\n", "import os\n", - "import sys\n", - "base_dir = 
os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")" ] }, @@ -40,15 +29,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:00:46.974212Z", - "iopub.status.busy": "2026-04-16T10:00:46.973814Z", - "iopub.status.idle": "2026-04-16T10:00:52.152064Z", - "shell.execute_reply": "2026-04-16T10:00:52.151231Z", - "shell.execute_reply.started": "2026-04-16T10:00:46.974195Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", @@ -70,15 +51,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T10:25:36.625640Z", - "iopub.status.busy": "2026-04-16T10:25:36.625388Z", - "iopub.status.idle": "2026-04-16T10:25:40.123959Z", - "shell.execute_reply": "2026-04-16T10:25:40.123131Z", - "shell.execute_reply.started": "2026-04-16T10:25:36.625622Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "os.environ['TOGSIM_DEBUG_LEVEL']=\"trace\"\n", diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb index 92ddd5a8..d463c287 100644 --- a/tutorial/session1/Mapping.ipynb +++ b/tutorial/session1/Mapping.ipynb @@ -10,22 +10,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:49:05.540163Z", - "iopub.status.busy": "2026-04-16T05:49:05.539948Z", - "iopub.status.idle": "2026-04-16T05:49:08.550103Z", - "shell.execute_reply": "2026-04-16T05:49:08.549146Z", - "shell.execute_reply.started": "2026-04-16T05:49:05.540146Z" - } - }, + "metadata": {}, "outputs": [], 
"source": [ "import torch\n", - "import os\n", - "import sys\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)" + "import os" ] }, { @@ -38,15 +27,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:49:08.550908Z", - "iopub.status.busy": "2026-04-16T05:49:08.550691Z", - "iopub.status.idle": "2026-04-16T05:49:28.225867Z", - "shell.execute_reply": "2026-04-16T05:49:28.225051Z", - "shell.execute_reply.started": "2026-04-16T05:49:08.550893Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"npu:0\")\n", @@ -61,45 +42,30 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:49:44.788982Z", - "iopub.status.busy": "2026-04-16T05:49:44.788640Z", - "iopub.status.idle": "2026-04-16T05:49:45.048201Z", - "shell.execute_reply": "2026-04-16T05:49:45.047229Z", - "shell.execute_reply.started": "2026-04-16T05:49:44.788954Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_054924_5e1428f9.log | grep \"Total execution cycle\"" + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Manual Mapping\n", - "User can set tile size manually." + "### External Mapping\n", + "User can set tile size manually from external file." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:49:53.216985Z", - "iopub.status.busy": "2026-04-16T05:49:53.216635Z", - "iopub.status.idle": "2026-04-16T05:50:11.043854Z", - "shell.execute_reply": "2026-04-16T05:50:11.042989Z", - "shell.execute_reply.started": "2026-04-16T05:49:53.216960Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "torch._dynamo.reset()\n", "\n", - "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n", + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -111,18 +77,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:50:18.200344Z", - "iopub.status.busy": "2026-04-16T05:50:18.200118Z", - "iopub.status.idle": "2026-04-16T05:50:18.456838Z", - "shell.execute_reply": "2026-04-16T05:50:18.455901Z", - "shell.execute_reply.started": "2026-04-16T05:50:18.200327Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_055004_6ef0f564.log | grep \"Total execution cycle\"" + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" ] }, { @@ -135,20 +94,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T11:22:40.778257Z", - "iopub.status.busy": "2026-04-16T11:22:40.777947Z", - "iopub.status.idle": "2026-04-16T11:23:10.573193Z", - "shell.execute_reply": "2026-04-16T11:23:10.572225Z", - "shell.execute_reply.started": "2026-04-16T11:22:40.778230Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "torch._dynamo.reset()\n", - "\n", - 
"os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n", + "os.environ[\"TORCHINDUCTOR_CACHE_DIR\"]=os.path.join(os.getcwd(), \"autotune\")\n", + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n", "\n", "input = torch.randn(1024, 1024).to(device=device)\n", "weight = torch.randn(1024, 1024).to(device=device)\n", @@ -160,18 +111,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T11:56:35.774938Z", - "iopub.status.busy": "2026-04-16T11:56:35.774682Z", - "iopub.status.idle": "2026-04-16T11:56:36.022450Z", - "shell.execute_reply": "2026-04-16T11:56:36.020569Z", - "shell.execute_reply.started": "2026-04-16T11:56:35.774921Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "!cat /workspace/PyTorchSim/togsim_results/20260416_112306_10ad96fd.log | grep \"Total execution cycle\"" + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" ] }, { diff --git a/tutorial/session1/TOGSimConfig.ipynb b/tutorial/session1/TOGSimConfig.ipynb new file mode 100644 index 00000000..a8c1bb6e --- /dev/null +++ b/tutorial/session1/TOGSimConfig.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TOGSim Configuration\n", + "### Single Core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n", + "\n", + "input = torch.randn(2048, 2048).to(device=device)\n", + "weight = torch.randn(2048, 2048).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "log_path = \"\"\n", 
+ "!cat $log_path | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multi-Core" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n", + "\n", + "input = torch.randn(2048, 2048).to(device=device)\n", + "weight = torch.randn(2048, 2048).to(device=device)\n", + "\n", + "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n", + "npu_out = opt_fn(input, weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "log_path = \"\"\n", + "!cat $log_path | grep \"Total execution cycle\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb index 1f86a5b8..0ec85a3d 100644 --- a/tutorial/session1/Training.ipynb +++ b/tutorial/session1/Training.ipynb @@ -10,25 +10,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:45:04.361593Z", - "iopub.status.busy": "2026-04-16T05:45:04.361471Z", - "iopub.status.idle": "2026-04-16T05:45:07.515245Z", - "shell.execute_reply": "2026-04-16T05:45:07.514397Z", - "shell.execute_reply.started": "2026-04-16T05:45:04.361578Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import 
sys\n", - "import torch\n", - "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n", - "sys.path.append(base_dir)\n", - "\n", - "cpu_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "npu_device = torch.device(\"npu:0\")" + "import torch" ] }, { @@ -41,23 +26,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:45:07.516141Z", - "iopub.status.busy": "2026-04-16T05:45:07.515901Z", - "iopub.status.idle": "2026-04-16T05:45:07.635695Z", - "shell.execute_reply": "2026-04-16T05:45:07.634872Z", - "shell.execute_reply.started": "2026-04-16T05:45:07.516123Z" - } - }, + "metadata": {}, "outputs": [], "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", "torch.manual_seed(0)\n", - "cpu_input = torch.randn(128, 128).to(cpu_device)\n", - "cpu_weight = torch.randn(128, 128).to(cpu_device)\n", - "cpu_target = torch.randn(128, 128).to(cpu_device)\n", - "cpu_input.requires_grad = True\n", - "cpu_weight.requires_grad = True\n", + "cpu_input = torch.randn(128, 128).to(device).requires_grad_()\n", + "cpu_weight = torch.randn(128, 128).to(device).requires_grad_()\n", + "cpu_target = torch.randn(128, 128).to(device).requires_grad_()\n", "\n", "opt_fn = torch.compile(torch.matmul)\n", "cpu_out = opt_fn(cpu_input, cpu_weight)\n", @@ -77,23 +54,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:45:07.636349Z", - "iopub.status.busy": "2026-04-16T05:45:07.636190Z", - "iopub.status.idle": "2026-04-16T05:45:13.350714Z", - "shell.execute_reply": "2026-04-16T05:45:13.349588Z", - "shell.execute_reply.started": "2026-04-16T05:45:07.636333Z" - } - }, + "metadata": {}, "outputs": [], "source": [ + "device = torch.device(\"npu:0\")\n", + "\n", "torch.manual_seed(0)\n", - "npu_input = torch.randn(128, 128).to(npu_device)\n", - "npu_weight = torch.randn(128, 128).to(npu_device)\n", 
- "npu_target = torch.randn(128, 128).to(npu_device)\n", - "npu_input.requires_grad = True\n", - "npu_weight.requires_grad = True\n", + "npu_input = torch.randn(128, 128).to(device).requires_grad_()\n", + "npu_weight = torch.randn(128, 128).to(device).requires_grad_()\n", + "npu_target = torch.randn(128, 128).to(device).requires_grad_()\n", "\n", "opt_fn = torch.compile(torch.matmul)\n", "npu_out = opt_fn(npu_input, npu_weight)\n", @@ -106,25 +75,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:45:13.351955Z", - "iopub.status.busy": "2026-04-16T05:45:13.351757Z", - "iopub.status.idle": "2026-04-16T05:45:13.356589Z", - "shell.execute_reply": "2026-04-16T05:45:13.355757Z", - "shell.execute_reply.started": "2026-04-16T05:45:13.351935Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n", " if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n", - " message = f\"|{name} Test Passed|\"\n", + " message = f\"|{name} Functionality Test Passed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", + " print(\"npu out: \", npu_out.cpu()[0, :5])\n", + " print(\"cpu out: \", cpu_out[0, :5])\n", " else:\n", - " message = f\"|{name} Test Failed|\"\n", + " message = f\"|{name} Functionality Test Failed|\"\n", " print(\"-\" * len(message))\n", " print(message)\n", " print(\"-\" * len(message))\n", @@ -136,15 +99,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-16T05:45:13.357014Z", - "iopub.status.busy": "2026-04-16T05:45:13.356871Z", - "iopub.status.idle": "2026-04-16T05:45:13.361392Z", - "shell.execute_reply": "2026-04-16T05:45:13.360681Z", - "shell.execute_reply.started": "2026-04-16T05:45:13.357000Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "test_result(\"MatMul Input Grad\", npu_input.grad, 
cpu_input.grad)\n", @@ -161,7 +116,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, From 9df9b078ac2d55463590e2ada08334e856a6e0db Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 22 Apr 2026 17:53:08 +0900 Subject: [PATCH 182/194] [Doc] update README for v1.1.0 release --- README.md | 296 +++++++++++++++++++++++++++++------------------------- 1 file changed, 159 insertions(+), 137 deletions(-) diff --git a/README.md b/README.md index a6dd399a..6f6a6abc 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framewo For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)! +> **Disclaimer.** PyTorchSim is an independent project. It is neither part of the official [PyTorch](https://pytorch.org/) distribution nor affiliated with or endorsed by the PyTorch Foundation. The name reflects that this work builds on the open-source PyTorch compiler stack as its front-end for research purposes. ## Navigation [Overview](#pytorchsim-framework-overview) | [Model Zoo](#model-zoo) | [Getting Started](#getting-started) @@ -22,12 +23,13 @@ For more details, please refer to our [paper](https://doi.org/10.1145/3725843.37 ## PyTorchSim Framework Overview ![Overview](/docs/overview.jpg) PyTorchSim consists of **two main** components: -- **Compiler**: Integrated of [PyTorch2](https://github.com/pytorch/pytorch) compiler stack and generates NPU machine code and TOG for existing PyTorch models. +- **Compiler**: Integrated with the [PyTorch2](https://github.com/pytorch/pytorch) compiler stack; it generates NPU machine code and TOG for existing PyTorch models. 
- **TOGSim**: Executes TOG for high-speed simulation and accurately models shared resources (DRAM, NoC) through integrated cycle-accurate simulators ([BookSim](https://github.com/booksim/booksim2) and [Ramulator2](https://github.com/CMU-SAFARI/ramulator2)). PyTorchSim **supports**: - DNN inference and [training](#training) - Data-dependent timing modeling (e.g. sparsity) +- [One continuous TOGSim session](#one-togsim-session-one-continuous-log) (single log across multiple forwards) - [Multi-tenancy](#multi-tenancy) - [Compiler optimizations](#compiler-optimizations) - [Mapping](#mapping) @@ -38,13 +40,16 @@ PyTorchSim **supports**: |---|:-:|:-:|---| | ResNet-18 | | ✅ | channel last format | | ResNet-50 | | ✅ | channel last format | +| MobileNet-v2 | | ✅ | `tests/MobileNet/` (torchvision) | +| YOLOv5 | | ✅ | `tests/Yolov5/` | | BERT | | ✅ | | | GPT-2 | | ✅ | | -| ViT | | ✅ | | +| ViT | | ✅ | `tests/test_vit.py` | | Mistral | | ✅ | | -| Diffusion | 🤗 | ✅ | | -| Llama-4 | 🤗 | ⏳ | Under Development | -| DeepSeek v1 | 🤗 | ⏳ | Under Development | +| Stable-diffusion v1 | 🤗 | ✅ | | +| Llama 2/3 | 🤗 | ✅ | `tests/Llama/` (blocks & decode-style paths) | +| DeepSeek-V3 (base) | 🤗 | ✅ | `tests/DeepSeek/` — several ops (e.g., gate ops) are not cycle-modeled | +| Llama-4 | 🤗 | ⏳ | Under development |